1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include <sys/cdefs.h> 40 #include "opt_ddb.h" 41 #include "opt_ipsec.h" 42 #include "opt_inet.h" 43 #include "opt_inet6.h" 44 #include "opt_ratelimit.h" 45 #include "opt_route.h" 46 #include "opt_rss.h" 47 48 #include <sys/param.h> 49 #include <sys/hash.h> 50 #include <sys/systm.h> 51 #include <sys/libkern.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/eventhandler.h> 56 #include <sys/domain.h> 57 #include <sys/proc.h> 58 #include <sys/protosw.h> 59 #include <sys/smp.h> 60 #include <sys/smr.h> 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <sys/sockio.h> 64 #include <sys/priv.h> 65 #include <sys/proc.h> 66 #include <sys/refcount.h> 67 #include <sys/jail.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 71 #ifdef DDB 72 #include <ddb/ddb.h> 73 #endif 74 75 #include <vm/uma.h> 76 #include <vm/vm.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_private.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #include <netinet/tcp.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/in6_pcb.h> 100 #include <netinet6/in6_var.h> 101 #include <netinet6/ip6_var.h> 102 #endif /* INET6 */ 103 #include <net/route/nhop.h> 104 #endif 105 106 #include <netipsec/ipsec_support.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #define INPCBLBGROUP_SIZMIN 8 111 #define INPCBLBGROUP_SIZMAX 256 112 113 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 114 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 115 116 /* 117 * These configure the range of local port addresses assigned to 118 * "unspecified" outgoing connections/packets/whatever. 119 */ 120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 126 127 /* 128 * Reserved ports accessible only to root. There are significant 129 * security considerations that must be accounted for when changing these, 130 * but the security benefits can be great. Please be careful. 131 */ 132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 133 VNET_DEFINE(int, ipport_reservedlow); 134 135 /* Enable random ephemeral port allocation by default. 
*/ 136 VNET_DEFINE(int, ipport_randomized) = 1; 137 138 #ifdef INET 139 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 140 struct in_addr faddr, u_int fport_arg, 141 struct in_addr laddr, u_int lport_arg, 142 int lookupflags, uint8_t numa_domain, int fib); 143 144 #define RANGECHK(var, min, max) \ 145 if ((var) < (min)) { (var) = (min); } \ 146 else if ((var) > (max)) { (var) = (max); } 147 148 static int 149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 150 { 151 int error; 152 153 error = sysctl_handle_int(oidp, arg1, arg2, req); 154 if (error == 0) { 155 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 157 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 160 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 161 } 162 return (error); 163 } 164 165 #undef RANGECHK 166 167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 168 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 169 "IP Ports"); 170 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 172 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 174 ""); 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 
188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 196 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 197 &VNET_NAME(ipport_reservedhigh), 0, ""); 198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 199 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 201 CTLFLAG_VNET | CTLFLAG_RW, 202 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 203 204 #ifdef RATELIMIT 205 counter_u64_t rate_limit_new; 206 counter_u64_t rate_limit_chg; 207 counter_u64_t rate_limit_active; 208 counter_u64_t rate_limit_alloc_fail; 209 counter_u64_t rate_limit_set_ok; 210 211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 212 "IP Rate Limiting"); 213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 214 &rate_limit_active, "Active rate limited connections"); 215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 216 &rate_limit_alloc_fail, "Rate limited connection failures"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 218 &rate_limit_set_ok, "Rate limited setting succeeded"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 220 &rate_limit_new, "Total Rate limit new attempts"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 222 &rate_limit_chg, "Total Rate limited change attempts"); 223 #endif /* RATELIMIT */ 224 225 #endif /* INET */ 226 227 VNET_DEFINE(uint32_t, in_pcbhashseed); 228 static void 229 in_pcbhashseed_init(void) 230 { 231 232 V_in_pcbhashseed = arc4random(); 233 } 234 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 235 in_pcbhashseed_init, NULL); 236 237 #ifdef INET 238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0; 239 #define V_connect_inaddr_wild VNET(connect_inaddr_wild) 240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, 241 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, 242 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); 243 #endif 244 245 static void in_pcbremhash(struct inpcb *); 246 247 /* 248 * in_pcb.c: manage the Protocol Control Blocks. 249 * 250 * NOTE: It is assumed that most of these functions will be called with 251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 252 * functions often modify hash chains or addresses in pcbs. 253 */ 254 255 static struct inpcblbgroup * 256 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, 257 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib) 258 { 259 struct inpcblbgroup *grp; 260 size_t bytes; 261 262 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 263 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 264 if (grp == NULL) 265 return (NULL); 266 LIST_INIT(&grp->il_pending); 267 grp->il_cred = crhold(cred); 268 grp->il_vflag = vflag; 269 grp->il_lport = port; 270 grp->il_numa_domain = numa_domain; 271 grp->il_fibnum = fib; 272 grp->il_dependladdr = *addr; 273 grp->il_inpsiz = size; 274 return (grp); 275 } 276 277 static void 278 in_pcblbgroup_free_deferred(epoch_context_t ctx) 279 { 280 struct inpcblbgroup *grp; 281 282 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 283 crfree(grp->il_cred); 284 free(grp, M_PCB); 285 } 286 287 static void 288 in_pcblbgroup_free(struct inpcblbgroup *grp) 289 { 290 KASSERT(LIST_EMPTY(&grp->il_pending), 291 ("local group %p still has pending inps", grp)); 292 293 CK_LIST_REMOVE(grp, il_list); 294 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 295 } 296 297 static struct 
inpcblbgroup * 298 in_pcblbgroup_find(struct inpcb *inp) 299 { 300 struct inpcbinfo *pcbinfo; 301 struct inpcblbgroup *grp; 302 struct inpcblbgrouphead *hdr; 303 304 INP_LOCK_ASSERT(inp); 305 306 pcbinfo = inp->inp_pcbinfo; 307 INP_HASH_LOCK_ASSERT(pcbinfo); 308 309 hdr = &pcbinfo->ipi_lbgrouphashbase[ 310 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 311 CK_LIST_FOREACH(grp, hdr, il_list) { 312 struct inpcb *inp1; 313 314 for (unsigned int i = 0; i < grp->il_inpcnt; i++) { 315 if (inp == grp->il_inp[i]) 316 goto found; 317 } 318 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 319 if (inp == inp1) 320 goto found; 321 } 322 } 323 found: 324 return (grp); 325 } 326 327 static void 328 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) 329 { 330 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 331 ("invalid local group size %d and count %d", grp->il_inpsiz, 332 grp->il_inpcnt)); 333 INP_WLOCK_ASSERT(inp); 334 335 if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp && 336 !SOLISTENING(inp->inp_socket)) { 337 /* 338 * If this is a TCP socket, it should not be visible to lbgroup 339 * lookups until listen() has been called. 340 */ 341 LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); 342 grp->il_pendcnt++; 343 } else { 344 grp->il_inp[grp->il_inpcnt] = inp; 345 346 /* 347 * Synchronize with in_pcblookup_lbgroup(): make sure that we 348 * don't expose a null slot to the lookup path. 
349 */ 350 atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); 351 } 352 353 inp->inp_flags |= INP_INLBGROUP; 354 } 355 356 static struct inpcblbgroup * 357 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 358 struct inpcblbgroup *old_grp, int size) 359 { 360 struct inpcblbgroup *grp; 361 int i; 362 363 grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, 364 old_grp->il_lport, &old_grp->il_dependladdr, size, 365 old_grp->il_numa_domain, old_grp->il_fibnum); 366 if (grp == NULL) 367 return (NULL); 368 369 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 370 ("invalid new local group size %d and old local group count %d", 371 grp->il_inpsiz, old_grp->il_inpcnt)); 372 373 for (i = 0; i < old_grp->il_inpcnt; ++i) 374 grp->il_inp[i] = old_grp->il_inp[i]; 375 grp->il_inpcnt = old_grp->il_inpcnt; 376 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 377 LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, 378 inp_lbgroup_list); 379 grp->il_pendcnt = old_grp->il_pendcnt; 380 old_grp->il_pendcnt = 0; 381 in_pcblbgroup_free(old_grp); 382 return (grp); 383 } 384 385 /* 386 * Add PCB to load balance group for SO_REUSEPORT_LB option. 387 */ 388 static int 389 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 390 { 391 const static struct timeval interval = { 60, 0 }; 392 static struct timeval lastprint; 393 struct inpcbinfo *pcbinfo; 394 struct inpcblbgrouphead *hdr; 395 struct inpcblbgroup *grp; 396 uint32_t idx; 397 int fib; 398 399 pcbinfo = inp->inp_pcbinfo; 400 401 INP_WLOCK_ASSERT(inp); 402 INP_HASH_WLOCK_ASSERT(pcbinfo); 403 404 fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ? 405 inp->inp_inc.inc_fibnum : RT_ALL_FIBS; 406 407 #ifdef INET6 408 /* 409 * Don't allow IPv4 mapped INET6 wild socket. 
410 */ 411 if ((inp->inp_vflag & INP_IPV4) && 412 inp->inp_laddr.s_addr == INADDR_ANY && 413 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 414 return (0); 415 } 416 #endif 417 418 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 419 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 420 CK_LIST_FOREACH(grp, hdr, il_list) { 421 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 422 grp->il_vflag == inp->inp_vflag && 423 grp->il_lport == inp->inp_lport && 424 grp->il_numa_domain == numa_domain && 425 grp->il_fibnum == fib && 426 memcmp(&grp->il_dependladdr, 427 &inp->inp_inc.inc_ie.ie_dependladdr, 428 sizeof(grp->il_dependladdr)) == 0) { 429 break; 430 } 431 } 432 if (grp == NULL) { 433 /* Create new load balance group. */ 434 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, 435 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 436 INPCBLBGROUP_SIZMIN, numa_domain, fib); 437 if (grp == NULL) 438 return (ENOMEM); 439 in_pcblbgroup_insert(grp, inp); 440 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 441 } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) { 442 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 443 if (ratecheck(&lastprint, &interval)) 444 printf("lb group port %d, limit reached\n", 445 ntohs(grp->il_lport)); 446 return (0); 447 } 448 449 /* Expand this local group. */ 450 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 451 if (grp == NULL) 452 return (ENOMEM); 453 in_pcblbgroup_insert(grp, inp); 454 } else { 455 in_pcblbgroup_insert(grp, inp); 456 } 457 return (0); 458 } 459 460 /* 461 * Remove PCB from load balance group. 
462 */ 463 static void 464 in_pcbremlbgrouphash(struct inpcb *inp) 465 { 466 struct inpcbinfo *pcbinfo; 467 struct inpcblbgrouphead *hdr; 468 struct inpcblbgroup *grp; 469 struct inpcb *inp1; 470 int i; 471 472 pcbinfo = inp->inp_pcbinfo; 473 474 INP_WLOCK_ASSERT(inp); 475 MPASS(inp->inp_flags & INP_INLBGROUP); 476 INP_HASH_WLOCK_ASSERT(pcbinfo); 477 478 hdr = &pcbinfo->ipi_lbgrouphashbase[ 479 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 480 CK_LIST_FOREACH(grp, hdr, il_list) { 481 for (i = 0; i < grp->il_inpcnt; ++i) { 482 if (grp->il_inp[i] != inp) 483 continue; 484 485 if (grp->il_inpcnt == 1 && 486 LIST_EMPTY(&grp->il_pending)) { 487 /* We are the last, free this local group. */ 488 in_pcblbgroup_free(grp); 489 } else { 490 grp->il_inp[i] = 491 grp->il_inp[grp->il_inpcnt - 1]; 492 493 /* 494 * Synchronize with in_pcblookup_lbgroup(). 495 */ 496 atomic_store_rel_int(&grp->il_inpcnt, 497 grp->il_inpcnt - 1); 498 } 499 inp->inp_flags &= ~INP_INLBGROUP; 500 return; 501 } 502 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 503 if (inp == inp1) { 504 LIST_REMOVE(inp, inp_lbgroup_list); 505 grp->il_pendcnt--; 506 inp->inp_flags &= ~INP_INLBGROUP; 507 return; 508 } 509 } 510 } 511 __assert_unreachable(); 512 } 513 514 int 515 in_pcblbgroup_numa(struct inpcb *inp, int arg) 516 { 517 struct inpcbinfo *pcbinfo; 518 int error; 519 uint8_t numa_domain; 520 521 switch (arg) { 522 case TCP_REUSPORT_LB_NUMA_NODOM: 523 numa_domain = M_NODOM; 524 break; 525 case TCP_REUSPORT_LB_NUMA_CURDOM: 526 numa_domain = PCPU_GET(domain); 527 break; 528 default: 529 if (arg < 0 || arg >= vm_ndomains) 530 return (EINVAL); 531 numa_domain = arg; 532 } 533 534 pcbinfo = inp->inp_pcbinfo; 535 INP_WLOCK_ASSERT(inp); 536 INP_HASH_WLOCK(pcbinfo); 537 if (in_pcblbgroup_find(inp) != NULL) { 538 /* Remove it from the old group. */ 539 in_pcbremlbgrouphash(inp); 540 /* Add it to the new group based on numa domain. 
*/ 541 in_pcbinslbgrouphash(inp, numa_domain); 542 error = 0; 543 } else { 544 error = ENOENT; 545 } 546 INP_HASH_WUNLOCK(pcbinfo); 547 return (error); 548 } 549 550 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 551 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 552 553 /* 554 * Initialize an inpcbinfo - a per-VNET instance of connections db. 555 */ 556 void 557 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 558 u_int hash_nelements, u_int porthash_nelements) 559 { 560 561 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 562 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 563 NULL, MTX_DEF); 564 #ifdef VIMAGE 565 pcbinfo->ipi_vnet = curvnet; 566 #endif 567 CK_LIST_INIT(&pcbinfo->ipi_listhead); 568 pcbinfo->ipi_count = 0; 569 pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, 570 &pcbinfo->ipi_hashmask); 571 pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, 572 &pcbinfo->ipi_hashmask); 573 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 574 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 575 &pcbinfo->ipi_porthashmask); 576 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 577 &pcbinfo->ipi_lbgrouphashmask); 578 pcbinfo->ipi_zone = pcbstor->ips_zone; 579 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 580 } 581 582 /* 583 * Destroy an inpcbinfo. 
584 */ 585 void 586 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 587 { 588 589 KASSERT(pcbinfo->ipi_count == 0, 590 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 591 592 hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); 593 hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); 594 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 595 pcbinfo->ipi_porthashmask); 596 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 597 pcbinfo->ipi_lbgrouphashmask); 598 mtx_destroy(&pcbinfo->ipi_hash_lock); 599 mtx_destroy(&pcbinfo->ipi_lock); 600 } 601 602 /* 603 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 604 */ 605 static void inpcb_fini(void *, int); 606 void 607 in_pcbstorage_init(void *arg) 608 { 609 struct inpcbstorage *pcbstor = arg; 610 611 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 612 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 613 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 614 } 615 616 /* 617 * Destroy a pcbstorage - used by unloadable protocols. 618 */ 619 void 620 in_pcbstorage_destroy(void *arg) 621 { 622 struct inpcbstorage *pcbstor = arg; 623 624 uma_zdestroy(pcbstor->ips_zone); 625 } 626 627 /* 628 * Allocate a PCB and associate it with the socket. 629 * On success return with the PCB locked. 
630 */ 631 int 632 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 633 { 634 struct inpcb *inp; 635 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 636 int error; 637 #endif 638 639 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 640 if (inp == NULL) 641 return (ENOBUFS); 642 bzero(&inp->inp_start_zero, inp_zero_size); 643 #ifdef NUMA 644 inp->inp_numa_domain = M_NODOM; 645 #endif 646 inp->inp_pcbinfo = pcbinfo; 647 inp->inp_socket = so; 648 inp->inp_cred = crhold(so->so_cred); 649 inp->inp_inc.inc_fibnum = so->so_fibnum; 650 #ifdef MAC 651 error = mac_inpcb_init(inp, M_NOWAIT); 652 if (error != 0) 653 goto out; 654 mac_inpcb_create(so, inp); 655 #endif 656 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 657 error = ipsec_init_pcbpolicy(inp); 658 if (error != 0) { 659 #ifdef MAC 660 mac_inpcb_destroy(inp); 661 #endif 662 goto out; 663 } 664 #endif /*IPSEC*/ 665 #ifdef INET6 666 if (INP_SOCKAF(so) == AF_INET6) { 667 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 668 if (V_ip6_v6only) 669 inp->inp_flags |= IN6P_IPV6_V6ONLY; 670 #ifdef INET 671 else 672 inp->inp_vflag |= INP_IPV4; 673 #endif 674 if (V_ip6_auto_flowlabel) 675 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 676 inp->in6p_hops = -1; /* use kernel default */ 677 } 678 #endif 679 #if defined(INET) && defined(INET6) 680 else 681 #endif 682 #ifdef INET 683 inp->inp_vflag |= INP_IPV4; 684 #endif 685 inp->inp_smr = SMR_SEQ_INVALID; 686 687 /* 688 * Routes in inpcb's can cache L2 as well; they are guaranteed 689 * to be cleaned up. 690 */ 691 inp->inp_route.ro_flags = RT_LLE_CACHE; 692 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 693 INP_WLOCK(inp); 694 INP_INFO_WLOCK(pcbinfo); 695 pcbinfo->ipi_count++; 696 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 697 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 698 INP_INFO_WUNLOCK(pcbinfo); 699 so->so_pcb = inp; 700 701 return (0); 702 703 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 704 out: 705 crfree(inp->inp_cred); 706 #ifdef INVARIANTS 707 inp->inp_cred = NULL; 708 #endif 709 uma_zfree_smr(pcbinfo->ipi_zone, inp); 710 return (error); 711 #endif 712 } 713 714 #ifdef INET 715 int 716 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags, 717 struct ucred *cred) 718 { 719 int anonport, error; 720 721 KASSERT(sin == NULL || sin->sin_family == AF_INET, 722 ("%s: invalid address family for %p", __func__, sin)); 723 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), 724 ("%s: invalid address length for %p", __func__, sin)); 725 INP_WLOCK_ASSERT(inp); 726 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 727 728 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 729 return (EINVAL); 730 anonport = sin == NULL || sin->sin_port == 0; 731 error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, 732 &inp->inp_lport, flags, cred); 733 if (error) 734 return (error); 735 if (__predict_false((error = in_pcbinshash(inp)) != 0)) { 736 MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB); 737 inp->inp_laddr.s_addr = INADDR_ANY; 738 inp->inp_lport = 0; 739 inp->inp_flags &= ~INP_BOUNDFIB; 740 return (error); 741 } 742 if (anonport) 743 inp->inp_flags |= INP_ANONPORT; 744 return (0); 745 } 746 #endif 747 748 #if defined(INET) || defined(INET6) 749 /* 750 * Assign a local port like in_pcb_lport(), but also used with connect() 751 * and a foreign address and port. If fsa is non-NULL, choose a local port 752 * that is unused with those, otherwise one that is completely unused. 753 * lsa can be NULL for IPv6. 
754 */ 755 int 756 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, 757 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, 758 int lookupflags) 759 { 760 struct inpcbinfo *pcbinfo; 761 struct inpcb *tmpinp; 762 unsigned short *lastport; 763 int count, error; 764 u_short aux, first, last, lport; 765 #ifdef INET 766 struct in_addr laddr, faddr; 767 #endif 768 #ifdef INET6 769 struct in6_addr *laddr6, *faddr6; 770 #endif 771 772 pcbinfo = inp->inp_pcbinfo; 773 774 /* 775 * Because no actual state changes occur here, a global write lock on 776 * the pcbinfo isn't required. 777 */ 778 INP_LOCK_ASSERT(inp); 779 INP_HASH_LOCK_ASSERT(pcbinfo); 780 781 if (inp->inp_flags & INP_HIGHPORT) { 782 first = V_ipport_hifirstauto; /* sysctl */ 783 last = V_ipport_hilastauto; 784 lastport = &pcbinfo->ipi_lasthi; 785 } else if (inp->inp_flags & INP_LOWPORT) { 786 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 787 if (error) 788 return (error); 789 first = V_ipport_lowfirstauto; /* 1023 */ 790 last = V_ipport_lowlastauto; /* 600 */ 791 lastport = &pcbinfo->ipi_lastlow; 792 } else { 793 first = V_ipport_firstauto; /* sysctl */ 794 last = V_ipport_lastauto; 795 lastport = &pcbinfo->ipi_lastport; 796 } 797 798 /* 799 * Instead of having two loops further down counting up or down 800 * make sure that first is always <= last and go with only one 801 * code path implementing all logic. 
802 */ 803 if (first > last) { 804 aux = first; 805 first = last; 806 last = aux; 807 } 808 809 #ifdef INET 810 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 811 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 812 if (lsa != NULL) 813 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 814 if (fsa != NULL) 815 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 816 } 817 #endif 818 #ifdef INET6 819 laddr6 = NULL; 820 if ((inp->inp_vflag & INP_IPV6) != 0) { 821 if (lsa != NULL) 822 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 823 if (fsa != NULL) 824 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 825 } 826 #endif 827 828 tmpinp = NULL; 829 830 if (V_ipport_randomized) 831 *lastport = first + (arc4random() % (last - first)); 832 833 count = last - first; 834 835 do { 836 if (count-- < 0) /* completely used? */ 837 return (EADDRNOTAVAIL); 838 ++*lastport; 839 if (*lastport < first || *lastport > last) 840 *lastport = first; 841 lport = htons(*lastport); 842 843 if (fsa != NULL) { 844 #ifdef INET 845 if (lsa->sa_family == AF_INET) { 846 tmpinp = in_pcblookup_hash_locked(pcbinfo, 847 faddr, fport, laddr, lport, lookupflags, 848 M_NODOM, RT_ALL_FIBS); 849 } 850 #endif 851 #ifdef INET6 852 if (lsa->sa_family == AF_INET6) { 853 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 854 faddr6, fport, laddr6, lport, lookupflags, 855 M_NODOM, RT_ALL_FIBS); 856 } 857 #endif 858 } else { 859 #ifdef INET6 860 if ((inp->inp_vflag & INP_IPV6) != 0) { 861 tmpinp = in6_pcblookup_local(pcbinfo, 862 &inp->in6p_laddr, lport, RT_ALL_FIBS, 863 lookupflags, cred); 864 #ifdef INET 865 if (tmpinp == NULL && 866 (inp->inp_vflag & INP_IPV4)) 867 tmpinp = in_pcblookup_local(pcbinfo, 868 laddr, lport, RT_ALL_FIBS, 869 lookupflags, cred); 870 #endif 871 } 872 #endif 873 #if defined(INET) && defined(INET6) 874 else 875 #endif 876 #ifdef INET 877 tmpinp = in_pcblookup_local(pcbinfo, laddr, 878 lport, RT_ALL_FIBS, lookupflags, cred); 879 #endif 880 } 881 } while (tmpinp != NULL); 
882 883 *lportp = lport; 884 885 return (0); 886 } 887 888 /* 889 * Select a local port (number) to use. 890 */ 891 int 892 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 893 struct ucred *cred, int lookupflags) 894 { 895 struct sockaddr_in laddr; 896 897 if (laddrp) { 898 bzero(&laddr, sizeof(laddr)); 899 laddr.sin_family = AF_INET; 900 laddr.sin_addr = *laddrp; 901 } 902 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 903 NULL, lportp, NULL, 0, cred, lookupflags)); 904 } 905 #endif /* INET || INET6 */ 906 907 #ifdef INET 908 /* 909 * Determine whether the inpcb can be bound to the specified address/port tuple. 910 */ 911 static int 912 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 913 const u_short lport, const int fib, int sooptions, int lookupflags, 914 struct ucred *cred) 915 { 916 int reuseport, reuseport_lb; 917 918 INP_LOCK_ASSERT(inp); 919 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 920 921 reuseport = (sooptions & SO_REUSEPORT); 922 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 923 924 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 925 /* 926 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 927 * allow complete duplication of binding if 928 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 929 * and a multicast address is bound on both 930 * new and duplicated sockets. 931 */ 932 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 933 reuseport = SO_REUSEADDR | SO_REUSEPORT; 934 /* 935 * XXX: How to deal with SO_REUSEPORT_LB here? 936 * Treat same as SO_REUSEPORT for now. 937 */ 938 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 939 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 940 } else if (!in_nullhost(laddr)) { 941 struct sockaddr_in sin; 942 943 memset(&sin, 0, sizeof(sin)); 944 sin.sin_family = AF_INET; 945 sin.sin_len = sizeof(sin); 946 sin.sin_addr = laddr; 947 948 /* 949 * Is the address a local IP address? 
950 * If INP_BINDANY is set, then the socket may be bound 951 * to any endpoint address, local or not. 952 */ 953 if ((inp->inp_flags & INP_BINDANY) == 0 && 954 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 955 return (EADDRNOTAVAIL); 956 } 957 958 if (lport != 0) { 959 struct inpcb *t; 960 961 if (ntohs(lport) <= V_ipport_reservedhigh && 962 ntohs(lport) >= V_ipport_reservedlow && 963 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 964 return (EACCES); 965 966 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 967 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 968 /* 969 * If a socket owned by a different user is already 970 * bound to this port, fail. In particular, SO_REUSE* 971 * can only be used to share a port among sockets owned 972 * by the same user. 973 * 974 * However, we can share a port with a connected socket 975 * which has a unique 4-tuple. 976 */ 977 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 978 RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); 979 if (t != NULL && 980 (inp->inp_socket->so_type != SOCK_STREAM || 981 in_nullhost(t->inp_faddr)) && 982 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 983 return (EADDRINUSE); 984 } 985 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, 986 lookupflags, cred); 987 if (t != NULL && ((reuseport | reuseport_lb) & 988 t->inp_socket->so_options) == 0) { 989 #ifdef INET6 990 if (!in_nullhost(laddr) || 991 !in_nullhost(t->inp_laddr) || 992 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 993 (t->inp_vflag & INP_IPV6PROTO) == 0) 994 #endif 995 return (EADDRINUSE); 996 } 997 } 998 return (0); 999 } 1000 1001 /* 1002 * Set up a bind operation on a PCB, performing port allocation 1003 * as required, but do not actually modify the PCB. Callers can 1004 * either complete the bind by setting inp_laddr/inp_lport and 1005 * calling in_pcbinshash(), or they can just use the resulting 1006 * port and address to authorise the sending of a once-off packet. 
1007 * 1008 * On error, the values of *laddrp and *lportp are not changed. 1009 */ 1010 int 1011 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, 1012 u_short *lportp, int flags, struct ucred *cred) 1013 { 1014 struct socket *so = inp->inp_socket; 1015 struct in_addr laddr; 1016 u_short lport = 0; 1017 int error, fib, lookupflags, sooptions; 1018 1019 /* 1020 * No state changes, so read locks are sufficient here. 1021 */ 1022 INP_LOCK_ASSERT(inp); 1023 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1024 1025 laddr.s_addr = *laddrp; 1026 if (sin != NULL && laddr.s_addr != INADDR_ANY) 1027 return (EINVAL); 1028 1029 lookupflags = 0; 1030 sooptions = atomic_load_int(&so->so_options); 1031 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) 1032 lookupflags = INPLOOKUP_WILDCARD; 1033 if (sin == NULL) { 1034 if ((error = prison_local_ip4(cred, &laddr)) != 0) 1035 return (error); 1036 } else { 1037 KASSERT(sin->sin_family == AF_INET, 1038 ("%s: invalid family for address %p", __func__, sin)); 1039 KASSERT(sin->sin_len == sizeof(*sin), 1040 ("%s: invalid length for address %p", __func__, sin)); 1041 1042 error = prison_local_ip4(cred, &sin->sin_addr); 1043 if (error) 1044 return (error); 1045 if (sin->sin_port != *lportp) { 1046 /* Don't allow the port to change. */ 1047 if (*lportp != 0) 1048 return (EINVAL); 1049 lport = sin->sin_port; 1050 } 1051 laddr = sin->sin_addr; 1052 1053 fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum : 1054 RT_ALL_FIBS; 1055 1056 /* See if this address/port combo is available. 
*/ 1057 error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions, 1058 lookupflags, cred); 1059 if (error != 0) 1060 return (error); 1061 } 1062 if (*lportp != 0) 1063 lport = *lportp; 1064 if (lport == 0) { 1065 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1066 if (error != 0) 1067 return (error); 1068 } 1069 *laddrp = laddr.s_addr; 1070 *lportp = lport; 1071 if ((flags & INPBIND_FIB) != 0) 1072 inp->inp_flags |= INP_BOUNDFIB; 1073 return (0); 1074 } 1075 1076 /* 1077 * Connect from a socket to a specified address. 1078 * Both address and port must be specified in argument sin. 1079 * If don't have a local address for this socket yet, 1080 * then pick one. 1081 */ 1082 int 1083 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 1084 { 1085 struct in_addr laddr, faddr; 1086 u_short lport; 1087 int error; 1088 bool anonport; 1089 1090 INP_WLOCK_ASSERT(inp); 1091 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1092 KASSERT(in_nullhost(inp->inp_faddr), 1093 ("%s: inp is already connected", __func__)); 1094 KASSERT(sin->sin_family == AF_INET, 1095 ("%s: invalid address family for %p", __func__, sin)); 1096 KASSERT(sin->sin_len == sizeof(*sin), 1097 ("%s: invalid address length for %p", __func__, sin)); 1098 1099 if (sin->sin_port == 0) 1100 return (EADDRNOTAVAIL); 1101 1102 anonport = (inp->inp_lport == 0); 1103 1104 if (__predict_false(in_broadcast(sin->sin_addr))) { 1105 if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead)) 1106 return (ENETUNREACH); 1107 /* 1108 * If the destination address is INADDR_ANY, use the primary 1109 * local address. If the supplied address is INADDR_BROADCAST, 1110 * and the primary interface supports broadcast, choose the 1111 * broadcast address for that interface. 
1112 */ 1113 if (in_nullhost(sin->sin_addr)) { 1114 faddr = 1115 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1116 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1117 return (error); 1118 } else if (sin->sin_addr.s_addr == INADDR_BROADCAST && 1119 CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags 1120 & IFF_BROADCAST) { 1121 faddr = satosin(&CK_STAILQ_FIRST( 1122 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1123 } else 1124 faddr = sin->sin_addr; 1125 } else 1126 faddr = sin->sin_addr; 1127 1128 if (in_nullhost(inp->inp_laddr)) { 1129 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1130 if (error) 1131 return (error); 1132 } else 1133 laddr = inp->inp_laddr; 1134 1135 if (anonport) { 1136 struct sockaddr_in lsin = { 1137 .sin_family = AF_INET, 1138 .sin_addr = laddr, 1139 }; 1140 struct sockaddr_in fsin = { 1141 .sin_family = AF_INET, 1142 .sin_addr = faddr, 1143 }; 1144 1145 error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, 1146 &lport, (struct sockaddr *)&fsin, sin->sin_port, cred, 1147 INPLOOKUP_WILDCARD); 1148 if (error) 1149 return (error); 1150 } else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1151 sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) != 1152 NULL) 1153 return (EADDRINUSE); 1154 else 1155 lport = inp->inp_lport; 1156 1157 MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 || 1158 !(inp->inp_flags & INP_INHASHLIST)); 1159 1160 inp->inp_faddr = faddr; 1161 inp->inp_fport = sin->sin_port; 1162 inp->inp_laddr = laddr; 1163 inp->inp_lport = lport; 1164 1165 if ((inp->inp_flags & INP_INHASHLIST) == 0) { 1166 error = in_pcbinshash(inp); 1167 MPASS(error == 0); 1168 } else 1169 in_pcbrehash(inp); 1170 #ifdef ROUTE_MPATH 1171 if (CALC_FLOWID_OUTBOUND) { 1172 uint32_t hash_val, hash_type; 1173 1174 hash_val = fib4_calc_software_hash(inp->inp_laddr, 1175 inp->inp_faddr, 0, sin->sin_port, 1176 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1177 1178 inp->inp_flowid = hash_val; 1179 inp->inp_flowtype = 
hash_type; 1180 } 1181 #endif 1182 if (anonport) 1183 inp->inp_flags |= INP_ANONPORT; 1184 return (0); 1185 } 1186 1187 /* 1188 * Do proper source address selection on an unbound socket in case 1189 * of connect. Take jails into account as well. 1190 */ 1191 int 1192 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, 1193 struct in_addr *laddr, struct ucred *cred) 1194 { 1195 struct ifaddr *ifa; 1196 struct sockaddr *sa; 1197 struct sockaddr_in *sin, dst; 1198 struct nhop_object *nh; 1199 int error; 1200 1201 NET_EPOCH_ASSERT(); 1202 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1203 1204 /* 1205 * Bypass source address selection and use the primary jail IP 1206 * if requested. 1207 */ 1208 if (!prison_saddrsel_ip4(cred, laddr)) 1209 return (0); 1210 1211 /* 1212 * If the destination address is multicast and an outgoing 1213 * interface has been set as a multicast option, prefer the 1214 * address of that interface as our source address. 1215 */ 1216 if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL && 1217 inp->inp_moptions->imo_multicast_ifp != NULL) { 1218 struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp; 1219 struct in_ifaddr *ia; 1220 1221 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1222 if (ia->ia_ifp == ifp && 1223 prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) 1224 break; 1225 } 1226 if (ia == NULL) 1227 return (EADDRNOTAVAIL); 1228 *laddr = ia->ia_addr.sin_addr; 1229 return (0); 1230 } 1231 1232 error = 0; 1233 1234 nh = NULL; 1235 bzero(&dst, sizeof(dst)); 1236 sin = &dst; 1237 sin->sin_family = AF_INET; 1238 sin->sin_len = sizeof(struct sockaddr_in); 1239 sin->sin_addr.s_addr = faddr->s_addr; 1240 1241 /* 1242 * If route is known our src addr is taken from the i/f, 1243 * else punt. 1244 * 1245 * Find out route to destination. 
1246 */ 1247 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1248 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1249 0, NHR_NONE, 0); 1250 1251 /* 1252 * If we found a route, use the address corresponding to 1253 * the outgoing interface. 1254 * 1255 * Otherwise assume faddr is reachable on a directly connected 1256 * network and try to find a corresponding interface to take 1257 * the source address from. 1258 */ 1259 if (nh == NULL || nh->nh_ifp == NULL) { 1260 struct in_ifaddr *ia; 1261 struct ifnet *ifp; 1262 1263 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1264 inp->inp_socket->so_fibnum)); 1265 if (ia == NULL) { 1266 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1267 inp->inp_socket->so_fibnum)); 1268 } 1269 if (ia == NULL) { 1270 error = ENETUNREACH; 1271 goto done; 1272 } 1273 1274 if (!prison_flag(cred, PR_IP4)) { 1275 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1276 goto done; 1277 } 1278 1279 ifp = ia->ia_ifp; 1280 ia = NULL; 1281 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1282 sa = ifa->ifa_addr; 1283 if (sa->sa_family != AF_INET) 1284 continue; 1285 sin = (struct sockaddr_in *)sa; 1286 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1287 ia = (struct in_ifaddr *)ifa; 1288 break; 1289 } 1290 } 1291 if (ia != NULL) { 1292 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1293 goto done; 1294 } 1295 1296 /* 3. As a last resort return the 'default' jail address. */ 1297 error = prison_get_ip4(cred, laddr); 1298 goto done; 1299 } 1300 1301 /* 1302 * If the outgoing interface on the route found is not 1303 * a loopback interface, use the address from that interface. 1304 * In case of jails do those three steps: 1305 * 1. check if the interface address belongs to the jail. If so use it. 1306 * 2. check if we have any address on the outgoing interface 1307 * belonging to this jail. If so use it. 1308 * 3. as a last resort return the 'default' jail address. 
1309 */ 1310 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1311 struct in_ifaddr *ia; 1312 struct ifnet *ifp; 1313 1314 /* If not jailed, use the default returned. */ 1315 if (!prison_flag(cred, PR_IP4)) { 1316 ia = (struct in_ifaddr *)nh->nh_ifa; 1317 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1318 goto done; 1319 } 1320 1321 /* Jailed. */ 1322 /* 1. Check if the iface address belongs to the jail. */ 1323 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1324 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1325 ia = (struct in_ifaddr *)nh->nh_ifa; 1326 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1327 goto done; 1328 } 1329 1330 /* 1331 * 2. Check if we have any address on the outgoing interface 1332 * belonging to this jail. 1333 */ 1334 ia = NULL; 1335 ifp = nh->nh_ifp; 1336 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1337 sa = ifa->ifa_addr; 1338 if (sa->sa_family != AF_INET) 1339 continue; 1340 sin = (struct sockaddr_in *)sa; 1341 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1342 ia = (struct in_ifaddr *)ifa; 1343 break; 1344 } 1345 } 1346 if (ia != NULL) { 1347 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1348 goto done; 1349 } 1350 1351 /* 3. As a last resort return the 'default' jail address. */ 1352 error = prison_get_ip4(cred, laddr); 1353 goto done; 1354 } 1355 1356 /* 1357 * The outgoing interface is marked with 'loopback net', so a route 1358 * to ourselves is here. 1359 * Try to find the interface of the destination address and then 1360 * take the address from there. That interface is not necessarily 1361 * a loopback interface. 1362 * In case of jails, check that it is an address of the jail 1363 * and if we cannot find, fall back to the 'default' jail address. 
1364 */ 1365 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1366 struct in_ifaddr *ia; 1367 1368 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1369 inp->inp_socket->so_fibnum)); 1370 if (ia == NULL) 1371 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1372 inp->inp_socket->so_fibnum)); 1373 if (ia == NULL) 1374 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1375 1376 if (!prison_flag(cred, PR_IP4)) { 1377 if (ia == NULL) { 1378 error = ENETUNREACH; 1379 goto done; 1380 } 1381 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1382 goto done; 1383 } 1384 1385 /* Jailed. */ 1386 if (ia != NULL) { 1387 struct ifnet *ifp; 1388 1389 ifp = ia->ia_ifp; 1390 ia = NULL; 1391 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1392 sa = ifa->ifa_addr; 1393 if (sa->sa_family != AF_INET) 1394 continue; 1395 sin = (struct sockaddr_in *)sa; 1396 if (prison_check_ip4(cred, 1397 &sin->sin_addr) == 0) { 1398 ia = (struct in_ifaddr *)ifa; 1399 break; 1400 } 1401 } 1402 if (ia != NULL) { 1403 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1404 goto done; 1405 } 1406 } 1407 1408 /* 3. As a last resort return the 'default' jail address. */ 1409 error = prison_get_ip4(cred, laddr); 1410 goto done; 1411 } 1412 1413 done: 1414 if (error == 0 && laddr->s_addr == INADDR_ANY) 1415 return (EHOSTUNREACH); 1416 return (error); 1417 } 1418 1419 void 1420 in_pcbdisconnect(struct inpcb *inp) 1421 { 1422 1423 INP_WLOCK_ASSERT(inp); 1424 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1425 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 1426 ("%s: inp %p was already disconnected", __func__, inp)); 1427 1428 in_pcbremhash_locked(inp); 1429 1430 /* See the comment in in_pcbinshash(). 
*/ 1431 inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); 1432 inp->inp_laddr.s_addr = INADDR_ANY; 1433 inp->inp_faddr.s_addr = INADDR_ANY; 1434 inp->inp_fport = 0; 1435 } 1436 #endif /* INET */ 1437 1438 void 1439 in_pcblisten(struct inpcb *inp) 1440 { 1441 struct inpcblbgroup *grp; 1442 1443 INP_WLOCK_ASSERT(inp); 1444 1445 if ((inp->inp_flags & INP_INLBGROUP) != 0) { 1446 struct inpcbinfo *pcbinfo; 1447 1448 pcbinfo = inp->inp_pcbinfo; 1449 INP_HASH_WLOCK(pcbinfo); 1450 grp = in_pcblbgroup_find(inp); 1451 LIST_REMOVE(inp, inp_lbgroup_list); 1452 grp->il_pendcnt--; 1453 in_pcblbgroup_insert(grp, inp); 1454 INP_HASH_WUNLOCK(pcbinfo); 1455 } 1456 } 1457 1458 /* 1459 * inpcb hash lookups are protected by SMR section. 1460 * 1461 * Once desired pcb has been found, switching from SMR section to a pcb 1462 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1463 * here because SMR is a critical section. 1464 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1465 */ 1466 void 1467 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1468 { 1469 1470 lock == INPLOOKUP_RLOCKPCB ? 1471 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1472 } 1473 1474 void 1475 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1476 { 1477 1478 lock == INPLOOKUP_RLOCKPCB ? 1479 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1480 } 1481 1482 int 1483 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1484 { 1485 1486 return (lock == INPLOOKUP_RLOCKPCB ? 
1487 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1488 } 1489 1490 static inline bool 1491 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) 1492 { 1493 1494 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1495 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1496 1497 if (__predict_true(inp_trylock(inp, lock))) { 1498 if (__predict_false(inp->inp_flags & ignflags)) { 1499 smr_exit(inp->inp_pcbinfo->ipi_smr); 1500 inp_unlock(inp, lock); 1501 return (false); 1502 } 1503 smr_exit(inp->inp_pcbinfo->ipi_smr); 1504 return (true); 1505 } 1506 1507 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1508 smr_exit(inp->inp_pcbinfo->ipi_smr); 1509 inp_lock(inp, lock); 1510 if (__predict_false(in_pcbrele(inp, lock))) 1511 return (false); 1512 /* 1513 * inp acquired through refcount & lock for sure didn't went 1514 * through uma_zfree(). However, it may have already went 1515 * through in_pcbfree() and has another reference, that 1516 * prevented its release by our in_pcbrele(). 1517 */ 1518 if (__predict_false(inp->inp_flags & ignflags)) { 1519 inp_unlock(inp, lock); 1520 return (false); 1521 } 1522 return (true); 1523 } else { 1524 smr_exit(inp->inp_pcbinfo->ipi_smr); 1525 return (false); 1526 } 1527 } 1528 1529 bool 1530 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1531 { 1532 1533 /* 1534 * in_pcblookup() family of functions ignore not only freed entries, 1535 * that may be found due to lockless access to the hash, but dropped 1536 * entries, too. 1537 */ 1538 return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED)); 1539 } 1540 1541 /* 1542 * inp_next() - inpcb hash/list traversal iterator 1543 * 1544 * Requires initialized struct inpcb_iterator for context. 1545 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1546 * 1547 * - Iterator can have either write-lock or read-lock semantics, that can not 1548 * be changed later. 
/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, which can not
 *   be changed later.
 * - Iterator can iterate either over the list of all pcbs (INP_ALL_LIST), or
 *   through a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have an optional bool matching function.  The matching
 *   function will be executed for each inpcb in the SMR context, so it can
 *   not acquire locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/* First element of the list selected by 'hash' (all-pcbs list or one slot). */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of 'inp' on the list selected by 'hash'. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that 'inp' is locked in the mode the iterator was created with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			/*
			 * _inp_smr_lock() leaves the SMR section whether it
			 * succeeds or fails.
			 */
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * Locking failed and we are outside SMR, so
				 * the old position is no longer a safe anchor:
				 * re-enter and start over from the head.
				 *
				 * NOTE(review): the for-loop increment runs
				 * after this block, so the head fetched here
				 * is stepped past before it is examined --
				 * confirm this is intended.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/*
		 * On success the SMR section was already left by
		 * _inp_smr_lock(); on an empty/exhausted list leave it here.
		 */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked by us and serves as the stable anchor. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' stores NULL into the context. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		/* in_pcbrele() returning true means the pcb is gone. */
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous position and advance the context. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
/*
 * Tear down an inpcb and drop a reference on it.
 *
 * Called with the inpcb write-locked and still attached to its socket (both
 * are asserted).  The inpcb is removed from the hash (if hashed) and from
 * the pcbinfo list, marked INP_FREED, detached from the socket, and its
 * route cache, MAC label, IPsec policy and IP options are released.
 * Finally a reference is dropped with in_pcbrele_wlocked(): if that was the
 * last one the inpcb memory is released there, otherwise (another thread
 * holds a reference taken with in_pcbref()) the free is deferred until that
 * reference is released with in_pcbrele_(r|w)locked().  Either way the
 * inpcb is unlocked on return and must not be used by the caller.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Unlink from the pcbinfo list and bump the generation count. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark as freed and sever the socket <-> inpcb linkage. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/*
	 * Keep the multicast options pointer in a local: it is released
	 * below, after the reference on the inpcb has been dropped.
	 */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * in_pcbrele_wlocked() unlocks the inpcb itself only when it drops
	 * the last reference; if references remain, unlock here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
	/* 'inp' must not be dereferenced past this point. */
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1861 * 1862 * It is used by TCP to mark an inpcb as unused and avoid future packet 1863 * delivery or event notification when a socket remains open but TCP has 1864 * closed. This might occur as a result of a shutdown()-initiated TCP close 1865 * or a RST on the wire, and allows the port binding to be reused while still 1866 * maintaining the invariant that so_pcb always points to a valid inpcb until 1867 * in_pcbdetach(). 1868 * 1869 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1870 * in_pcbpurgeif0()? 1871 */ 1872 void 1873 in_pcbdrop(struct inpcb *inp) 1874 { 1875 1876 INP_WLOCK_ASSERT(inp); 1877 1878 inp->inp_flags |= INP_DROPPED; 1879 if (inp->inp_flags & INP_INHASHLIST) 1880 in_pcbremhash(inp); 1881 } 1882 1883 #ifdef INET 1884 /* 1885 * Common routines to return the socket addresses associated with inpcbs. 1886 */ 1887 int 1888 in_getsockaddr(struct socket *so, struct sockaddr *sa) 1889 { 1890 struct inpcb *inp; 1891 1892 inp = sotoinpcb(so); 1893 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1894 1895 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1896 .sin_len = sizeof(struct sockaddr_in), 1897 .sin_family = AF_INET, 1898 .sin_port = inp->inp_lport, 1899 .sin_addr = inp->inp_laddr, 1900 }; 1901 1902 return (0); 1903 } 1904 1905 int 1906 in_getpeeraddr(struct socket *so, struct sockaddr *sa) 1907 { 1908 struct inpcb *inp; 1909 1910 inp = sotoinpcb(so); 1911 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1912 1913 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1914 .sin_len = sizeof(struct sockaddr_in), 1915 .sin_family = AF_INET, 1916 .sin_port = inp->inp_fport, 1917 .sin_addr = inp->inp_faddr, 1918 }; 1919 1920 return (0); 1921 } 1922 1923 static bool 1924 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1925 { 1926 1927 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1928 return (true); 1929 else 1930 return (false); 1931 } 1932 1933 void 1934 in_pcbpurgeif0(struct inpcbinfo 
*pcbinfo, struct ifnet *ifp) 1935 { 1936 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1937 inp_v4_multi_match, NULL); 1938 struct inpcb *inp; 1939 struct in_multi *inm; 1940 struct in_mfilter *imf; 1941 struct ip_moptions *imo; 1942 1943 IN_MULTI_LOCK_ASSERT(); 1944 1945 while ((inp = inp_next(&inpi)) != NULL) { 1946 INP_WLOCK_ASSERT(inp); 1947 1948 imo = inp->inp_moptions; 1949 /* 1950 * Unselect the outgoing interface if it is being 1951 * detached. 1952 */ 1953 if (imo->imo_multicast_ifp == ifp) 1954 imo->imo_multicast_ifp = NULL; 1955 1956 /* 1957 * Drop multicast group membership if we joined 1958 * through the interface being detached. 1959 * 1960 * XXX This can all be deferred to an epoch_call 1961 */ 1962 restart: 1963 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 1964 if ((inm = imf->imf_inm) == NULL) 1965 continue; 1966 if (inm->inm_ifp != ifp) 1967 continue; 1968 ip_mfilter_remove(&imo->imo_head, imf); 1969 in_leavegroup_locked(inm, NULL); 1970 ip_mfilter_free(imf); 1971 goto restart; 1972 } 1973 } 1974 } 1975 1976 /* 1977 * Lookup a PCB based on the local address and port. Caller must hold the 1978 * hash lock. No inpcb locks or references are acquired. 
1979 */ 1980 #define INP_LOOKUP_MAPPED_PCB_COST 3 1981 struct inpcb * 1982 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 1983 u_short lport, int fib, int lookupflags, struct ucred *cred) 1984 { 1985 struct inpcb *inp; 1986 #ifdef INET6 1987 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 1988 #else 1989 int matchwild = 3; 1990 #endif 1991 int wildcard; 1992 1993 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 1994 ("%s: invalid lookup flags %d", __func__, lookupflags)); 1995 KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs), 1996 ("%s: invalid fib %d", __func__, fib)); 1997 1998 INP_HASH_LOCK_ASSERT(pcbinfo); 1999 2000 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2001 struct inpcbhead *head; 2002 /* 2003 * Look for an unconnected (wildcard foreign addr) PCB that 2004 * matches the local address and port we're looking for. 2005 */ 2006 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2007 pcbinfo->ipi_hashmask)]; 2008 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2009 #ifdef INET6 2010 /* XXX inp locking */ 2011 if ((inp->inp_vflag & INP_IPV4) == 0) 2012 continue; 2013 #endif 2014 if (inp->inp_faddr.s_addr == INADDR_ANY && 2015 inp->inp_laddr.s_addr == laddr.s_addr && 2016 inp->inp_lport == lport && (fib == RT_ALL_FIBS || 2017 inp->inp_inc.inc_fibnum == fib)) { 2018 /* 2019 * Found? 2020 */ 2021 if (prison_equal_ip4(cred->cr_prison, 2022 inp->inp_cred->cr_prison)) 2023 return (inp); 2024 } 2025 } 2026 /* 2027 * Not found. 2028 */ 2029 return (NULL); 2030 } else { 2031 struct inpcbhead *porthash; 2032 struct inpcb *match = NULL; 2033 2034 /* 2035 * Port is in use by one or more PCBs. Look for best 2036 * fit. 
2037 */ 2038 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2039 pcbinfo->ipi_porthashmask)]; 2040 CK_LIST_FOREACH(inp, porthash, inp_portlist) { 2041 if (inp->inp_lport != lport) 2042 continue; 2043 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2044 cred->cr_prison)) 2045 continue; 2046 if (fib != RT_ALL_FIBS && 2047 inp->inp_inc.inc_fibnum != fib) 2048 continue; 2049 wildcard = 0; 2050 #ifdef INET6 2051 /* XXX inp locking */ 2052 if ((inp->inp_vflag & INP_IPV4) == 0) 2053 continue; 2054 /* 2055 * We never select the PCB that has INP_IPV6 flag and 2056 * is bound to :: if we have another PCB which is bound 2057 * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we 2058 * set its cost higher than IPv4 only PCBs. 2059 * 2060 * Note that the case only happens when a socket is 2061 * bound to ::, under the condition that the use of the 2062 * mapped address is allowed. 2063 */ 2064 if ((inp->inp_vflag & INP_IPV6) != 0) 2065 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2066 #endif 2067 if (inp->inp_faddr.s_addr != INADDR_ANY) 2068 wildcard++; 2069 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2070 if (laddr.s_addr == INADDR_ANY) 2071 wildcard++; 2072 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2073 continue; 2074 } else { 2075 if (laddr.s_addr != INADDR_ANY) 2076 wildcard++; 2077 } 2078 if (wildcard < matchwild) { 2079 match = inp; 2080 matchwild = wildcard; 2081 if (matchwild == 0) 2082 break; 2083 } 2084 } 2085 return (match); 2086 } 2087 } 2088 #undef INP_LOOKUP_MAPPED_PCB_COST 2089 2090 static bool 2091 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib) 2092 { 2093 return ((domain == M_NODOM || domain == grp->il_numa_domain) && 2094 (fib == RT_ALL_FIBS || fib == grp->il_fibnum)); 2095 } 2096 2097 static struct inpcb * 2098 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2099 const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, 2100 uint16_t lport, int domain, int fib) 2101 { 2102 const struct 
inpcblbgrouphead *hdr; 2103 struct inpcblbgroup *grp; 2104 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2105 struct inpcb *inp; 2106 u_int count; 2107 2108 INP_HASH_LOCK_ASSERT(pcbinfo); 2109 NET_EPOCH_ASSERT(); 2110 2111 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2112 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 2113 2114 /* 2115 * Search for an LB group match based on the following criteria: 2116 * - prefer jailed groups to non-jailed groups 2117 * - prefer exact source address matches to wildcard matches 2118 * - prefer groups bound to the specified NUMA domain 2119 */ 2120 jail_exact = jail_wild = local_exact = local_wild = NULL; 2121 CK_LIST_FOREACH(grp, hdr, il_list) { 2122 bool injail; 2123 2124 #ifdef INET6 2125 if (!(grp->il_vflag & INP_IPV4)) 2126 continue; 2127 #endif 2128 if (grp->il_lport != lport) 2129 continue; 2130 2131 injail = prison_flag(grp->il_cred, PR_IP4) != 0; 2132 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2133 laddr) != 0) 2134 continue; 2135 2136 if (grp->il_laddr.s_addr == laddr->s_addr) { 2137 if (injail) { 2138 jail_exact = grp; 2139 if (in_pcblookup_lb_match(grp, domain, fib)) 2140 /* This is a perfect match. */ 2141 goto out; 2142 } else if (local_exact == NULL || 2143 in_pcblookup_lb_match(grp, domain, fib)) { 2144 local_exact = grp; 2145 } 2146 } else if (grp->il_laddr.s_addr == INADDR_ANY) { 2147 if (injail) { 2148 if (jail_wild == NULL || 2149 in_pcblookup_lb_match(grp, domain, fib)) 2150 jail_wild = grp; 2151 } else if (local_wild == NULL || 2152 in_pcblookup_lb_match(grp, domain, fib)) { 2153 local_wild = grp; 2154 } 2155 } 2156 } 2157 2158 if (jail_exact != NULL) 2159 grp = jail_exact; 2160 else if (jail_wild != NULL) 2161 grp = jail_wild; 2162 else if (local_exact != NULL) 2163 grp = local_exact; 2164 else 2165 grp = local_wild; 2166 if (grp == NULL) 2167 return (NULL); 2168 2169 out: 2170 /* 2171 * Synchronize with in_pcblbgroup_insert(). 
2172 */ 2173 count = atomic_load_acq_int(&grp->il_inpcnt); 2174 if (count == 0) 2175 return (NULL); 2176 inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; 2177 KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); 2178 return (inp); 2179 } 2180 2181 static bool 2182 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr, 2183 u_short fport, struct in_addr laddr, u_short lport) 2184 { 2185 #ifdef INET6 2186 /* XXX inp locking */ 2187 if ((inp->inp_vflag & INP_IPV4) == 0) 2188 return (false); 2189 #endif 2190 if (inp->inp_faddr.s_addr == faddr.s_addr && 2191 inp->inp_laddr.s_addr == laddr.s_addr && 2192 inp->inp_fport == fport && 2193 inp->inp_lport == lport) 2194 return (true); 2195 return (false); 2196 } 2197 2198 static struct inpcb * 2199 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2200 u_short fport, struct in_addr laddr, u_short lport) 2201 { 2202 struct inpcbhead *head; 2203 struct inpcb *inp; 2204 2205 INP_HASH_LOCK_ASSERT(pcbinfo); 2206 2207 head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, 2208 pcbinfo->ipi_hashmask)]; 2209 CK_LIST_FOREACH(inp, head, inp_hash_exact) { 2210 if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport)) 2211 return (inp); 2212 } 2213 return (NULL); 2214 } 2215 2216 typedef enum { 2217 INPLOOKUP_MATCH_NONE = 0, 2218 INPLOOKUP_MATCH_WILD = 1, 2219 INPLOOKUP_MATCH_LADDR = 2, 2220 } inp_lookup_match_t; 2221 2222 static inp_lookup_match_t 2223 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, 2224 u_short lport, int fib) 2225 { 2226 #ifdef INET6 2227 /* XXX inp locking */ 2228 if ((inp->inp_vflag & INP_IPV4) == 0) 2229 return (INPLOOKUP_MATCH_NONE); 2230 #endif 2231 if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) 2232 return (INPLOOKUP_MATCH_NONE); 2233 if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib) 2234 return (INPLOOKUP_MATCH_NONE); 2235 if (inp->inp_laddr.s_addr == INADDR_ANY) 2236 return 
(INPLOOKUP_MATCH_WILD); 2237 if (inp->inp_laddr.s_addr == laddr.s_addr) 2238 return (INPLOOKUP_MATCH_LADDR); 2239 return (INPLOOKUP_MATCH_NONE); 2240 } 2241 2242 #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) 2243 2244 static struct inpcb * 2245 in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2246 u_short lport, int fib, const inp_lookup_t lockflags) 2247 { 2248 struct inpcbhead *head; 2249 struct inpcb *inp; 2250 2251 KASSERT(SMR_ENTERED(pcbinfo->ipi_smr), 2252 ("%s: not in SMR read section", __func__)); 2253 2254 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2255 pcbinfo->ipi_hashmask)]; 2256 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2257 inp_lookup_match_t match; 2258 2259 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2260 if (match == INPLOOKUP_MATCH_NONE) 2261 continue; 2262 2263 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2264 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2265 if (match != INPLOOKUP_MATCH_NONE && 2266 prison_check_ip4_locked(inp->inp_cred->cr_prison, 2267 &laddr) == 0) 2268 return (inp); 2269 inp_unlock(inp, lockflags); 2270 } 2271 2272 /* 2273 * The matching socket disappeared out from under us. Fall back 2274 * to a serialized lookup. 2275 */ 2276 return (INP_LOOKUP_AGAIN); 2277 } 2278 return (NULL); 2279 } 2280 2281 static struct inpcb * 2282 in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2283 u_short lport, int fib) 2284 { 2285 struct inpcbhead *head; 2286 struct inpcb *inp, *local_wild, *local_exact, *jail_wild; 2287 #ifdef INET6 2288 struct inpcb *local_wild_mapped; 2289 #endif 2290 2291 INP_HASH_LOCK_ASSERT(pcbinfo); 2292 2293 /* 2294 * Order of socket selection - we always prefer jails. 2295 * 1. jailed, non-wild. 2296 * 2. jailed, wild. 2297 * 3. non-jailed, non-wild. 2298 * 4. non-jailed, wild. 
2299 */ 2300 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2301 pcbinfo->ipi_hashmask)]; 2302 local_wild = local_exact = jail_wild = NULL; 2303 #ifdef INET6 2304 local_wild_mapped = NULL; 2305 #endif 2306 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2307 inp_lookup_match_t match; 2308 bool injail; 2309 2310 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2311 if (match == INPLOOKUP_MATCH_NONE) 2312 continue; 2313 2314 injail = prison_flag(inp->inp_cred, PR_IP4) != 0; 2315 if (injail) { 2316 if (prison_check_ip4_locked(inp->inp_cred->cr_prison, 2317 &laddr) != 0) 2318 continue; 2319 } else { 2320 if (local_exact != NULL) 2321 continue; 2322 } 2323 2324 if (match == INPLOOKUP_MATCH_LADDR) { 2325 if (injail) 2326 return (inp); 2327 local_exact = inp; 2328 } else { 2329 #ifdef INET6 2330 /* XXX inp locking, NULL check */ 2331 if (inp->inp_vflag & INP_IPV6PROTO) 2332 local_wild_mapped = inp; 2333 else 2334 #endif 2335 if (injail) 2336 jail_wild = inp; 2337 else 2338 local_wild = inp; 2339 } 2340 } 2341 if (jail_wild != NULL) 2342 return (jail_wild); 2343 if (local_exact != NULL) 2344 return (local_exact); 2345 if (local_wild != NULL) 2346 return (local_wild); 2347 #ifdef INET6 2348 if (local_wild_mapped != NULL) 2349 return (local_wild_mapped); 2350 #endif 2351 return (NULL); 2352 } 2353 2354 /* 2355 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2356 * that the caller has either locked the hash list, which usually happens 2357 * for bind(2) operations, or is in SMR section, which happens when sorting 2358 * out incoming packets. 
2359 */ 2360 static struct inpcb * 2361 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2362 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2363 uint8_t numa_domain, int fib) 2364 { 2365 struct inpcb *inp; 2366 const u_short fport = fport_arg, lport = lport_arg; 2367 2368 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0, 2369 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2370 KASSERT(faddr.s_addr != INADDR_ANY, 2371 ("%s: invalid foreign address", __func__)); 2372 KASSERT(laddr.s_addr != INADDR_ANY, 2373 ("%s: invalid local address", __func__)); 2374 INP_HASH_WLOCK_ASSERT(pcbinfo); 2375 2376 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2377 if (inp != NULL) 2378 return (inp); 2379 2380 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2381 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2382 &laddr, lport, numa_domain, fib); 2383 if (inp == NULL) { 2384 inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr, 2385 lport, fib); 2386 } 2387 } 2388 2389 return (inp); 2390 } 2391 2392 static struct inpcb * 2393 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2394 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2395 uint8_t numa_domain, int fib) 2396 { 2397 struct inpcb *inp; 2398 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2399 2400 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2401 ("%s: LOCKPCB not set", __func__)); 2402 2403 INP_HASH_WLOCK(pcbinfo); 2404 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2405 lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib); 2406 if (inp != NULL && !inp_trylock(inp, lockflags)) { 2407 in_pcbref(inp); 2408 INP_HASH_WUNLOCK(pcbinfo); 2409 inp_lock(inp, lockflags); 2410 if (in_pcbrele(inp, lockflags)) 2411 /* XXX-MJ or retry until we get a negative match? 
*/ 2412 inp = NULL; 2413 } else { 2414 INP_HASH_WUNLOCK(pcbinfo); 2415 } 2416 return (inp); 2417 } 2418 2419 static struct inpcb * 2420 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2421 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2422 uint8_t numa_domain, int fib) 2423 { 2424 struct inpcb *inp; 2425 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2426 const u_short fport = fport_arg, lport = lport_arg; 2427 2428 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2429 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2430 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2431 ("%s: LOCKPCB not set", __func__)); 2432 2433 smr_enter(pcbinfo->ipi_smr); 2434 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2435 if (inp != NULL) { 2436 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2437 /* 2438 * Revalidate the 4-tuple, the socket could have been 2439 * disconnected. 2440 */ 2441 if (__predict_true(in_pcblookup_exact_match(inp, 2442 faddr, fport, laddr, lport))) 2443 return (inp); 2444 inp_unlock(inp, lockflags); 2445 } 2446 2447 /* 2448 * We failed to lock the inpcb, or its connection state changed 2449 * out from under us. Fall back to a precise search. 
2450 */ 2451 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2452 lookupflags, numa_domain, fib)); 2453 } 2454 2455 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2456 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2457 &laddr, lport, numa_domain, fib); 2458 if (inp != NULL) { 2459 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2460 if (__predict_true(in_pcblookup_wild_match(inp, 2461 laddr, lport, fib) != INPLOOKUP_MATCH_NONE)) 2462 return (inp); 2463 inp_unlock(inp, lockflags); 2464 } 2465 inp = INP_LOOKUP_AGAIN; 2466 } else { 2467 inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport, 2468 fib, lockflags); 2469 } 2470 if (inp == INP_LOOKUP_AGAIN) { 2471 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, 2472 lport, lookupflags, numa_domain, fib)); 2473 } 2474 } 2475 2476 if (inp == NULL) 2477 smr_exit(pcbinfo->ipi_smr); 2478 2479 return (inp); 2480 } 2481 2482 /* 2483 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2484 * from which a pre-calculated hash value may be extracted. 2485 */ 2486 struct inpcb * 2487 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2488 struct in_addr laddr, u_int lport, int lookupflags, 2489 struct ifnet *ifp) 2490 { 2491 int fib; 2492 2493 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS; 2494 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2495 lookupflags, M_NODOM, fib)); 2496 } 2497 2498 struct inpcb * 2499 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2500 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2501 struct ifnet *ifp __unused, struct mbuf *m) 2502 { 2503 int fib; 2504 2505 M_ASSERTPKTHDR(m); 2506 fib = (lookupflags & INPLOOKUP_FIB) ? 
M_GETFIB(m) : RT_ALL_FIBS; 2507 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2508 lookupflags, m->m_pkthdr.numa_domain, fib)); 2509 } 2510 #endif /* INET */ 2511 2512 static bool 2513 in_pcbjailed(const struct inpcb *inp, unsigned int flag) 2514 { 2515 return (prison_flag(inp->inp_cred, flag) != 0); 2516 } 2517 2518 /* 2519 * Insert the PCB into a hash chain using ordering rules which ensure that 2520 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. 2521 * 2522 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs 2523 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs 2524 * always appear last no matter whether they are jailed. 2525 */ 2526 static void 2527 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2528 { 2529 struct inpcb *last; 2530 bool bound, injail; 2531 2532 INP_LOCK_ASSERT(inp); 2533 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2534 2535 last = NULL; 2536 bound = inp->inp_laddr.s_addr != INADDR_ANY; 2537 if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { 2538 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2539 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2540 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2541 return; 2542 } 2543 } 2544 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2545 return; 2546 } 2547 2548 injail = in_pcbjailed(inp, PR_IP4); 2549 if (!injail) { 2550 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2551 if (!in_pcbjailed(last, PR_IP4)) 2552 break; 2553 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2554 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2555 return; 2556 } 2557 } 2558 } else if (!CK_LIST_EMPTY(pcbhash) && 2559 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { 2560 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2561 return; 2562 } 2563 if (!bound) { 2564 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2565 if (last->inp_laddr.s_addr == INADDR_ANY) 2566 break; 2567 if 
(CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2568 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2569 return; 2570 } 2571 } 2572 } 2573 if (last == NULL) 2574 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2575 else 2576 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2577 } 2578 2579 #ifdef INET6 2580 /* 2581 * See the comment above _in_pcbinshash_wild(). 2582 */ 2583 static void 2584 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2585 { 2586 struct inpcb *last; 2587 bool bound, injail; 2588 2589 INP_LOCK_ASSERT(inp); 2590 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2591 2592 last = NULL; 2593 bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); 2594 injail = in_pcbjailed(inp, PR_IP6); 2595 if (!injail) { 2596 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2597 if (!in_pcbjailed(last, PR_IP6)) 2598 break; 2599 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2600 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2601 return; 2602 } 2603 } 2604 } else if (!CK_LIST_EMPTY(pcbhash) && 2605 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { 2606 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2607 return; 2608 } 2609 if (!bound) { 2610 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2611 if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) 2612 break; 2613 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2614 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2615 return; 2616 } 2617 } 2618 } 2619 if (last == NULL) 2620 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2621 else 2622 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2623 } 2624 #endif 2625 2626 /* 2627 * Insert PCB onto various hash lists. 2628 * 2629 * With normal sockets this function shall not fail, so it could return void. 2630 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held, 2631 * that's the only condition when it can fail. 
2632 */ 2633 int 2634 in_pcbinshash(struct inpcb *inp) 2635 { 2636 struct inpcbhead *pcbhash, *pcbporthash; 2637 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2638 uint32_t hash; 2639 bool connected; 2640 2641 INP_WLOCK_ASSERT(inp); 2642 INP_HASH_WLOCK_ASSERT(pcbinfo); 2643 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2644 ("in_pcbinshash: INP_INHASHLIST")); 2645 2646 #ifdef INET6 2647 if (inp->inp_vflag & INP_IPV6) { 2648 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2649 inp->inp_fport, pcbinfo->ipi_hashmask); 2650 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2651 } else 2652 #endif 2653 { 2654 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2655 inp->inp_fport, pcbinfo->ipi_hashmask); 2656 connected = !in_nullhost(inp->inp_faddr); 2657 } 2658 2659 if (connected) 2660 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2661 else 2662 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2663 2664 pcbporthash = &pcbinfo->ipi_porthashbase[ 2665 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2666 2667 /* 2668 * Add entry to load balance group. 2669 * Only do this if SO_REUSEPORT_LB is set. 2670 */ 2671 if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2672 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2673 if (error != 0) 2674 return (error); 2675 } 2676 2677 /* 2678 * The PCB may have been disconnected in the past. Before we can safely 2679 * make it visible in the hash table, we must wait for all readers which 2680 * may be traversing this PCB to finish. 
2681 */ 2682 if (inp->inp_smr != SMR_SEQ_INVALID) { 2683 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2684 inp->inp_smr = SMR_SEQ_INVALID; 2685 } 2686 2687 if (connected) 2688 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2689 else { 2690 #ifdef INET6 2691 if ((inp->inp_vflag & INP_IPV6) != 0) 2692 _in6_pcbinshash_wild(pcbhash, inp); 2693 else 2694 #endif 2695 _in_pcbinshash_wild(pcbhash, inp); 2696 } 2697 CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); 2698 inp->inp_flags |= INP_INHASHLIST; 2699 2700 return (0); 2701 } 2702 2703 void 2704 in_pcbremhash_locked(struct inpcb *inp) 2705 { 2706 2707 INP_WLOCK_ASSERT(inp); 2708 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2709 MPASS(inp->inp_flags & INP_INHASHLIST); 2710 2711 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2712 in_pcbremlbgrouphash(inp); 2713 #ifdef INET6 2714 if (inp->inp_vflag & INP_IPV6) { 2715 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2716 CK_LIST_REMOVE(inp, inp_hash_wild); 2717 else 2718 CK_LIST_REMOVE(inp, inp_hash_exact); 2719 } else 2720 #endif 2721 { 2722 if (in_nullhost(inp->inp_faddr)) 2723 CK_LIST_REMOVE(inp, inp_hash_wild); 2724 else 2725 CK_LIST_REMOVE(inp, inp_hash_exact); 2726 } 2727 CK_LIST_REMOVE(inp, inp_portlist); 2728 inp->inp_flags &= ~INP_INHASHLIST; 2729 } 2730 2731 static void 2732 in_pcbremhash(struct inpcb *inp) 2733 { 2734 INP_HASH_WLOCK(inp->inp_pcbinfo); 2735 in_pcbremhash_locked(inp); 2736 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2737 } 2738 2739 /* 2740 * Move PCB to the proper hash bucket when { faddr, fport } have been 2741 * changed. NOTE: This does not handle the case of the lport changing (the 2742 * hashed port list would have to be updated as well), so the lport must 2743 * not change after in_pcbinshash() has been called. 
2744 */ 2745 void 2746 in_pcbrehash(struct inpcb *inp) 2747 { 2748 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2749 struct inpcbhead *head; 2750 uint32_t hash; 2751 bool connected; 2752 2753 INP_WLOCK_ASSERT(inp); 2754 INP_HASH_WLOCK_ASSERT(pcbinfo); 2755 KASSERT(inp->inp_flags & INP_INHASHLIST, 2756 ("%s: !INP_INHASHLIST", __func__)); 2757 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2758 ("%s: inp was disconnected", __func__)); 2759 2760 #ifdef INET6 2761 if (inp->inp_vflag & INP_IPV6) { 2762 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2763 inp->inp_fport, pcbinfo->ipi_hashmask); 2764 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2765 } else 2766 #endif 2767 { 2768 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2769 inp->inp_fport, pcbinfo->ipi_hashmask); 2770 connected = !in_nullhost(inp->inp_faddr); 2771 } 2772 2773 /* 2774 * When rehashing, the caller must ensure that either the new or the old 2775 * foreign address was unspecified. 2776 */ 2777 if (connected) 2778 CK_LIST_REMOVE(inp, inp_hash_wild); 2779 else 2780 CK_LIST_REMOVE(inp, inp_hash_exact); 2781 2782 if (connected) { 2783 head = &pcbinfo->ipi_hash_exact[hash]; 2784 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2785 } else { 2786 head = &pcbinfo->ipi_hash_wild[hash]; 2787 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2788 } 2789 } 2790 2791 /* 2792 * Check for alternatives when higher level complains 2793 * about service problems. For now, invalidate cached 2794 * routing information. If the route was created dynamically 2795 * (by a redirect), time to try a default gateway again. 2796 */ 2797 void 2798 in_losing(struct inpcb *inp) 2799 { 2800 2801 RO_INVALIDATE_CACHE(&inp->inp_route); 2802 return; 2803 } 2804 2805 /* 2806 * A set label operation has occurred at the socket layer, propagate the 2807 * label change into the in_pcb for the socket. 
2808 */ 2809 void 2810 in_pcbsosetlabel(struct socket *so) 2811 { 2812 #ifdef MAC 2813 struct inpcb *inp; 2814 2815 inp = sotoinpcb(so); 2816 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2817 2818 INP_WLOCK(inp); 2819 SOCK_LOCK(so); 2820 mac_inpcb_sosetlabel(so, inp); 2821 SOCK_UNLOCK(so); 2822 INP_WUNLOCK(inp); 2823 #endif 2824 } 2825 2826 void 2827 inp_wlock(struct inpcb *inp) 2828 { 2829 2830 INP_WLOCK(inp); 2831 } 2832 2833 void 2834 inp_wunlock(struct inpcb *inp) 2835 { 2836 2837 INP_WUNLOCK(inp); 2838 } 2839 2840 void 2841 inp_rlock(struct inpcb *inp) 2842 { 2843 2844 INP_RLOCK(inp); 2845 } 2846 2847 void 2848 inp_runlock(struct inpcb *inp) 2849 { 2850 2851 INP_RUNLOCK(inp); 2852 } 2853 2854 #ifdef INVARIANT_SUPPORT 2855 void 2856 inp_lock_assert(struct inpcb *inp) 2857 { 2858 2859 INP_WLOCK_ASSERT(inp); 2860 } 2861 2862 void 2863 inp_unlock_assert(struct inpcb *inp) 2864 { 2865 2866 INP_UNLOCK_ASSERT(inp); 2867 } 2868 #endif 2869 2870 void 2871 inp_apply_all(struct inpcbinfo *pcbinfo, 2872 void (*func)(struct inpcb *, void *), void *arg) 2873 { 2874 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2875 INPLOOKUP_WLOCKPCB); 2876 struct inpcb *inp; 2877 2878 while ((inp = inp_next(&inpi)) != NULL) 2879 func(inp, arg); 2880 } 2881 2882 struct socket * 2883 inp_inpcbtosocket(struct inpcb *inp) 2884 { 2885 2886 INP_WLOCK_ASSERT(inp); 2887 return (inp->inp_socket); 2888 } 2889 2890 void 2891 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2892 uint32_t *faddr, uint16_t *fp) 2893 { 2894 2895 INP_LOCK_ASSERT(inp); 2896 *laddr = inp->inp_laddr.s_addr; 2897 *faddr = inp->inp_faddr.s_addr; 2898 *lp = inp->inp_lport; 2899 *fp = inp->inp_fport; 2900 } 2901 2902 /* 2903 * Create an external-format (``xinpcb'') structure using the information in 2904 * the kernel-format in_pcb structure pointed to by inp. 
This is done to 2905 * reduce the spew of irrelevant information over this interface, to isolate 2906 * user code from changes in the kernel structure, and potentially to provide 2907 * information-hiding if we decide that some of this information should be 2908 * hidden from users. 2909 */ 2910 void 2911 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2912 { 2913 2914 bzero(xi, sizeof(*xi)); 2915 xi->xi_len = sizeof(struct xinpcb); 2916 if (inp->inp_socket) 2917 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2918 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2919 xi->inp_gencnt = inp->inp_gencnt; 2920 xi->inp_flow = inp->inp_flow; 2921 xi->inp_flowid = inp->inp_flowid; 2922 xi->inp_flowtype = inp->inp_flowtype; 2923 xi->inp_flags = inp->inp_flags; 2924 xi->inp_flags2 = inp->inp_flags2; 2925 xi->in6p_cksum = inp->in6p_cksum; 2926 xi->in6p_hops = inp->in6p_hops; 2927 xi->inp_ip_tos = inp->inp_ip_tos; 2928 xi->inp_vflag = inp->inp_vflag; 2929 xi->inp_ip_ttl = inp->inp_ip_ttl; 2930 xi->inp_ip_p = inp->inp_ip_p; 2931 xi->inp_ip_minttl = inp->inp_ip_minttl; 2932 } 2933 2934 int 2935 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2936 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2937 { 2938 struct sockopt sopt; 2939 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2940 INPLOOKUP_WLOCKPCB); 2941 struct inpcb *inp; 2942 struct sockopt_parameters *params; 2943 struct socket *so; 2944 int error; 2945 char buf[1024]; 2946 2947 if (req->oldptr != NULL || req->oldlen != 0) 2948 return (EINVAL); 2949 if (req->newptr == NULL) 2950 return (EPERM); 2951 if (req->newlen > sizeof(buf)) 2952 return (ENOMEM); 2953 error = SYSCTL_IN(req, buf, req->newlen); 2954 if (error != 0) 2955 return (error); 2956 if (req->newlen < sizeof(struct sockopt_parameters)) 2957 return (EINVAL); 2958 params = (struct sockopt_parameters *)buf; 2959 sopt.sopt_level = params->sop_level; 2960 sopt.sopt_name = params->sop_optname; 2961 
sopt.sopt_dir = SOPT_SET; 2962 sopt.sopt_val = params->sop_optval; 2963 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2964 sopt.sopt_td = NULL; 2965 #ifdef INET6 2966 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2967 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2968 params->sop_inc.inc6_laddr.s6_addr16[1] = 2969 htons(params->sop_inc.inc6_zoneid & 0xffff); 2970 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2971 params->sop_inc.inc6_faddr.s6_addr16[1] = 2972 htons(params->sop_inc.inc6_zoneid & 0xffff); 2973 } 2974 #endif 2975 if (params->sop_inc.inc_lport != htons(0) && 2976 params->sop_inc.inc_fport != htons(0)) { 2977 #ifdef INET6 2978 if (params->sop_inc.inc_flags & INC_ISIPV6) 2979 inpi.hash = INP6_PCBHASH( 2980 ¶ms->sop_inc.inc6_faddr, 2981 params->sop_inc.inc_lport, 2982 params->sop_inc.inc_fport, 2983 pcbinfo->ipi_hashmask); 2984 else 2985 #endif 2986 inpi.hash = INP_PCBHASH( 2987 ¶ms->sop_inc.inc_faddr, 2988 params->sop_inc.inc_lport, 2989 params->sop_inc.inc_fport, 2990 pcbinfo->ipi_hashmask); 2991 } 2992 while ((inp = inp_next(&inpi)) != NULL) 2993 if (inp->inp_gencnt == params->sop_id) { 2994 if (inp->inp_flags & INP_DROPPED) { 2995 INP_WUNLOCK(inp); 2996 return (ECONNRESET); 2997 } 2998 so = inp->inp_socket; 2999 KASSERT(so != NULL, ("inp_socket == NULL")); 3000 soref(so); 3001 if (params->sop_level == SOL_SOCKET) { 3002 INP_WUNLOCK(inp); 3003 error = sosetopt(so, &sopt); 3004 } else 3005 error = (*ctloutput_set)(inp, &sopt); 3006 sorele(so); 3007 break; 3008 } 3009 if (inp == NULL) 3010 error = ESRCH; 3011 return (error); 3012 } 3013 3014 #ifdef DDB 3015 static void 3016 db_print_indent(int indent) 3017 { 3018 int i; 3019 3020 for (i = 0; i < indent; i++) 3021 db_printf(" "); 3022 } 3023 3024 static void 3025 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 3026 { 3027 char faddr_str[48], laddr_str[48]; 3028 3029 db_print_indent(indent); 3030 db_printf("%s at %p\n", name, inc); 3031 3032 
indent += 2; 3033 3034 #ifdef INET6 3035 if (inc->inc_flags & INC_ISIPV6) { 3036 /* IPv6. */ 3037 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3038 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3039 } else 3040 #endif 3041 { 3042 /* IPv4. */ 3043 inet_ntoa_r(inc->inc_laddr, laddr_str); 3044 inet_ntoa_r(inc->inc_faddr, faddr_str); 3045 } 3046 db_print_indent(indent); 3047 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3048 ntohs(inc->inc_lport)); 3049 db_print_indent(indent); 3050 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3051 ntohs(inc->inc_fport)); 3052 } 3053 3054 static void 3055 db_print_inpflags(int inp_flags) 3056 { 3057 int comma; 3058 3059 comma = 0; 3060 if (inp_flags & INP_RECVOPTS) { 3061 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 3062 comma = 1; 3063 } 3064 if (inp_flags & INP_RECVRETOPTS) { 3065 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 3066 comma = 1; 3067 } 3068 if (inp_flags & INP_RECVDSTADDR) { 3069 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 3070 comma = 1; 3071 } 3072 if (inp_flags & INP_ORIGDSTADDR) { 3073 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 3074 comma = 1; 3075 } 3076 if (inp_flags & INP_HDRINCL) { 3077 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 3078 comma = 1; 3079 } 3080 if (inp_flags & INP_HIGHPORT) { 3081 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 3082 comma = 1; 3083 } 3084 if (inp_flags & INP_LOWPORT) { 3085 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 3086 comma = 1; 3087 } 3088 if (inp_flags & INP_ANONPORT) { 3089 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 3090 comma = 1; 3091 } 3092 if (inp_flags & INP_RECVIF) { 3093 db_printf("%sINP_RECVIF", comma ? ", " : ""); 3094 comma = 1; 3095 } 3096 if (inp_flags & INP_MTUDISC) { 3097 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 3098 comma = 1; 3099 } 3100 if (inp_flags & INP_RECVTTL) { 3101 db_printf("%sINP_RECVTTL", comma ? 
", " : ""); 3102 comma = 1; 3103 } 3104 if (inp_flags & INP_DONTFRAG) { 3105 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 3106 comma = 1; 3107 } 3108 if (inp_flags & INP_RECVTOS) { 3109 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 3110 comma = 1; 3111 } 3112 if (inp_flags & IN6P_IPV6_V6ONLY) { 3113 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3114 comma = 1; 3115 } 3116 if (inp_flags & IN6P_PKTINFO) { 3117 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3118 comma = 1; 3119 } 3120 if (inp_flags & IN6P_HOPLIMIT) { 3121 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3122 comma = 1; 3123 } 3124 if (inp_flags & IN6P_HOPOPTS) { 3125 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3126 comma = 1; 3127 } 3128 if (inp_flags & IN6P_DSTOPTS) { 3129 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3130 comma = 1; 3131 } 3132 if (inp_flags & IN6P_RTHDR) { 3133 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3134 comma = 1; 3135 } 3136 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3137 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3138 comma = 1; 3139 } 3140 if (inp_flags & IN6P_TCLASS) { 3141 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3142 comma = 1; 3143 } 3144 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3145 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3146 comma = 1; 3147 } 3148 if (inp_flags & INP_ONESBCAST) { 3149 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3150 comma = 1; 3151 } 3152 if (inp_flags & INP_DROPPED) { 3153 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3154 comma = 1; 3155 } 3156 if (inp_flags & INP_SOCKREF) { 3157 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3158 comma = 1; 3159 } 3160 if (inp_flags & IN6P_RFC2292) { 3161 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3162 comma = 1; 3163 } 3164 if (inp_flags & IN6P_MTU) { 3165 db_printf("IN6P_MTU%s", comma ? 
", " : ""); 3166 comma = 1; 3167 } 3168 } 3169 3170 static void 3171 db_print_inpvflag(u_char inp_vflag) 3172 { 3173 int comma; 3174 3175 comma = 0; 3176 if (inp_vflag & INP_IPV4) { 3177 db_printf("%sINP_IPV4", comma ? ", " : ""); 3178 comma = 1; 3179 } 3180 if (inp_vflag & INP_IPV6) { 3181 db_printf("%sINP_IPV6", comma ? ", " : ""); 3182 comma = 1; 3183 } 3184 if (inp_vflag & INP_IPV6PROTO) { 3185 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3186 comma = 1; 3187 } 3188 } 3189 3190 static void 3191 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3192 { 3193 3194 db_print_indent(indent); 3195 db_printf("%s at %p\n", name, inp); 3196 3197 indent += 2; 3198 3199 db_print_indent(indent); 3200 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3201 3202 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3203 3204 db_print_indent(indent); 3205 db_printf("inp_label: %p inp_flags: 0x%x (", 3206 inp->inp_label, inp->inp_flags); 3207 db_print_inpflags(inp->inp_flags); 3208 db_printf(")\n"); 3209 3210 db_print_indent(indent); 3211 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3212 inp->inp_vflag); 3213 db_print_inpvflag(inp->inp_vflag); 3214 db_printf(")\n"); 3215 3216 db_print_indent(indent); 3217 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3218 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3219 3220 db_print_indent(indent); 3221 #ifdef INET6 3222 if (inp->inp_vflag & INP_IPV6) { 3223 db_printf("in6p_options: %p in6p_outputopts: %p " 3224 "in6p_moptions: %p\n", inp->in6p_options, 3225 inp->in6p_outputopts, inp->in6p_moptions); 3226 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3227 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3228 inp->in6p_hops); 3229 } else 3230 #endif 3231 { 3232 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3233 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3234 inp->inp_options, inp->inp_moptions); 3235 } 3236 3237 db_print_indent(indent); 3238 db_printf("inp_gencnt: %ju\n", 
(uintmax_t)inp->inp_gencnt); 3239 } 3240 3241 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3242 { 3243 struct inpcb *inp; 3244 3245 if (!have_addr) { 3246 db_printf("usage: show inpcb <addr>\n"); 3247 return; 3248 } 3249 inp = (struct inpcb *)addr; 3250 3251 db_print_inpcb(inp, "inpcb", 0); 3252 } 3253 #endif /* DDB */ 3254 3255 #ifdef RATELIMIT 3256 /* 3257 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3258 * if any. 3259 */ 3260 int 3261 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3262 { 3263 union if_snd_tag_modify_params params = { 3264 .rate_limit.max_rate = max_pacing_rate, 3265 .rate_limit.flags = M_NOWAIT, 3266 }; 3267 struct m_snd_tag *mst; 3268 int error; 3269 3270 mst = inp->inp_snd_tag; 3271 if (mst == NULL) 3272 return (EINVAL); 3273 3274 if (mst->sw->snd_tag_modify == NULL) { 3275 error = EOPNOTSUPP; 3276 } else { 3277 error = mst->sw->snd_tag_modify(mst, ¶ms); 3278 } 3279 return (error); 3280 } 3281 3282 /* 3283 * Query existing TX rate limit based on the existing 3284 * "inp->inp_snd_tag", if any. 3285 */ 3286 int 3287 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3288 { 3289 union if_snd_tag_query_params params = { }; 3290 struct m_snd_tag *mst; 3291 int error; 3292 3293 mst = inp->inp_snd_tag; 3294 if (mst == NULL) 3295 return (EINVAL); 3296 3297 if (mst->sw->snd_tag_query == NULL) { 3298 error = EOPNOTSUPP; 3299 } else { 3300 error = mst->sw->snd_tag_query(mst, ¶ms); 3301 if (error == 0 && p_max_pacing_rate != NULL) 3302 *p_max_pacing_rate = params.rate_limit.max_rate; 3303 } 3304 return (error); 3305 } 3306 3307 /* 3308 * Query existing TX queue level based on the existing 3309 * "inp->inp_snd_tag", if any. 
3310 */ 3311 int 3312 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3313 { 3314 union if_snd_tag_query_params params = { }; 3315 struct m_snd_tag *mst; 3316 int error; 3317 3318 mst = inp->inp_snd_tag; 3319 if (mst == NULL) 3320 return (EINVAL); 3321 3322 if (mst->sw->snd_tag_query == NULL) 3323 return (EOPNOTSUPP); 3324 3325 error = mst->sw->snd_tag_query(mst, ¶ms); 3326 if (error == 0 && p_txqueue_level != NULL) 3327 *p_txqueue_level = params.rate_limit.queue_level; 3328 return (error); 3329 } 3330 3331 /* 3332 * Allocate a new TX rate limit send tag from the network interface 3333 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3334 */ 3335 int 3336 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3337 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3338 3339 { 3340 union if_snd_tag_alloc_params params = { 3341 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3342 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3343 .rate_limit.hdr.flowid = flowid, 3344 .rate_limit.hdr.flowtype = flowtype, 3345 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3346 .rate_limit.max_rate = max_pacing_rate, 3347 .rate_limit.flags = M_NOWAIT, 3348 }; 3349 int error; 3350 3351 INP_WLOCK_ASSERT(inp); 3352 3353 /* 3354 * If there is already a send tag, or the INP is being torn 3355 * down, allocating a new send tag is not allowed. Else send 3356 * tags may leak. 
3357 */ 3358 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3359 return (EINVAL); 3360 3361 error = m_snd_tag_alloc(ifp, ¶ms, st); 3362 #ifdef INET 3363 if (error == 0) { 3364 counter_u64_add(rate_limit_set_ok, 1); 3365 counter_u64_add(rate_limit_active, 1); 3366 } else if (error != EOPNOTSUPP) 3367 counter_u64_add(rate_limit_alloc_fail, 1); 3368 #endif 3369 return (error); 3370 } 3371 3372 void 3373 in_pcbdetach_tag(struct m_snd_tag *mst) 3374 { 3375 3376 m_snd_tag_rele(mst); 3377 #ifdef INET 3378 counter_u64_add(rate_limit_active, -1); 3379 #endif 3380 } 3381 3382 /* 3383 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3384 * if any: 3385 */ 3386 void 3387 in_pcbdetach_txrtlmt(struct inpcb *inp) 3388 { 3389 struct m_snd_tag *mst; 3390 3391 INP_WLOCK_ASSERT(inp); 3392 3393 mst = inp->inp_snd_tag; 3394 inp->inp_snd_tag = NULL; 3395 3396 if (mst == NULL) 3397 return; 3398 3399 m_snd_tag_rele(mst); 3400 #ifdef INET 3401 counter_u64_add(rate_limit_active, -1); 3402 #endif 3403 } 3404 3405 int 3406 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3407 { 3408 int error; 3409 3410 /* 3411 * If the existing send tag is for the wrong interface due to 3412 * a route change, first drop the existing tag. Set the 3413 * CHANGED flag so that we will keep trying to allocate a new 3414 * tag if we fail to allocate one this time. 3415 */ 3416 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3417 in_pcbdetach_txrtlmt(inp); 3418 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3419 } 3420 3421 /* 3422 * NOTE: When attaching to a network interface a reference is 3423 * made to ensure the network interface doesn't go away until 3424 * all ratelimit connections are gone. The network interface 3425 * pointers compared below represent valid network interfaces, 3426 * except when comparing towards NULL. 
3427 */ 3428 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3429 error = 0; 3430 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3431 if (inp->inp_snd_tag != NULL) 3432 in_pcbdetach_txrtlmt(inp); 3433 error = 0; 3434 } else if (inp->inp_snd_tag == NULL) { 3435 /* 3436 * In order to utilize packet pacing with RSS, we need 3437 * to wait until there is a valid RSS hash before we 3438 * can proceed: 3439 */ 3440 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3441 error = EAGAIN; 3442 } else { 3443 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3444 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3445 } 3446 } else { 3447 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3448 } 3449 if (error == 0 || error == EOPNOTSUPP) 3450 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3451 3452 return (error); 3453 } 3454 3455 /* 3456 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3457 * is set in the fast path and will attach/detach/modify the TX rate 3458 * limit send tag based on the socket's so_max_pacing_rate value. 3459 */ 3460 void 3461 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3462 { 3463 struct socket *socket; 3464 uint32_t max_pacing_rate; 3465 bool did_upgrade; 3466 3467 if (inp == NULL) 3468 return; 3469 3470 socket = inp->inp_socket; 3471 if (socket == NULL) 3472 return; 3473 3474 if (!INP_WLOCKED(inp)) { 3475 /* 3476 * NOTE: If the write locking fails, we need to bail 3477 * out and use the non-ratelimited ring for the 3478 * transmit until there is a new chance to get the 3479 * write lock. 3480 */ 3481 if (!INP_TRY_UPGRADE(inp)) 3482 return; 3483 did_upgrade = 1; 3484 } else { 3485 did_upgrade = 0; 3486 } 3487 3488 /* 3489 * NOTE: The so_max_pacing_rate value is read unlocked, 3490 * because atomic updates are not required since the variable 3491 * is checked at every mbuf we send. It is assumed that the 3492 * variable read itself will be atomic. 
3493 */ 3494 max_pacing_rate = socket->so_max_pacing_rate; 3495 3496 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3497 3498 if (did_upgrade) 3499 INP_DOWNGRADE(inp); 3500 } 3501 3502 /* 3503 * Track route changes for TX rate limiting. 3504 */ 3505 void 3506 in_pcboutput_eagain(struct inpcb *inp) 3507 { 3508 bool did_upgrade; 3509 3510 if (inp == NULL) 3511 return; 3512 3513 if (inp->inp_snd_tag == NULL) 3514 return; 3515 3516 if (!INP_WLOCKED(inp)) { 3517 /* 3518 * NOTE: If the write locking fails, we need to bail 3519 * out and use the non-ratelimited ring for the 3520 * transmit until there is a new chance to get the 3521 * write lock. 3522 */ 3523 if (!INP_TRY_UPGRADE(inp)) 3524 return; 3525 did_upgrade = 1; 3526 } else { 3527 did_upgrade = 0; 3528 } 3529 3530 /* detach rate limiting */ 3531 in_pcbdetach_txrtlmt(inp); 3532 3533 /* make sure new mbuf send tag allocation is made */ 3534 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3535 3536 if (did_upgrade) 3537 INP_DOWNGRADE(inp); 3538 } 3539 3540 #ifdef INET 3541 static void 3542 rl_init(void *st) 3543 { 3544 rate_limit_new = counter_u64_alloc(M_WAITOK); 3545 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3546 rate_limit_active = counter_u64_alloc(M_WAITOK); 3547 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3548 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3549 } 3550 3551 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3552 #endif 3553 #endif /* RATELIMIT */ 3554