1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include <sys/cdefs.h> 40 #include "opt_ddb.h" 41 #include "opt_ipsec.h" 42 #include "opt_inet.h" 43 #include "opt_inet6.h" 44 #include "opt_ratelimit.h" 45 #include "opt_route.h" 46 #include "opt_rss.h" 47 48 #include <sys/param.h> 49 #include <sys/hash.h> 50 #include <sys/systm.h> 51 #include <sys/libkern.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/eventhandler.h> 56 #include <sys/domain.h> 57 #include <sys/proc.h> 58 #include <sys/protosw.h> 59 #include <sys/smp.h> 60 #include <sys/smr.h> 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <sys/sockio.h> 64 #include <sys/priv.h> 65 #include <sys/proc.h> 66 #include <sys/refcount.h> 67 #include <sys/jail.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 71 #ifdef DDB 72 #include <ddb/ddb.h> 73 #endif 74 75 #include <vm/uma.h> 76 #include <vm/vm.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_private.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #include <netinet/tcp.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/in6_pcb.h> 100 #include <netinet6/in6_var.h> 101 #include <netinet6/ip6_var.h> 102 #endif /* INET6 */ 103 #include <net/route/nhop.h> 104 #endif 105 106 #include <netipsec/ipsec_support.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #define INPCBLBGROUP_SIZMIN 8 111 #define INPCBLBGROUP_SIZMAX 256 112 113 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 114 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 115 116 /* 117 * These configure the range of local port addresses assigned to 118 * "unspecified" outgoing connections/packets/whatever. 119 */ 120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 126 127 /* 128 * Reserved ports accessible only to root. There are significant 129 * security considerations that must be accounted for when changing these, 130 * but the security benefits can be great. Please be careful. 131 */ 132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 133 VNET_DEFINE(int, ipport_reservedlow); 134 135 /* Enable random ephemeral port allocation by default. 
*/ 136 VNET_DEFINE(int, ipport_randomized) = 1; 137 138 #ifdef INET 139 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 140 struct in_addr faddr, u_int fport_arg, 141 struct in_addr laddr, u_int lport_arg, 142 int lookupflags, uint8_t numa_domain, int fib); 143 144 #define RANGECHK(var, min, max) \ 145 if ((var) < (min)) { (var) = (min); } \ 146 else if ((var) > (max)) { (var) = (max); } 147 148 static int 149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 150 { 151 int error; 152 153 error = sysctl_handle_int(oidp, arg1, arg2, req); 154 if (error == 0) { 155 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 157 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 160 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 161 } 162 return (error); 163 } 164 165 #undef RANGECHK 166 167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 168 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 169 "IP Ports"); 170 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 172 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 174 ""); 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 
188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 196 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 197 &VNET_NAME(ipport_reservedhigh), 0, ""); 198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 199 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 201 CTLFLAG_VNET | CTLFLAG_RW, 202 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 203 204 #ifdef RATELIMIT 205 counter_u64_t rate_limit_new; 206 counter_u64_t rate_limit_chg; 207 counter_u64_t rate_limit_active; 208 counter_u64_t rate_limit_alloc_fail; 209 counter_u64_t rate_limit_set_ok; 210 211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 212 "IP Rate Limiting"); 213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 214 &rate_limit_active, "Active rate limited connections"); 215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 216 &rate_limit_alloc_fail, "Rate limited connection failures"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 218 &rate_limit_set_ok, "Rate limited setting succeeded"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 220 &rate_limit_new, "Total Rate limit new attempts"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 222 &rate_limit_chg, "Total Rate limited change attempts"); 223 #endif /* RATELIMIT */ 224 225 #endif /* INET */ 226 227 VNET_DEFINE(uint32_t, in_pcbhashseed); 228 static void 229 in_pcbhashseed_init(void) 230 { 231 232 V_in_pcbhashseed = arc4random(); 233 } 234 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 235 in_pcbhashseed_init, NULL); 236 237 #ifdef INET 238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0; 239 #define V_connect_inaddr_wild VNET(connect_inaddr_wild) 240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, 241 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, 242 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); 243 #endif 244 245 static void in_pcbremhash(struct inpcb *); 246 247 /* 248 * in_pcb.c: manage the Protocol Control Blocks. 249 * 250 * NOTE: It is assumed that most of these functions will be called with 251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 252 * functions often modify hash chains or addresses in pcbs. 253 */ 254 255 static struct inpcblbgroup * 256 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, 257 const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib) 258 { 259 struct inpcblbgroup *grp; 260 size_t bytes; 261 262 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 263 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 264 if (grp == NULL) 265 return (NULL); 266 LIST_INIT(&grp->il_pending); 267 grp->il_cred = crhold(cred); 268 grp->il_vflag = vflag; 269 grp->il_lport = port; 270 grp->il_numa_domain = numa_domain; 271 grp->il_fibnum = fib; 272 grp->il_dependladdr = *addr; 273 grp->il_inpsiz = size; 274 return (grp); 275 } 276 277 static void 278 in_pcblbgroup_free_deferred(epoch_context_t ctx) 279 { 280 struct inpcblbgroup *grp; 281 282 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 283 crfree(grp->il_cred); 284 free(grp, M_PCB); 285 } 286 287 static void 288 in_pcblbgroup_free(struct inpcblbgroup *grp) 289 { 290 KASSERT(LIST_EMPTY(&grp->il_pending), 291 ("local group %p still has pending inps", grp)); 292 293 CK_LIST_REMOVE(grp, il_list); 294 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 295 } 296 297 static struct 
inpcblbgroup * 298 in_pcblbgroup_find(struct inpcb *inp) 299 { 300 struct inpcbinfo *pcbinfo; 301 struct inpcblbgroup *grp; 302 struct inpcblbgrouphead *hdr; 303 304 INP_LOCK_ASSERT(inp); 305 306 pcbinfo = inp->inp_pcbinfo; 307 INP_HASH_LOCK_ASSERT(pcbinfo); 308 309 hdr = &pcbinfo->ipi_lbgrouphashbase[ 310 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 311 CK_LIST_FOREACH(grp, hdr, il_list) { 312 struct inpcb *inp1; 313 314 for (unsigned int i = 0; i < grp->il_inpcnt; i++) { 315 if (inp == grp->il_inp[i]) 316 goto found; 317 } 318 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 319 if (inp == inp1) 320 goto found; 321 } 322 } 323 found: 324 return (grp); 325 } 326 327 static void 328 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) 329 { 330 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 331 ("invalid local group size %d and count %d", grp->il_inpsiz, 332 grp->il_inpcnt)); 333 INP_WLOCK_ASSERT(inp); 334 335 if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp && 336 !SOLISTENING(inp->inp_socket)) { 337 /* 338 * If this is a TCP socket, it should not be visible to lbgroup 339 * lookups until listen() has been called. 340 */ 341 LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list); 342 grp->il_pendcnt++; 343 } else { 344 grp->il_inp[grp->il_inpcnt] = inp; 345 346 /* 347 * Synchronize with in_pcblookup_lbgroup(): make sure that we 348 * don't expose a null slot to the lookup path. 
349 */ 350 atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); 351 } 352 353 inp->inp_flags |= INP_INLBGROUP; 354 } 355 356 static struct inpcblbgroup * 357 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 358 struct inpcblbgroup *old_grp, int size) 359 { 360 struct inpcblbgroup *grp; 361 int i; 362 363 grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, 364 old_grp->il_lport, &old_grp->il_dependladdr, size, 365 old_grp->il_numa_domain, old_grp->il_fibnum); 366 if (grp == NULL) 367 return (NULL); 368 369 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 370 ("invalid new local group size %d and old local group count %d", 371 grp->il_inpsiz, old_grp->il_inpcnt)); 372 373 for (i = 0; i < old_grp->il_inpcnt; ++i) 374 grp->il_inp[i] = old_grp->il_inp[i]; 375 grp->il_inpcnt = old_grp->il_inpcnt; 376 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 377 LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb, 378 inp_lbgroup_list); 379 grp->il_pendcnt = old_grp->il_pendcnt; 380 old_grp->il_pendcnt = 0; 381 in_pcblbgroup_free(old_grp); 382 return (grp); 383 } 384 385 /* 386 * Add PCB to load balance group for SO_REUSEPORT_LB option. 387 */ 388 static int 389 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 390 { 391 const static struct timeval interval = { 60, 0 }; 392 static struct timeval lastprint; 393 struct inpcbinfo *pcbinfo; 394 struct inpcblbgrouphead *hdr; 395 struct inpcblbgroup *grp; 396 uint32_t idx; 397 int fib; 398 399 pcbinfo = inp->inp_pcbinfo; 400 401 INP_WLOCK_ASSERT(inp); 402 INP_HASH_WLOCK_ASSERT(pcbinfo); 403 404 fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ? 405 inp->inp_inc.inc_fibnum : RT_ALL_FIBS; 406 407 #ifdef INET6 408 /* 409 * Don't allow IPv4 mapped INET6 wild socket. 
410 */ 411 if ((inp->inp_vflag & INP_IPV4) && 412 inp->inp_laddr.s_addr == INADDR_ANY && 413 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 414 return (0); 415 } 416 #endif 417 418 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 419 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 420 CK_LIST_FOREACH(grp, hdr, il_list) { 421 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 422 grp->il_vflag == inp->inp_vflag && 423 grp->il_lport == inp->inp_lport && 424 grp->il_numa_domain == numa_domain && 425 grp->il_fibnum == fib && 426 memcmp(&grp->il_dependladdr, 427 &inp->inp_inc.inc_ie.ie_dependladdr, 428 sizeof(grp->il_dependladdr)) == 0) { 429 break; 430 } 431 } 432 if (grp == NULL) { 433 /* Create new load balance group. */ 434 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, 435 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 436 INPCBLBGROUP_SIZMIN, numa_domain, fib); 437 if (grp == NULL) 438 return (ENOMEM); 439 in_pcblbgroup_insert(grp, inp); 440 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 441 } else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) { 442 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 443 if (ratecheck(&lastprint, &interval)) 444 printf("lb group port %d, limit reached\n", 445 ntohs(grp->il_lport)); 446 return (0); 447 } 448 449 /* Expand this local group. */ 450 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 451 if (grp == NULL) 452 return (ENOMEM); 453 in_pcblbgroup_insert(grp, inp); 454 } else { 455 in_pcblbgroup_insert(grp, inp); 456 } 457 return (0); 458 } 459 460 /* 461 * Remove PCB from load balance group. 
462 */ 463 static void 464 in_pcbremlbgrouphash(struct inpcb *inp) 465 { 466 struct inpcbinfo *pcbinfo; 467 struct inpcblbgrouphead *hdr; 468 struct inpcblbgroup *grp; 469 struct inpcb *inp1; 470 int i; 471 472 pcbinfo = inp->inp_pcbinfo; 473 474 INP_WLOCK_ASSERT(inp); 475 MPASS(inp->inp_flags & INP_INLBGROUP); 476 INP_HASH_WLOCK_ASSERT(pcbinfo); 477 478 hdr = &pcbinfo->ipi_lbgrouphashbase[ 479 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 480 CK_LIST_FOREACH(grp, hdr, il_list) { 481 for (i = 0; i < grp->il_inpcnt; ++i) { 482 if (grp->il_inp[i] != inp) 483 continue; 484 485 if (grp->il_inpcnt == 1 && 486 LIST_EMPTY(&grp->il_pending)) { 487 /* We are the last, free this local group. */ 488 in_pcblbgroup_free(grp); 489 } else { 490 grp->il_inp[i] = 491 grp->il_inp[grp->il_inpcnt - 1]; 492 493 /* 494 * Synchronize with in_pcblookup_lbgroup(). 495 */ 496 atomic_store_rel_int(&grp->il_inpcnt, 497 grp->il_inpcnt - 1); 498 } 499 inp->inp_flags &= ~INP_INLBGROUP; 500 return; 501 } 502 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 503 if (inp == inp1) { 504 LIST_REMOVE(inp, inp_lbgroup_list); 505 grp->il_pendcnt--; 506 inp->inp_flags &= ~INP_INLBGROUP; 507 return; 508 } 509 } 510 } 511 __assert_unreachable(); 512 } 513 514 int 515 in_pcblbgroup_numa(struct inpcb *inp, int arg) 516 { 517 struct inpcbinfo *pcbinfo; 518 int error; 519 uint8_t numa_domain; 520 521 switch (arg) { 522 case TCP_REUSPORT_LB_NUMA_NODOM: 523 numa_domain = M_NODOM; 524 break; 525 case TCP_REUSPORT_LB_NUMA_CURDOM: 526 numa_domain = PCPU_GET(domain); 527 break; 528 default: 529 if (arg < 0 || arg >= vm_ndomains) 530 return (EINVAL); 531 numa_domain = arg; 532 } 533 534 pcbinfo = inp->inp_pcbinfo; 535 INP_WLOCK_ASSERT(inp); 536 INP_HASH_WLOCK(pcbinfo); 537 if (in_pcblbgroup_find(inp) != NULL) { 538 /* Remove it from the old group. */ 539 in_pcbremlbgrouphash(inp); 540 /* Add it to the new group based on numa domain. 
*/ 541 in_pcbinslbgrouphash(inp, numa_domain); 542 error = 0; 543 } else { 544 error = ENOENT; 545 } 546 INP_HASH_WUNLOCK(pcbinfo); 547 return (error); 548 } 549 550 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 551 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 552 553 /* 554 * Initialize an inpcbinfo - a per-VNET instance of connections db. 555 */ 556 void 557 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 558 u_int hash_nelements, u_int porthash_nelements) 559 { 560 561 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 562 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 563 NULL, MTX_DEF); 564 #ifdef VIMAGE 565 pcbinfo->ipi_vnet = curvnet; 566 #endif 567 CK_LIST_INIT(&pcbinfo->ipi_listhead); 568 pcbinfo->ipi_count = 0; 569 pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, 570 &pcbinfo->ipi_hashmask); 571 pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, 572 &pcbinfo->ipi_hashmask); 573 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 574 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 575 &pcbinfo->ipi_porthashmask); 576 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 577 &pcbinfo->ipi_lbgrouphashmask); 578 pcbinfo->ipi_zone = pcbstor->ips_zone; 579 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 580 } 581 582 /* 583 * Destroy an inpcbinfo. 
584 */ 585 void 586 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 587 { 588 589 KASSERT(pcbinfo->ipi_count == 0, 590 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 591 592 hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); 593 hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); 594 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 595 pcbinfo->ipi_porthashmask); 596 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 597 pcbinfo->ipi_lbgrouphashmask); 598 mtx_destroy(&pcbinfo->ipi_hash_lock); 599 mtx_destroy(&pcbinfo->ipi_lock); 600 } 601 602 /* 603 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 604 */ 605 static void inpcb_fini(void *, int); 606 void 607 in_pcbstorage_init(void *arg) 608 { 609 struct inpcbstorage *pcbstor = arg; 610 611 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 612 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 613 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 614 } 615 616 /* 617 * Destroy a pcbstorage - used by unloadable protocols. 618 */ 619 void 620 in_pcbstorage_destroy(void *arg) 621 { 622 struct inpcbstorage *pcbstor = arg; 623 624 uma_zdestroy(pcbstor->ips_zone); 625 } 626 627 /* 628 * Allocate a PCB and associate it with the socket. 629 * On success return with the PCB locked. 
630 */ 631 int 632 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 633 { 634 struct inpcb *inp; 635 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 636 int error; 637 #endif 638 639 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 640 if (inp == NULL) 641 return (ENOBUFS); 642 bzero(&inp->inp_start_zero, inp_zero_size); 643 #ifdef NUMA 644 inp->inp_numa_domain = M_NODOM; 645 #endif 646 inp->inp_pcbinfo = pcbinfo; 647 inp->inp_socket = so; 648 inp->inp_cred = crhold(so->so_cred); 649 inp->inp_inc.inc_fibnum = so->so_fibnum; 650 #ifdef MAC 651 error = mac_inpcb_init(inp, M_NOWAIT); 652 if (error != 0) 653 goto out; 654 mac_inpcb_create(so, inp); 655 #endif 656 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 657 error = ipsec_init_pcbpolicy(inp); 658 if (error != 0) { 659 #ifdef MAC 660 mac_inpcb_destroy(inp); 661 #endif 662 goto out; 663 } 664 #endif /*IPSEC*/ 665 #ifdef INET6 666 if (INP_SOCKAF(so) == AF_INET6) { 667 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 668 if (V_ip6_v6only) 669 inp->inp_flags |= IN6P_IPV6_V6ONLY; 670 #ifdef INET 671 else 672 inp->inp_vflag |= INP_IPV4; 673 #endif 674 if (V_ip6_auto_flowlabel) 675 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 676 inp->in6p_hops = -1; /* use kernel default */ 677 } 678 #endif 679 #if defined(INET) && defined(INET6) 680 else 681 #endif 682 #ifdef INET 683 inp->inp_vflag |= INP_IPV4; 684 #endif 685 inp->inp_smr = SMR_SEQ_INVALID; 686 687 /* 688 * Routes in inpcb's can cache L2 as well; they are guaranteed 689 * to be cleaned up. 690 */ 691 inp->inp_route.ro_flags = RT_LLE_CACHE; 692 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 693 INP_WLOCK(inp); 694 INP_INFO_WLOCK(pcbinfo); 695 pcbinfo->ipi_count++; 696 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 697 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 698 INP_INFO_WUNLOCK(pcbinfo); 699 so->so_pcb = inp; 700 701 return (0); 702 703 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 704 out: 705 crfree(inp->inp_cred); 706 #ifdef INVARIANTS 707 inp->inp_cred = NULL; 708 #endif 709 uma_zfree_smr(pcbinfo->ipi_zone, inp); 710 return (error); 711 #endif 712 } 713 714 #ifdef INET 715 int 716 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags, 717 struct ucred *cred) 718 { 719 int anonport, error; 720 721 KASSERT(sin == NULL || sin->sin_family == AF_INET, 722 ("%s: invalid address family for %p", __func__, sin)); 723 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), 724 ("%s: invalid address length for %p", __func__, sin)); 725 INP_WLOCK_ASSERT(inp); 726 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 727 728 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 729 return (EINVAL); 730 anonport = sin == NULL || sin->sin_port == 0; 731 error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, 732 &inp->inp_lport, flags, cred); 733 if (error) 734 return (error); 735 if (__predict_false((error = in_pcbinshash(inp)) != 0)) { 736 MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB); 737 inp->inp_laddr.s_addr = INADDR_ANY; 738 inp->inp_lport = 0; 739 inp->inp_flags &= ~INP_BOUNDFIB; 740 return (error); 741 } 742 if (anonport) 743 inp->inp_flags |= INP_ANONPORT; 744 return (0); 745 } 746 #endif 747 748 #if defined(INET) || defined(INET6) 749 /* 750 * Assign a local port like in_pcb_lport(), but also used with connect() 751 * and a foreign address and port. If fsa is non-NULL, choose a local port 752 * that is unused with those, otherwise one that is completely unused. 753 * lsa can be NULL for IPv6. 
754 */ 755 int 756 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, 757 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, 758 int lookupflags) 759 { 760 struct inpcbinfo *pcbinfo; 761 struct inpcb *tmpinp; 762 unsigned short *lastport; 763 int count, error; 764 u_short aux, first, last, lport; 765 #ifdef INET 766 struct in_addr laddr, faddr; 767 #endif 768 #ifdef INET6 769 struct in6_addr *laddr6, *faddr6; 770 #endif 771 772 pcbinfo = inp->inp_pcbinfo; 773 774 /* 775 * Because no actual state changes occur here, a global write lock on 776 * the pcbinfo isn't required. 777 */ 778 INP_LOCK_ASSERT(inp); 779 INP_HASH_LOCK_ASSERT(pcbinfo); 780 781 if (inp->inp_flags & INP_HIGHPORT) { 782 first = V_ipport_hifirstauto; /* sysctl */ 783 last = V_ipport_hilastauto; 784 lastport = &pcbinfo->ipi_lasthi; 785 } else if (inp->inp_flags & INP_LOWPORT) { 786 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 787 if (error) 788 return (error); 789 first = V_ipport_lowfirstauto; /* 1023 */ 790 last = V_ipport_lowlastauto; /* 600 */ 791 lastport = &pcbinfo->ipi_lastlow; 792 } else { 793 first = V_ipport_firstauto; /* sysctl */ 794 last = V_ipport_lastauto; 795 lastport = &pcbinfo->ipi_lastport; 796 } 797 798 /* 799 * Instead of having two loops further down counting up or down 800 * make sure that first is always <= last and go with only one 801 * code path implementing all logic. 
802 */ 803 if (first > last) { 804 aux = first; 805 first = last; 806 last = aux; 807 } 808 809 #ifdef INET 810 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 811 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 812 if (lsa != NULL) 813 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 814 if (fsa != NULL) 815 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 816 } 817 #endif 818 #ifdef INET6 819 laddr6 = NULL; 820 if ((inp->inp_vflag & INP_IPV6) != 0) { 821 if (lsa != NULL) 822 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 823 if (fsa != NULL) 824 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 825 } 826 #endif 827 828 tmpinp = NULL; 829 830 if (V_ipport_randomized) 831 *lastport = first + (arc4random() % (last - first)); 832 833 count = last - first; 834 835 do { 836 if (count-- < 0) /* completely used? */ 837 return (EADDRNOTAVAIL); 838 ++*lastport; 839 if (*lastport < first || *lastport > last) 840 *lastport = first; 841 lport = htons(*lastport); 842 843 if (fsa != NULL) { 844 #ifdef INET 845 if (lsa->sa_family == AF_INET) { 846 tmpinp = in_pcblookup_hash_locked(pcbinfo, 847 faddr, fport, laddr, lport, lookupflags, 848 M_NODOM, RT_ALL_FIBS); 849 } 850 #endif 851 #ifdef INET6 852 if (lsa->sa_family == AF_INET6) { 853 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 854 faddr6, fport, laddr6, lport, lookupflags, 855 M_NODOM, RT_ALL_FIBS); 856 } 857 #endif 858 } else { 859 #ifdef INET6 860 if ((inp->inp_vflag & INP_IPV6) != 0) { 861 tmpinp = in6_pcblookup_local(pcbinfo, 862 &inp->in6p_laddr, lport, RT_ALL_FIBS, 863 lookupflags, cred); 864 #ifdef INET 865 if (tmpinp == NULL && 866 (inp->inp_vflag & INP_IPV4)) 867 tmpinp = in_pcblookup_local(pcbinfo, 868 laddr, lport, RT_ALL_FIBS, 869 lookupflags, cred); 870 #endif 871 } 872 #endif 873 #if defined(INET) && defined(INET6) 874 else 875 #endif 876 #ifdef INET 877 tmpinp = in_pcblookup_local(pcbinfo, laddr, 878 lport, RT_ALL_FIBS, lookupflags, cred); 879 #endif 880 } 881 } while (tmpinp != NULL); 
882 883 *lportp = lport; 884 885 return (0); 886 } 887 888 /* 889 * Select a local port (number) to use. 890 */ 891 int 892 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 893 struct ucred *cred, int lookupflags) 894 { 895 struct sockaddr_in laddr; 896 897 if (laddrp) { 898 bzero(&laddr, sizeof(laddr)); 899 laddr.sin_family = AF_INET; 900 laddr.sin_addr = *laddrp; 901 } 902 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 903 NULL, lportp, NULL, 0, cred, lookupflags)); 904 } 905 #endif /* INET || INET6 */ 906 907 #ifdef INET 908 /* 909 * Determine whether the inpcb can be bound to the specified address/port tuple. 910 */ 911 static int 912 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 913 const u_short lport, const int fib, int sooptions, int lookupflags, 914 struct ucred *cred) 915 { 916 int reuseport, reuseport_lb; 917 918 INP_LOCK_ASSERT(inp); 919 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 920 921 reuseport = (sooptions & SO_REUSEPORT); 922 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 923 924 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 925 /* 926 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 927 * allow complete duplication of binding if 928 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 929 * and a multicast address is bound on both 930 * new and duplicated sockets. 931 */ 932 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 933 reuseport = SO_REUSEADDR | SO_REUSEPORT; 934 /* 935 * XXX: How to deal with SO_REUSEPORT_LB here? 936 * Treat same as SO_REUSEPORT for now. 937 */ 938 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 939 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 940 } else if (!in_nullhost(laddr)) { 941 struct sockaddr_in sin; 942 943 memset(&sin, 0, sizeof(sin)); 944 sin.sin_family = AF_INET; 945 sin.sin_len = sizeof(sin); 946 sin.sin_addr = laddr; 947 948 /* 949 * Is the address a local IP address? 
950 * If INP_BINDANY is set, then the socket may be bound 951 * to any endpoint address, local or not. 952 */ 953 if ((inp->inp_flags & INP_BINDANY) == 0 && 954 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 955 return (EADDRNOTAVAIL); 956 } 957 958 if (lport != 0) { 959 struct inpcb *t; 960 961 if (ntohs(lport) <= V_ipport_reservedhigh && 962 ntohs(lport) >= V_ipport_reservedlow && 963 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 964 return (EACCES); 965 966 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 967 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 968 /* 969 * If a socket owned by a different user is already 970 * bound to this port, fail. In particular, SO_REUSE* 971 * can only be used to share a port among sockets owned 972 * by the same user. 973 * 974 * However, we can share a port with a connected socket 975 * which has a unique 4-tuple. 976 */ 977 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 978 RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); 979 if (t != NULL && 980 (inp->inp_socket->so_type != SOCK_STREAM || 981 in_nullhost(t->inp_faddr)) && 982 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 983 return (EADDRINUSE); 984 } 985 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, 986 lookupflags, cred); 987 if (t != NULL && ((reuseport | reuseport_lb) & 988 t->inp_socket->so_options) == 0) { 989 #ifdef INET6 990 if (!in_nullhost(laddr) || 991 !in_nullhost(t->inp_laddr) || 992 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 993 (t->inp_vflag & INP_IPV6PROTO) == 0) 994 #endif 995 return (EADDRINUSE); 996 } 997 } 998 return (0); 999 } 1000 1001 /* 1002 * Set up a bind operation on a PCB, performing port allocation 1003 * as required, but do not actually modify the PCB. Callers can 1004 * either complete the bind by setting inp_laddr/inp_lport and 1005 * calling in_pcbinshash(), or they can just use the resulting 1006 * port and address to authorise the sending of a once-off packet. 
1007 * 1008 * On error, the values of *laddrp and *lportp are not changed. 1009 */ 1010 int 1011 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, 1012 u_short *lportp, int flags, struct ucred *cred) 1013 { 1014 struct socket *so = inp->inp_socket; 1015 struct in_addr laddr; 1016 u_short lport = 0; 1017 int error, fib, lookupflags, sooptions; 1018 1019 /* 1020 * No state changes, so read locks are sufficient here. 1021 */ 1022 INP_LOCK_ASSERT(inp); 1023 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1024 1025 laddr.s_addr = *laddrp; 1026 if (sin != NULL && laddr.s_addr != INADDR_ANY) 1027 return (EINVAL); 1028 1029 lookupflags = 0; 1030 sooptions = atomic_load_int(&so->so_options); 1031 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) 1032 lookupflags = INPLOOKUP_WILDCARD; 1033 if (sin == NULL) { 1034 if ((error = prison_local_ip4(cred, &laddr)) != 0) 1035 return (error); 1036 } else { 1037 KASSERT(sin->sin_family == AF_INET, 1038 ("%s: invalid family for address %p", __func__, sin)); 1039 KASSERT(sin->sin_len == sizeof(*sin), 1040 ("%s: invalid length for address %p", __func__, sin)); 1041 1042 error = prison_local_ip4(cred, &sin->sin_addr); 1043 if (error) 1044 return (error); 1045 if (sin->sin_port != *lportp) { 1046 /* Don't allow the port to change. */ 1047 if (*lportp != 0) 1048 return (EINVAL); 1049 lport = sin->sin_port; 1050 } 1051 laddr = sin->sin_addr; 1052 1053 fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum : 1054 RT_ALL_FIBS; 1055 1056 /* See if this address/port combo is available. 
*/ 1057 error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions, 1058 lookupflags, cred); 1059 if (error != 0) 1060 return (error); 1061 } 1062 if (*lportp != 0) 1063 lport = *lportp; 1064 if (lport == 0) { 1065 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1066 if (error != 0) 1067 return (error); 1068 } 1069 *laddrp = laddr.s_addr; 1070 *lportp = lport; 1071 if ((flags & INPBIND_FIB) != 0) 1072 inp->inp_flags |= INP_BOUNDFIB; 1073 return (0); 1074 } 1075 1076 /* 1077 * Connect from a socket to a specified address. 1078 * Both address and port must be specified in argument sin. 1079 * If don't have a local address for this socket yet, 1080 * then pick one. 1081 */ 1082 int 1083 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 1084 { 1085 struct in_addr laddr, faddr; 1086 u_short lport; 1087 int error; 1088 bool anonport; 1089 1090 INP_WLOCK_ASSERT(inp); 1091 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1092 KASSERT(in_nullhost(inp->inp_faddr), 1093 ("%s: inp is already connected", __func__)); 1094 KASSERT(sin->sin_family == AF_INET, 1095 ("%s: invalid address family for %p", __func__, sin)); 1096 KASSERT(sin->sin_len == sizeof(*sin), 1097 ("%s: invalid address length for %p", __func__, sin)); 1098 1099 if (sin->sin_port == 0) 1100 return (EADDRNOTAVAIL); 1101 1102 anonport = (inp->inp_lport == 0); 1103 1104 if (__predict_false(in_broadcast(sin->sin_addr))) { 1105 if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead)) 1106 return (ENETUNREACH); 1107 /* 1108 * If the destination address is INADDR_ANY, use the primary 1109 * local address. If the supplied address is INADDR_BROADCAST, 1110 * and the primary interface supports broadcast, choose the 1111 * broadcast address for that interface. 
1112 */ 1113 if (in_nullhost(sin->sin_addr)) { 1114 faddr = 1115 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1116 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1117 return (error); 1118 } else if (sin->sin_addr.s_addr == INADDR_BROADCAST && 1119 CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags 1120 & IFF_BROADCAST) { 1121 faddr = satosin(&CK_STAILQ_FIRST( 1122 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1123 } else 1124 faddr = sin->sin_addr; 1125 } else 1126 faddr = sin->sin_addr; 1127 1128 if (in_nullhost(inp->inp_laddr)) { 1129 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1130 if (error) 1131 return (error); 1132 } else 1133 laddr = inp->inp_laddr; 1134 1135 if (anonport) { 1136 struct sockaddr_in lsin = { 1137 .sin_family = AF_INET, 1138 .sin_addr = laddr, 1139 }; 1140 struct sockaddr_in fsin = { 1141 .sin_family = AF_INET, 1142 .sin_addr = faddr, 1143 }; 1144 1145 error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, 1146 &lport, (struct sockaddr *)&fsin, sin->sin_port, cred, 1147 INPLOOKUP_WILDCARD); 1148 if (error) 1149 return (error); 1150 } else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1151 sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) != 1152 NULL) 1153 return (EADDRINUSE); 1154 else 1155 lport = inp->inp_lport; 1156 1157 MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 || 1158 !(inp->inp_flags & INP_INHASHLIST)); 1159 1160 inp->inp_faddr = faddr; 1161 inp->inp_fport = sin->sin_port; 1162 inp->inp_laddr = laddr; 1163 inp->inp_lport = lport; 1164 1165 if ((inp->inp_flags & INP_INHASHLIST) == 0) { 1166 error = in_pcbinshash(inp); 1167 MPASS(error == 0); 1168 } else 1169 in_pcbrehash(inp); 1170 #ifdef ROUTE_MPATH 1171 if (CALC_FLOWID_OUTBOUND) { 1172 uint32_t hash_val, hash_type; 1173 1174 hash_val = fib4_calc_software_hash(inp->inp_laddr, 1175 inp->inp_faddr, 0, sin->sin_port, 1176 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1177 1178 inp->inp_flowid = hash_val; 1179 inp->inp_flowtype = 
hash_type; 1180 } 1181 #endif 1182 if (anonport) 1183 inp->inp_flags |= INP_ANONPORT; 1184 return (0); 1185 } 1186 1187 /* 1188 * Do proper source address selection on an unbound socket in case 1189 * of connect. Take jails into account as well. 1190 */ 1191 int 1192 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, 1193 struct in_addr *laddr, struct ucred *cred) 1194 { 1195 struct ifaddr *ifa; 1196 struct sockaddr *sa; 1197 struct sockaddr_in *sin, dst; 1198 struct nhop_object *nh; 1199 int error; 1200 1201 NET_EPOCH_ASSERT(); 1202 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1203 1204 /* 1205 * Bypass source address selection and use the primary jail IP 1206 * if requested. 1207 */ 1208 if (!prison_saddrsel_ip4(cred, laddr)) 1209 return (0); 1210 1211 /* 1212 * If the destination address is multicast and an outgoing 1213 * interface has been set as a multicast option, prefer the 1214 * address of that interface as our source address. 1215 */ 1216 if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL && 1217 inp->inp_moptions->imo_multicast_ifp != NULL) { 1218 struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp; 1219 struct in_ifaddr *ia; 1220 1221 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1222 if (ia->ia_ifp == ifp && 1223 prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) 1224 break; 1225 } 1226 if (ia == NULL) 1227 return (EADDRNOTAVAIL); 1228 *laddr = ia->ia_addr.sin_addr; 1229 return (0); 1230 } 1231 1232 error = 0; 1233 1234 nh = NULL; 1235 bzero(&dst, sizeof(dst)); 1236 sin = &dst; 1237 sin->sin_family = AF_INET; 1238 sin->sin_len = sizeof(struct sockaddr_in); 1239 sin->sin_addr.s_addr = faddr->s_addr; 1240 1241 /* 1242 * If route is known our src addr is taken from the i/f, 1243 * else punt. 1244 * 1245 * Find out route to destination. 
1246 */ 1247 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1248 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1249 0, NHR_NONE, 0); 1250 1251 /* 1252 * If we found a route, use the address corresponding to 1253 * the outgoing interface. 1254 * 1255 * Otherwise assume faddr is reachable on a directly connected 1256 * network and try to find a corresponding interface to take 1257 * the source address from. 1258 */ 1259 if (nh == NULL || nh->nh_ifp == NULL) { 1260 struct in_ifaddr *ia; 1261 struct ifnet *ifp; 1262 1263 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1264 inp->inp_socket->so_fibnum)); 1265 if (ia == NULL) { 1266 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1267 inp->inp_socket->so_fibnum)); 1268 } 1269 if (ia == NULL) { 1270 error = ENETUNREACH; 1271 goto done; 1272 } 1273 1274 if (!prison_flag(cred, PR_IP4)) { 1275 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1276 goto done; 1277 } 1278 1279 ifp = ia->ia_ifp; 1280 ia = NULL; 1281 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1282 sa = ifa->ifa_addr; 1283 if (sa->sa_family != AF_INET) 1284 continue; 1285 sin = (struct sockaddr_in *)sa; 1286 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1287 ia = (struct in_ifaddr *)ifa; 1288 break; 1289 } 1290 } 1291 if (ia != NULL) { 1292 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1293 goto done; 1294 } 1295 1296 /* 3. As a last resort return the 'default' jail address. */ 1297 error = prison_get_ip4(cred, laddr); 1298 goto done; 1299 } 1300 1301 /* 1302 * If the outgoing interface on the route found is not 1303 * a loopback interface, use the address from that interface. 1304 * In case of jails do those three steps: 1305 * 1. check if the interface address belongs to the jail. If so use it. 1306 * 2. check if we have any address on the outgoing interface 1307 * belonging to this jail. If so use it. 1308 * 3. as a last resort return the 'default' jail address. 
1309 */ 1310 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1311 struct in_ifaddr *ia; 1312 struct ifnet *ifp; 1313 1314 /* If not jailed, use the default returned. */ 1315 if (!prison_flag(cred, PR_IP4)) { 1316 ia = (struct in_ifaddr *)nh->nh_ifa; 1317 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1318 goto done; 1319 } 1320 1321 /* Jailed. */ 1322 /* 1. Check if the iface address belongs to the jail. */ 1323 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1324 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1325 ia = (struct in_ifaddr *)nh->nh_ifa; 1326 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1327 goto done; 1328 } 1329 1330 /* 1331 * 2. Check if we have any address on the outgoing interface 1332 * belonging to this jail. 1333 */ 1334 ia = NULL; 1335 ifp = nh->nh_ifp; 1336 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1337 sa = ifa->ifa_addr; 1338 if (sa->sa_family != AF_INET) 1339 continue; 1340 sin = (struct sockaddr_in *)sa; 1341 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1342 ia = (struct in_ifaddr *)ifa; 1343 break; 1344 } 1345 } 1346 if (ia != NULL) { 1347 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1348 goto done; 1349 } 1350 1351 /* 3. As a last resort return the 'default' jail address. */ 1352 error = prison_get_ip4(cred, laddr); 1353 goto done; 1354 } 1355 1356 /* 1357 * The outgoing interface is marked with 'loopback net', so a route 1358 * to ourselves is here. 1359 * Try to find the interface of the destination address and then 1360 * take the address from there. That interface is not necessarily 1361 * a loopback interface. 1362 * In case of jails, check that it is an address of the jail 1363 * and if we cannot find, fall back to the 'default' jail address. 
1364 */ 1365 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1366 struct in_ifaddr *ia; 1367 1368 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1369 inp->inp_socket->so_fibnum)); 1370 if (ia == NULL) 1371 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1372 inp->inp_socket->so_fibnum)); 1373 if (ia == NULL) 1374 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1375 1376 if (!prison_flag(cred, PR_IP4)) { 1377 if (ia == NULL) { 1378 error = ENETUNREACH; 1379 goto done; 1380 } 1381 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1382 goto done; 1383 } 1384 1385 /* Jailed. */ 1386 if (ia != NULL) { 1387 struct ifnet *ifp; 1388 1389 ifp = ia->ia_ifp; 1390 ia = NULL; 1391 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1392 sa = ifa->ifa_addr; 1393 if (sa->sa_family != AF_INET) 1394 continue; 1395 sin = (struct sockaddr_in *)sa; 1396 if (prison_check_ip4(cred, 1397 &sin->sin_addr) == 0) { 1398 ia = (struct in_ifaddr *)ifa; 1399 break; 1400 } 1401 } 1402 if (ia != NULL) { 1403 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1404 goto done; 1405 } 1406 } 1407 1408 /* 3. As a last resort return the 'default' jail address. */ 1409 error = prison_get_ip4(cred, laddr); 1410 goto done; 1411 } 1412 1413 done: 1414 if (error == 0 && laddr->s_addr == INADDR_ANY) 1415 return (EHOSTUNREACH); 1416 return (error); 1417 } 1418 1419 void 1420 in_pcbdisconnect(struct inpcb *inp) 1421 { 1422 1423 INP_WLOCK_ASSERT(inp); 1424 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1425 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 1426 ("%s: inp %p was already disconnected", __func__, inp)); 1427 1428 in_pcbremhash_locked(inp); 1429 1430 /* See the comment in in_pcbinshash(). 
*/ 1431 inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); 1432 inp->inp_laddr.s_addr = INADDR_ANY; 1433 inp->inp_faddr.s_addr = INADDR_ANY; 1434 inp->inp_fport = 0; 1435 } 1436 #endif /* INET */ 1437 1438 void 1439 in_pcblisten(struct inpcb *inp) 1440 { 1441 struct inpcblbgroup *grp; 1442 1443 INP_WLOCK_ASSERT(inp); 1444 1445 if ((inp->inp_flags & INP_INLBGROUP) != 0) { 1446 struct inpcbinfo *pcbinfo; 1447 1448 pcbinfo = inp->inp_pcbinfo; 1449 INP_HASH_WLOCK(pcbinfo); 1450 grp = in_pcblbgroup_find(inp); 1451 LIST_REMOVE(inp, inp_lbgroup_list); 1452 grp->il_pendcnt--; 1453 in_pcblbgroup_insert(grp, inp); 1454 INP_HASH_WUNLOCK(pcbinfo); 1455 } 1456 } 1457 1458 /* 1459 * inpcb hash lookups are protected by SMR section. 1460 * 1461 * Once desired pcb has been found, switching from SMR section to a pcb 1462 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1463 * here because SMR is a critical section. 1464 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1465 */ 1466 void 1467 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1468 { 1469 1470 lock == INPLOOKUP_RLOCKPCB ? 1471 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1472 } 1473 1474 void 1475 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1476 { 1477 1478 lock == INPLOOKUP_RLOCKPCB ? 1479 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1480 } 1481 1482 int 1483 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1484 { 1485 1486 return (lock == INPLOOKUP_RLOCKPCB ? 
1487 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1488 } 1489 1490 static inline bool 1491 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) 1492 { 1493 1494 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1495 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1496 1497 if (__predict_true(inp_trylock(inp, lock))) { 1498 if (__predict_false(inp->inp_flags & ignflags)) { 1499 smr_exit(inp->inp_pcbinfo->ipi_smr); 1500 inp_unlock(inp, lock); 1501 return (false); 1502 } 1503 smr_exit(inp->inp_pcbinfo->ipi_smr); 1504 return (true); 1505 } 1506 1507 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1508 smr_exit(inp->inp_pcbinfo->ipi_smr); 1509 inp_lock(inp, lock); 1510 if (__predict_false(in_pcbrele(inp, lock))) 1511 return (false); 1512 /* 1513 * inp acquired through refcount & lock for sure didn't went 1514 * through uma_zfree(). However, it may have already went 1515 * through in_pcbfree() and has another reference, that 1516 * prevented its release by our in_pcbrele(). 1517 */ 1518 if (__predict_false(inp->inp_flags & ignflags)) { 1519 inp_unlock(inp, lock); 1520 return (false); 1521 } 1522 return (true); 1523 } else { 1524 smr_exit(inp->inp_pcbinfo->ipi_smr); 1525 return (false); 1526 } 1527 } 1528 1529 bool 1530 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1531 { 1532 1533 /* 1534 * in_pcblookup() family of functions ignore not only freed entries, 1535 * that may be found due to lockless access to the hash, but dropped 1536 * entries, too. 1537 */ 1538 return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED)); 1539 } 1540 1541 /* 1542 * inp_next() - inpcb hash/list traversal iterator 1543 * 1544 * Requires initialized struct inpcb_iterator for context. 1545 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1546 * 1547 * - Iterator can have either write-lock or read-lock semantics, that can not 1548 * be changed later. 
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/* First element of the list selected by hash (all-pcbs list or one slot). */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of inp on the list selected by hash. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that inp is locked in the mode the iterator was created with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			/*
			 * _inp_smr_lock() leaves the SMR section whether it
			 * succeeds or not, so on failure the section has to
			 * be re-entered before the list is touched again.
			 */
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				/*
				 * NOTE(review): the scan is reset to the list
				 * head here, but the for-loop increment then
				 * advances past it, so the head itself is not
				 * examined on this pass - confirm this is
				 * intended.
				 */
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/* Still inside SMR only if nothing was locked. */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked by us and anchors the traversal. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/*
	 * Release the previous position and record the new one; inp is NULL
	 * here when the end of the list was reached.
	 */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 *
 * Returns false when other references remain; the inpcb is then still read
 * locked.  Returns true when this was the last reference; the inpcb has then
 * been unlocked and handed to uma_zfree_smr() and must not be touched.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* The last reference can only go away after in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Write-locked counterpart of in_pcbrele_rlocked(): same contract, but the
 * caller holds, and on a true return this function releases, the write lock.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Dispatch to in_pcbrele_rlocked() or in_pcbrele_wlocked() according to the
 * lock mode the caller holds.
 */
bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 *
 * The caller must hold the inpcb write lock; it is released on return.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark as freed and break the socket <-> inpcb linkage. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Saved locally: freed below, after the inpcb lock is released. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * in_pcbrele_wlocked() unlocks the inpcb itself when it frees it;
	 * if other references remain we have to unlock here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
1844 * 1845 * It is used by TCP to mark an inpcb as unused and avoid future packet 1846 * delivery or event notification when a socket remains open but TCP has 1847 * closed. This might occur as a result of a shutdown()-initiated TCP close 1848 * or a RST on the wire, and allows the port binding to be reused while still 1849 * maintaining the invariant that so_pcb always points to a valid inpcb until 1850 * in_pcbdetach(). 1851 * 1852 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1853 * in_pcbpurgeif0()? 1854 */ 1855 void 1856 in_pcbdrop(struct inpcb *inp) 1857 { 1858 1859 INP_WLOCK_ASSERT(inp); 1860 1861 inp->inp_flags |= INP_DROPPED; 1862 if (inp->inp_flags & INP_INHASHLIST) 1863 in_pcbremhash(inp); 1864 } 1865 1866 #ifdef INET 1867 /* 1868 * Common routines to return the socket addresses associated with inpcbs. 1869 */ 1870 int 1871 in_getsockaddr(struct socket *so, struct sockaddr *sa) 1872 { 1873 struct inpcb *inp; 1874 1875 inp = sotoinpcb(so); 1876 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1877 1878 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1879 .sin_len = sizeof(struct sockaddr_in), 1880 .sin_family = AF_INET, 1881 .sin_port = inp->inp_lport, 1882 .sin_addr = inp->inp_laddr, 1883 }; 1884 1885 return (0); 1886 } 1887 1888 int 1889 in_getpeeraddr(struct socket *so, struct sockaddr *sa) 1890 { 1891 struct inpcb *inp; 1892 1893 inp = sotoinpcb(so); 1894 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1895 1896 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1897 .sin_len = sizeof(struct sockaddr_in), 1898 .sin_family = AF_INET, 1899 .sin_port = inp->inp_fport, 1900 .sin_addr = inp->inp_faddr, 1901 }; 1902 1903 return (0); 1904 } 1905 1906 static bool 1907 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1908 { 1909 1910 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1911 return (true); 1912 else 1913 return (false); 1914 } 1915 1916 void 1917 in_pcbpurgeif0(struct inpcbinfo 
*pcbinfo, struct ifnet *ifp) 1918 { 1919 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1920 inp_v4_multi_match, NULL); 1921 struct inpcb *inp; 1922 struct in_multi *inm; 1923 struct in_mfilter *imf; 1924 struct ip_moptions *imo; 1925 1926 IN_MULTI_LOCK_ASSERT(); 1927 1928 while ((inp = inp_next(&inpi)) != NULL) { 1929 INP_WLOCK_ASSERT(inp); 1930 1931 imo = inp->inp_moptions; 1932 /* 1933 * Unselect the outgoing interface if it is being 1934 * detached. 1935 */ 1936 if (imo->imo_multicast_ifp == ifp) 1937 imo->imo_multicast_ifp = NULL; 1938 1939 /* 1940 * Drop multicast group membership if we joined 1941 * through the interface being detached. 1942 * 1943 * XXX This can all be deferred to an epoch_call 1944 */ 1945 restart: 1946 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 1947 if ((inm = imf->imf_inm) == NULL) 1948 continue; 1949 if (inm->inm_ifp != ifp) 1950 continue; 1951 ip_mfilter_remove(&imo->imo_head, imf); 1952 in_leavegroup_locked(inm, NULL); 1953 ip_mfilter_free(imf); 1954 goto restart; 1955 } 1956 } 1957 } 1958 1959 /* 1960 * Lookup a PCB based on the local address and port. Caller must hold the 1961 * hash lock. No inpcb locks or references are acquired. 
 *
 * Without INPLOOKUP_WILDCARD only an unconnected PCB whose local address and
 * port equal laddr/lport exactly is returned.  With INPLOOKUP_WILDCARD every
 * PCB on the port is scored by how many wildcard substitutions are needed
 * and the one with the lowest score is returned.  In both modes candidates
 * must satisfy prison_equal_ip4() against cred and, unless fib is
 * RT_ALL_FIBS, be in the given FIB.
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
	/* Best score so far; initialized above any score a PCB can get. */
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs),
	    ("%s: invalid fib %d", __func__, fib));

	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport && (fib == RT_ALL_FIBS ||
			    inp->inp_inc.inc_fibnum == fib)) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbhead *porthash;
		struct inpcb *match = NULL;

		/*
		 * Port is in use by one or more PCBs. Look for best
		 * fit.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(inp, porthash, inp_portlist) {
			if (inp->inp_lport != lport)
				continue;
			if (!prison_equal_ip4(inp->inp_cred->cr_prison,
			    cred->cr_prison))
				continue;
			if (fib != RT_ALL_FIBS &&
			    inp->inp_inc.inc_fibnum != fib)
				continue;
			/* Score this candidate; lower is a closer match. */
			wildcard = 0;
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
			/*
			 * We never select the PCB that has INP_IPV6 flag and
			 * is bound to :: if we have another PCB which is bound
			 * to 0.0.0.0.  If a PCB has the INP_IPV6 flag, then we
			 * set its cost higher than IPv4 only PCBs.
			 *
			 * Note that the case only happens when a socket is
			 * bound to ::, under the condition that the use of the
			 * mapped address is allowed.
			 */
			if ((inp->inp_vflag & INP_IPV6) != 0)
				wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY)
				wildcard++;
			if (inp->inp_laddr.s_addr != INADDR_ANY) {
				if (laddr.s_addr == INADDR_ANY)
					wildcard++;
				else if (inp->inp_laddr.s_addr != laddr.s_addr)
					continue;
			} else {
				if (laddr.s_addr != INADDR_ANY)
					wildcard++;
			}
			if (wildcard < matchwild) {
				match = inp;
				matchwild = wildcard;
				/* A score of zero cannot be improved upon. */
				if (matchwild == 0)
					break;
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * Return true if the load-balancing group is acceptable for the requested
 * NUMA domain and FIB.  M_NODOM and RT_ALL_FIBS act as "don't care" values.
 */
static bool
in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib)
{
	return ((domain == M_NODOM || domain == grp->il_numa_domain) &&
	    (fib == RT_ALL_FIBS || fib == grp->il_fibnum));
}

/*
 * Find the load-balancing group that best matches laddr/lport and pick one
 * of its member inpcbs by hashing the packet's foreign address and ports.
 * Returns NULL if no group matches or the chosen group has no members.
 * No inpcb lock or reference is acquired here.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain, int fib)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* A jailed group is only eligible for its jail's addresses. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		/*
		 * Within each category the first group seen is kept unless a
		 * later one also matches the requested domain and FIB.
		 */
		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_match(grp, domain, fib))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_match(grp, domain, fib))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_match(grp, domain, fib)) {
				local_wild = grp;
			}
		}
	}

	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}

/*
 * Return true if inp is an IPv4 inpcb whose full 4-tuple equals the given
 * foreign/local address and port.
 */
static bool
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (inp->inp_faddr.s_addr == faddr.s_addr &&
	    inp->inp_laddr.s_addr == laddr.s_addr &&
	    inp->inp_fport == fport &&
	    inp->inp_lport == lport)
		return (true);
	return (false);
}

/*
 * Walk the exact-match hash chain for the 4-tuple and return the first
 * matching inpcb, or NULL.  The result is neither locked nor referenced.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}

/* Result of comparing an unconnected inpcb against a local address/port. */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,	/* Not a candidate. */
	INPLOOKUP_MATCH_WILD = 1,	/* inpcb is bound to INADDR_ANY. */
	INPLOOKUP_MATCH_LADDR = 2,	/* inpcb is bound to laddr itself. */
} inp_lookup_match_t;

/*
 * Classify how an unconnected IPv4 inpcb matches laddr/lport in the given
 * FIB (RT_ALL_FIBS matches any FIB).
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport, int fib)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}

/* Sentinel: the lockless lookup was inconclusive and must be retried. */
#define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)

/*
 * Lockless wildcard lookup, performed inside the pcbinfo SMR section.
 *
 * Only the first candidate on the hash chain is considered.  Returns that
 * inpcb locked per lockflags if it still matches after locking and its
 * prison accepts laddr; NULL if the chain holds no candidate; or
 * INP_LOOKUP_AGAIN if the candidate could not be locked or no longer
 * qualifies.  inp_smr_lock() exits the SMR section, so the section has been
 * left whenever a candidate was found.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport, fib);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check under the lock: the pcb may have changed. */
			match = in_pcblookup_wild_match(inp, laddr, lport, fib);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}

static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int fib)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
2282 */ 2283 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2284 pcbinfo->ipi_hashmask)]; 2285 local_wild = local_exact = jail_wild = NULL; 2286 #ifdef INET6 2287 local_wild_mapped = NULL; 2288 #endif 2289 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2290 inp_lookup_match_t match; 2291 bool injail; 2292 2293 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2294 if (match == INPLOOKUP_MATCH_NONE) 2295 continue; 2296 2297 injail = prison_flag(inp->inp_cred, PR_IP4) != 0; 2298 if (injail) { 2299 if (prison_check_ip4_locked(inp->inp_cred->cr_prison, 2300 &laddr) != 0) 2301 continue; 2302 } else { 2303 if (local_exact != NULL) 2304 continue; 2305 } 2306 2307 if (match == INPLOOKUP_MATCH_LADDR) { 2308 if (injail) 2309 return (inp); 2310 local_exact = inp; 2311 } else { 2312 #ifdef INET6 2313 /* XXX inp locking, NULL check */ 2314 if (inp->inp_vflag & INP_IPV6PROTO) 2315 local_wild_mapped = inp; 2316 else 2317 #endif 2318 if (injail) 2319 jail_wild = inp; 2320 else 2321 local_wild = inp; 2322 } 2323 } 2324 if (jail_wild != NULL) 2325 return (jail_wild); 2326 if (local_exact != NULL) 2327 return (local_exact); 2328 if (local_wild != NULL) 2329 return (local_wild); 2330 #ifdef INET6 2331 if (local_wild_mapped != NULL) 2332 return (local_wild_mapped); 2333 #endif 2334 return (NULL); 2335 } 2336 2337 /* 2338 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2339 * that the caller has either locked the hash list, which usually happens 2340 * for bind(2) operations, or is in SMR section, which happens when sorting 2341 * out incoming packets. 
*/
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	/* Ports are handed in as u_int and narrowed to 16 bits here. */
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT(faddr.s_addr != INADDR_ANY,
	    ("%s: invalid foreign address", __func__));
	KASSERT(laddr.s_addr != INADDR_ANY,
	    ("%s: invalid local address", __func__));
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/* An exact 4-tuple match always wins. */
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL)
		return (inp);

	/*
	 * Otherwise, if wildcard matching was requested, try the
	 * load-balancing groups first and the wildcard hash second.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp == NULL) {
			inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr,
			    lport, fib);
		}
	}

	return (inp);
}

/*
 * Serialized lookup: take the hash write lock, look the PCB up and return
 * it locked according to the lock bits in "lookupflags".  Returns NULL if
 * nothing matched, or if the PCB could only be locked after dropping the
 * hash lock and in_pcbrele() then reported true.
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;

	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	INP_HASH_WLOCK(pcbinfo);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib);
	if (inp != NULL && !inp_trylock(inp, lockflags)) {
		/*
		 * The PCB lock is contended.  Hold a reference so the PCB
		 * stays valid, drop the hash lock, then block on the PCB lock.
		 * NOTE(review): in_pcbrele() returning true presumably means
		 * the PCB was freed in the meantime -- confirm.
		 */
		in_pcbref(inp);
		INP_HASH_WUNLOCK(pcbinfo);
		inp_lock(inp, lockflags);
		if (in_pcbrele(inp, lockflags))
			/* XXX-MJ or retry until we get a negative match? */
			inp = NULL;
	} else {
		INP_HASH_WUNLOCK(pcbinfo);
	}
	return (inp);
}

/*
 * Lockless lookup performed inside an SMR read section, falling back to the
 * serialized in_pcblookup_hash() whenever a candidate cannot be locked or no
 * longer matches once locked.  On success the PCB is returned locked per
 * the lock bits in "lookupflags".
 *
 * NOTE(review): the paths that return right after an inp_smr_lock() call do
 * not call smr_exit() themselves; presumably inp_smr_lock() leaves the SMR
 * section on both success and failure -- confirm against its definition.
 */
static struct inpcb *
in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    uint8_t numa_domain, int fib)
{
	struct inpcb *inp;
	const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK;
	const u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport);
	if (inp != NULL) {
		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/*
			 * Revalidate the 4-tuple, the socket could have been
			 * disconnected.
			 */
			if (__predict_true(in_pcblookup_exact_match(inp,
			    faddr, fport, laddr, lport)))
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * We failed to lock the inpcb, or its connection state changed
		 * out from under us.  Fall back to a precise search.
		 */
		return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
		    lookupflags, numa_domain, fib));
	}

	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport,
		    &laddr, lport, numa_domain, fib);
		if (inp != NULL) {
			if (__predict_true(inp_smr_lock(inp, lockflags))) {
				/* Re-check the match under the PCB lock. */
				if (__predict_true(in_pcblookup_wild_match(inp,
				    laddr, lport, fib) != INPLOOKUP_MATCH_NONE))
					return (inp);
				inp_unlock(inp, lockflags);
			}
			inp = INP_LOOKUP_AGAIN;
		} else {
			inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport,
			    fib, lockflags);
		}
		if (inp == INP_LOOKUP_AGAIN) {
			return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr,
			    lport, lookupflags, numa_domain, fib));
		}
	}

	/* Nothing was found and nothing was locked: leave the SMR section. */
	if (inp == NULL)
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}

/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which a pre-calculated hash value may be extracted.
 */

/*
 * Look up by 4-tuple with no NUMA preference (M_NODOM).  When INPLOOKUP_FIB
 * is set the search is restricted to the FIB of "ifp"; otherwise all FIBs
 * are searched.
 */
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
    struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp)
{
	int fib;

	fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS;
	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, M_NODOM, fib));
}

/*
 * Look up by 4-tuple, taking the NUMA domain and, when INPLOOKUP_FIB is
 * set, the FIB from the packet header of "m".  "ifp" is unused.
 */
struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp __unused, struct mbuf *m)
{
	int fib;

	M_ASSERTPKTHDR(m);
	fib = (lookupflags & INPLOOKUP_FIB) ? M_GETFIB(m) : RT_ALL_FIBS;
	return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, m->m_pkthdr.numa_domain, fib));
}
#endif /* INET */

/*
 * Return true when prison_flag() reports "flag" (PR_IP4 or PR_IP6 at the
 * call sites below) for the PCB's credential.
 */
static bool
in_pcbjailed(const struct inpcb *inp, unsigned int flag)
{
	return (prison_flag(inp->inp_cred, flag) != 0);
}

/*
 * Insert the PCB into a hash chain using ordering rules which ensure that
 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first.
 *
 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs
 * with exact local addresses ahead of wildcard PCBs.  Unbound v4-mapped v6 PCBs
 * always appear last no matter whether they are jailed.
 */
static void
_in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = inp->inp_laddr.s_addr != INADDR_ANY;
	/* Unbound PCB with INP_IPV6PROTO set: append at the tail. */
	if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) {
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
		/* The chain was empty. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}

	injail = in_pcbjailed(inp, PR_IP4);
	if (!injail) {
		/*
		 * Skip the leading jailed entries.  "last" stops on the first
		 * non-jailed entry; if every entry is jailed, append.
		 */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP4))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) {
		/* First jailed PCB on a chain that holds none yet. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/*
		 * An unbound PCB goes behind the entries that have a specific
		 * local address: advance "last" to the first entry bound to
		 * INADDR_ANY, or append if there is none.
		 */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (last->inp_laddr.s_addr == INADDR_ANY)
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* Insert in front of the entry the scans stopped on, if any. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}

#ifdef INET6
/*
 * See the comment above _in_pcbinshash_wild().
 *
 * Same insertion procedure for IPv6 PCBs: jail status is tested with PR_IP6,
 * "bound" means the local IPv6 address is specified, and there is no
 * append-at-tail special case.
 */
static void
_in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp)
{
	struct inpcb *last;
	bool bound, injail;

	INP_LOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	last = NULL;
	bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr);
	injail = in_pcbjailed(inp, PR_IP6);
	if (!injail) {
		/* Skip the leading jailed entries; append if all are jailed. */
		CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) {
			if (!in_pcbjailed(last, PR_IP6))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	} else if (!CK_LIST_EMPTY(pcbhash) &&
	    !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) {
		/* First jailed PCB on a chain that holds none yet. */
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
		return;
	}
	if (!bound) {
		/* Advance past the entries that have a specific address. */
		CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) {
			if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr))
				break;
			if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) {
				CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild);
				return;
			}
		}
	}
	/* Insert in front of the entry the scans stopped on, if any. */
	if (last == NULL)
		CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild);
	else
		CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild);
}
#endif

/*
 * Insert PCB onto various hash lists.
 *
 * With normal sockets this function shall not fail, so it could return void.
 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held,
 * that's the only condition when it can fail.
2615 */ 2616 int 2617 in_pcbinshash(struct inpcb *inp) 2618 { 2619 struct inpcbhead *pcbhash, *pcbporthash; 2620 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2621 uint32_t hash; 2622 bool connected; 2623 2624 INP_WLOCK_ASSERT(inp); 2625 INP_HASH_WLOCK_ASSERT(pcbinfo); 2626 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2627 ("in_pcbinshash: INP_INHASHLIST")); 2628 2629 #ifdef INET6 2630 if (inp->inp_vflag & INP_IPV6) { 2631 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2632 inp->inp_fport, pcbinfo->ipi_hashmask); 2633 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2634 } else 2635 #endif 2636 { 2637 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2638 inp->inp_fport, pcbinfo->ipi_hashmask); 2639 connected = !in_nullhost(inp->inp_faddr); 2640 } 2641 2642 if (connected) 2643 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2644 else 2645 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2646 2647 pcbporthash = &pcbinfo->ipi_porthashbase[ 2648 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2649 2650 /* 2651 * Add entry to load balance group. 2652 * Only do this if SO_REUSEPORT_LB is set. 2653 */ 2654 if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2655 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2656 if (error != 0) 2657 return (error); 2658 } 2659 2660 /* 2661 * The PCB may have been disconnected in the past. Before we can safely 2662 * make it visible in the hash table, we must wait for all readers which 2663 * may be traversing this PCB to finish. 
2664 */ 2665 if (inp->inp_smr != SMR_SEQ_INVALID) { 2666 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2667 inp->inp_smr = SMR_SEQ_INVALID; 2668 } 2669 2670 if (connected) 2671 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2672 else { 2673 #ifdef INET6 2674 if ((inp->inp_vflag & INP_IPV6) != 0) 2675 _in6_pcbinshash_wild(pcbhash, inp); 2676 else 2677 #endif 2678 _in_pcbinshash_wild(pcbhash, inp); 2679 } 2680 CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); 2681 inp->inp_flags |= INP_INHASHLIST; 2682 2683 return (0); 2684 } 2685 2686 void 2687 in_pcbremhash_locked(struct inpcb *inp) 2688 { 2689 2690 INP_WLOCK_ASSERT(inp); 2691 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2692 MPASS(inp->inp_flags & INP_INHASHLIST); 2693 2694 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2695 in_pcbremlbgrouphash(inp); 2696 #ifdef INET6 2697 if (inp->inp_vflag & INP_IPV6) { 2698 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2699 CK_LIST_REMOVE(inp, inp_hash_wild); 2700 else 2701 CK_LIST_REMOVE(inp, inp_hash_exact); 2702 } else 2703 #endif 2704 { 2705 if (in_nullhost(inp->inp_faddr)) 2706 CK_LIST_REMOVE(inp, inp_hash_wild); 2707 else 2708 CK_LIST_REMOVE(inp, inp_hash_exact); 2709 } 2710 CK_LIST_REMOVE(inp, inp_portlist); 2711 inp->inp_flags &= ~INP_INHASHLIST; 2712 } 2713 2714 static void 2715 in_pcbremhash(struct inpcb *inp) 2716 { 2717 INP_HASH_WLOCK(inp->inp_pcbinfo); 2718 in_pcbremhash_locked(inp); 2719 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2720 } 2721 2722 /* 2723 * Move PCB to the proper hash bucket when { faddr, fport } have been 2724 * changed. NOTE: This does not handle the case of the lport changing (the 2725 * hashed port list would have to be updated as well), so the lport must 2726 * not change after in_pcbinshash() has been called. 
2727 */ 2728 void 2729 in_pcbrehash(struct inpcb *inp) 2730 { 2731 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2732 struct inpcbhead *head; 2733 uint32_t hash; 2734 bool connected; 2735 2736 INP_WLOCK_ASSERT(inp); 2737 INP_HASH_WLOCK_ASSERT(pcbinfo); 2738 KASSERT(inp->inp_flags & INP_INHASHLIST, 2739 ("%s: !INP_INHASHLIST", __func__)); 2740 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2741 ("%s: inp was disconnected", __func__)); 2742 2743 #ifdef INET6 2744 if (inp->inp_vflag & INP_IPV6) { 2745 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2746 inp->inp_fport, pcbinfo->ipi_hashmask); 2747 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2748 } else 2749 #endif 2750 { 2751 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2752 inp->inp_fport, pcbinfo->ipi_hashmask); 2753 connected = !in_nullhost(inp->inp_faddr); 2754 } 2755 2756 /* 2757 * When rehashing, the caller must ensure that either the new or the old 2758 * foreign address was unspecified. 2759 */ 2760 if (connected) 2761 CK_LIST_REMOVE(inp, inp_hash_wild); 2762 else 2763 CK_LIST_REMOVE(inp, inp_hash_exact); 2764 2765 if (connected) { 2766 head = &pcbinfo->ipi_hash_exact[hash]; 2767 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2768 } else { 2769 head = &pcbinfo->ipi_hash_wild[hash]; 2770 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2771 } 2772 } 2773 2774 /* 2775 * Check for alternatives when higher level complains 2776 * about service problems. For now, invalidate cached 2777 * routing information. If the route was created dynamically 2778 * (by a redirect), time to try a default gateway again. 2779 */ 2780 void 2781 in_losing(struct inpcb *inp) 2782 { 2783 2784 RO_INVALIDATE_CACHE(&inp->inp_route); 2785 return; 2786 } 2787 2788 /* 2789 * A set label operation has occurred at the socket layer, propagate the 2790 * label change into the in_pcb for the socket. 
2791 */ 2792 void 2793 in_pcbsosetlabel(struct socket *so) 2794 { 2795 #ifdef MAC 2796 struct inpcb *inp; 2797 2798 inp = sotoinpcb(so); 2799 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2800 2801 INP_WLOCK(inp); 2802 SOCK_LOCK(so); 2803 mac_inpcb_sosetlabel(so, inp); 2804 SOCK_UNLOCK(so); 2805 INP_WUNLOCK(inp); 2806 #endif 2807 } 2808 2809 void 2810 inp_wlock(struct inpcb *inp) 2811 { 2812 2813 INP_WLOCK(inp); 2814 } 2815 2816 void 2817 inp_wunlock(struct inpcb *inp) 2818 { 2819 2820 INP_WUNLOCK(inp); 2821 } 2822 2823 void 2824 inp_rlock(struct inpcb *inp) 2825 { 2826 2827 INP_RLOCK(inp); 2828 } 2829 2830 void 2831 inp_runlock(struct inpcb *inp) 2832 { 2833 2834 INP_RUNLOCK(inp); 2835 } 2836 2837 #ifdef INVARIANT_SUPPORT 2838 void 2839 inp_lock_assert(struct inpcb *inp) 2840 { 2841 2842 INP_WLOCK_ASSERT(inp); 2843 } 2844 2845 void 2846 inp_unlock_assert(struct inpcb *inp) 2847 { 2848 2849 INP_UNLOCK_ASSERT(inp); 2850 } 2851 #endif 2852 2853 void 2854 inp_apply_all(struct inpcbinfo *pcbinfo, 2855 void (*func)(struct inpcb *, void *), void *arg) 2856 { 2857 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2858 INPLOOKUP_WLOCKPCB); 2859 struct inpcb *inp; 2860 2861 while ((inp = inp_next(&inpi)) != NULL) 2862 func(inp, arg); 2863 } 2864 2865 struct socket * 2866 inp_inpcbtosocket(struct inpcb *inp) 2867 { 2868 2869 INP_WLOCK_ASSERT(inp); 2870 return (inp->inp_socket); 2871 } 2872 2873 void 2874 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2875 uint32_t *faddr, uint16_t *fp) 2876 { 2877 2878 INP_LOCK_ASSERT(inp); 2879 *laddr = inp->inp_laddr.s_addr; 2880 *faddr = inp->inp_faddr.s_addr; 2881 *lp = inp->inp_lport; 2882 *fp = inp->inp_fport; 2883 } 2884 2885 /* 2886 * Create an external-format (``xinpcb'') structure using the information in 2887 * the kernel-format in_pcb structure pointed to by inp. 
This is done to 2888 * reduce the spew of irrelevant information over this interface, to isolate 2889 * user code from changes in the kernel structure, and potentially to provide 2890 * information-hiding if we decide that some of this information should be 2891 * hidden from users. 2892 */ 2893 void 2894 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2895 { 2896 2897 bzero(xi, sizeof(*xi)); 2898 xi->xi_len = sizeof(struct xinpcb); 2899 if (inp->inp_socket) 2900 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2901 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2902 xi->inp_gencnt = inp->inp_gencnt; 2903 xi->inp_flow = inp->inp_flow; 2904 xi->inp_flowid = inp->inp_flowid; 2905 xi->inp_flowtype = inp->inp_flowtype; 2906 xi->inp_flags = inp->inp_flags; 2907 xi->inp_flags2 = inp->inp_flags2; 2908 xi->in6p_cksum = inp->in6p_cksum; 2909 xi->in6p_hops = inp->in6p_hops; 2910 xi->inp_ip_tos = inp->inp_ip_tos; 2911 xi->inp_vflag = inp->inp_vflag; 2912 xi->inp_ip_ttl = inp->inp_ip_ttl; 2913 xi->inp_ip_p = inp->inp_ip_p; 2914 xi->inp_ip_minttl = inp->inp_ip_minttl; 2915 } 2916 2917 int 2918 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2919 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2920 { 2921 struct sockopt sopt; 2922 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2923 INPLOOKUP_WLOCKPCB); 2924 struct inpcb *inp; 2925 struct sockopt_parameters *params; 2926 struct socket *so; 2927 int error; 2928 char buf[1024]; 2929 2930 if (req->oldptr != NULL || req->oldlen != 0) 2931 return (EINVAL); 2932 if (req->newptr == NULL) 2933 return (EPERM); 2934 if (req->newlen > sizeof(buf)) 2935 return (ENOMEM); 2936 error = SYSCTL_IN(req, buf, req->newlen); 2937 if (error != 0) 2938 return (error); 2939 if (req->newlen < sizeof(struct sockopt_parameters)) 2940 return (EINVAL); 2941 params = (struct sockopt_parameters *)buf; 2942 sopt.sopt_level = params->sop_level; 2943 sopt.sopt_name = params->sop_optname; 2944 
sopt.sopt_dir = SOPT_SET; 2945 sopt.sopt_val = params->sop_optval; 2946 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2947 sopt.sopt_td = NULL; 2948 #ifdef INET6 2949 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2950 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2951 params->sop_inc.inc6_laddr.s6_addr16[1] = 2952 htons(params->sop_inc.inc6_zoneid & 0xffff); 2953 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2954 params->sop_inc.inc6_faddr.s6_addr16[1] = 2955 htons(params->sop_inc.inc6_zoneid & 0xffff); 2956 } 2957 #endif 2958 if (params->sop_inc.inc_lport != htons(0) && 2959 params->sop_inc.inc_fport != htons(0)) { 2960 #ifdef INET6 2961 if (params->sop_inc.inc_flags & INC_ISIPV6) 2962 inpi.hash = INP6_PCBHASH( 2963 ¶ms->sop_inc.inc6_faddr, 2964 params->sop_inc.inc_lport, 2965 params->sop_inc.inc_fport, 2966 pcbinfo->ipi_hashmask); 2967 else 2968 #endif 2969 inpi.hash = INP_PCBHASH( 2970 ¶ms->sop_inc.inc_faddr, 2971 params->sop_inc.inc_lport, 2972 params->sop_inc.inc_fport, 2973 pcbinfo->ipi_hashmask); 2974 } 2975 while ((inp = inp_next(&inpi)) != NULL) 2976 if (inp->inp_gencnt == params->sop_id) { 2977 if (inp->inp_flags & INP_DROPPED) { 2978 INP_WUNLOCK(inp); 2979 return (ECONNRESET); 2980 } 2981 so = inp->inp_socket; 2982 KASSERT(so != NULL, ("inp_socket == NULL")); 2983 soref(so); 2984 if (params->sop_level == SOL_SOCKET) { 2985 INP_WUNLOCK(inp); 2986 error = sosetopt(so, &sopt); 2987 } else 2988 error = (*ctloutput_set)(inp, &sopt); 2989 sorele(so); 2990 break; 2991 } 2992 if (inp == NULL) 2993 error = ESRCH; 2994 return (error); 2995 } 2996 2997 #ifdef DDB 2998 static void 2999 db_print_indent(int indent) 3000 { 3001 int i; 3002 3003 for (i = 0; i < indent; i++) 3004 db_printf(" "); 3005 } 3006 3007 static void 3008 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 3009 { 3010 char faddr_str[48], laddr_str[48]; 3011 3012 db_print_indent(indent); 3013 db_printf("%s at %p\n", name, inc); 3014 3015 
indent += 2; 3016 3017 #ifdef INET6 3018 if (inc->inc_flags & INC_ISIPV6) { 3019 /* IPv6. */ 3020 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3021 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3022 } else 3023 #endif 3024 { 3025 /* IPv4. */ 3026 inet_ntoa_r(inc->inc_laddr, laddr_str); 3027 inet_ntoa_r(inc->inc_faddr, faddr_str); 3028 } 3029 db_print_indent(indent); 3030 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3031 ntohs(inc->inc_lport)); 3032 db_print_indent(indent); 3033 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3034 ntohs(inc->inc_fport)); 3035 } 3036 3037 static void 3038 db_print_inpflags(int inp_flags) 3039 { 3040 int comma; 3041 3042 comma = 0; 3043 if (inp_flags & INP_RECVOPTS) { 3044 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 3045 comma = 1; 3046 } 3047 if (inp_flags & INP_RECVRETOPTS) { 3048 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 3049 comma = 1; 3050 } 3051 if (inp_flags & INP_RECVDSTADDR) { 3052 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 3053 comma = 1; 3054 } 3055 if (inp_flags & INP_ORIGDSTADDR) { 3056 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 3057 comma = 1; 3058 } 3059 if (inp_flags & INP_HDRINCL) { 3060 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 3061 comma = 1; 3062 } 3063 if (inp_flags & INP_HIGHPORT) { 3064 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 3065 comma = 1; 3066 } 3067 if (inp_flags & INP_LOWPORT) { 3068 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 3069 comma = 1; 3070 } 3071 if (inp_flags & INP_ANONPORT) { 3072 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 3073 comma = 1; 3074 } 3075 if (inp_flags & INP_RECVIF) { 3076 db_printf("%sINP_RECVIF", comma ? ", " : ""); 3077 comma = 1; 3078 } 3079 if (inp_flags & INP_MTUDISC) { 3080 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 3081 comma = 1; 3082 } 3083 if (inp_flags & INP_RECVTTL) { 3084 db_printf("%sINP_RECVTTL", comma ? 
", " : ""); 3085 comma = 1; 3086 } 3087 if (inp_flags & INP_DONTFRAG) { 3088 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 3089 comma = 1; 3090 } 3091 if (inp_flags & INP_RECVTOS) { 3092 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 3093 comma = 1; 3094 } 3095 if (inp_flags & IN6P_IPV6_V6ONLY) { 3096 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3097 comma = 1; 3098 } 3099 if (inp_flags & IN6P_PKTINFO) { 3100 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3101 comma = 1; 3102 } 3103 if (inp_flags & IN6P_HOPLIMIT) { 3104 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3105 comma = 1; 3106 } 3107 if (inp_flags & IN6P_HOPOPTS) { 3108 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3109 comma = 1; 3110 } 3111 if (inp_flags & IN6P_DSTOPTS) { 3112 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3113 comma = 1; 3114 } 3115 if (inp_flags & IN6P_RTHDR) { 3116 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3117 comma = 1; 3118 } 3119 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3120 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3121 comma = 1; 3122 } 3123 if (inp_flags & IN6P_TCLASS) { 3124 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3125 comma = 1; 3126 } 3127 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3128 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3129 comma = 1; 3130 } 3131 if (inp_flags & INP_ONESBCAST) { 3132 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3133 comma = 1; 3134 } 3135 if (inp_flags & INP_DROPPED) { 3136 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3137 comma = 1; 3138 } 3139 if (inp_flags & INP_SOCKREF) { 3140 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3141 comma = 1; 3142 } 3143 if (inp_flags & IN6P_RFC2292) { 3144 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3145 comma = 1; 3146 } 3147 if (inp_flags & IN6P_MTU) { 3148 db_printf("IN6P_MTU%s", comma ? 
", " : ""); 3149 comma = 1; 3150 } 3151 } 3152 3153 static void 3154 db_print_inpvflag(u_char inp_vflag) 3155 { 3156 int comma; 3157 3158 comma = 0; 3159 if (inp_vflag & INP_IPV4) { 3160 db_printf("%sINP_IPV4", comma ? ", " : ""); 3161 comma = 1; 3162 } 3163 if (inp_vflag & INP_IPV6) { 3164 db_printf("%sINP_IPV6", comma ? ", " : ""); 3165 comma = 1; 3166 } 3167 if (inp_vflag & INP_IPV6PROTO) { 3168 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3169 comma = 1; 3170 } 3171 } 3172 3173 static void 3174 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3175 { 3176 3177 db_print_indent(indent); 3178 db_printf("%s at %p\n", name, inp); 3179 3180 indent += 2; 3181 3182 db_print_indent(indent); 3183 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3184 3185 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3186 3187 db_print_indent(indent); 3188 db_printf("inp_label: %p inp_flags: 0x%x (", 3189 inp->inp_label, inp->inp_flags); 3190 db_print_inpflags(inp->inp_flags); 3191 db_printf(")\n"); 3192 3193 db_print_indent(indent); 3194 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3195 inp->inp_vflag); 3196 db_print_inpvflag(inp->inp_vflag); 3197 db_printf(")\n"); 3198 3199 db_print_indent(indent); 3200 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3201 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3202 3203 db_print_indent(indent); 3204 #ifdef INET6 3205 if (inp->inp_vflag & INP_IPV6) { 3206 db_printf("in6p_options: %p in6p_outputopts: %p " 3207 "in6p_moptions: %p\n", inp->in6p_options, 3208 inp->in6p_outputopts, inp->in6p_moptions); 3209 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3210 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3211 inp->in6p_hops); 3212 } else 3213 #endif 3214 { 3215 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3216 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3217 inp->inp_options, inp->inp_moptions); 3218 } 3219 3220 db_print_indent(indent); 3221 db_printf("inp_gencnt: %ju\n", 
(uintmax_t)inp->inp_gencnt); 3222 } 3223 3224 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3225 { 3226 struct inpcb *inp; 3227 3228 if (!have_addr) { 3229 db_printf("usage: show inpcb <addr>\n"); 3230 return; 3231 } 3232 inp = (struct inpcb *)addr; 3233 3234 db_print_inpcb(inp, "inpcb", 0); 3235 } 3236 #endif /* DDB */ 3237 3238 #ifdef RATELIMIT 3239 /* 3240 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3241 * if any. 3242 */ 3243 int 3244 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3245 { 3246 union if_snd_tag_modify_params params = { 3247 .rate_limit.max_rate = max_pacing_rate, 3248 .rate_limit.flags = M_NOWAIT, 3249 }; 3250 struct m_snd_tag *mst; 3251 int error; 3252 3253 mst = inp->inp_snd_tag; 3254 if (mst == NULL) 3255 return (EINVAL); 3256 3257 if (mst->sw->snd_tag_modify == NULL) { 3258 error = EOPNOTSUPP; 3259 } else { 3260 error = mst->sw->snd_tag_modify(mst, ¶ms); 3261 } 3262 return (error); 3263 } 3264 3265 /* 3266 * Query existing TX rate limit based on the existing 3267 * "inp->inp_snd_tag", if any. 3268 */ 3269 int 3270 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3271 { 3272 union if_snd_tag_query_params params = { }; 3273 struct m_snd_tag *mst; 3274 int error; 3275 3276 mst = inp->inp_snd_tag; 3277 if (mst == NULL) 3278 return (EINVAL); 3279 3280 if (mst->sw->snd_tag_query == NULL) { 3281 error = EOPNOTSUPP; 3282 } else { 3283 error = mst->sw->snd_tag_query(mst, ¶ms); 3284 if (error == 0 && p_max_pacing_rate != NULL) 3285 *p_max_pacing_rate = params.rate_limit.max_rate; 3286 } 3287 return (error); 3288 } 3289 3290 /* 3291 * Query existing TX queue level based on the existing 3292 * "inp->inp_snd_tag", if any. 
3293 */ 3294 int 3295 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3296 { 3297 union if_snd_tag_query_params params = { }; 3298 struct m_snd_tag *mst; 3299 int error; 3300 3301 mst = inp->inp_snd_tag; 3302 if (mst == NULL) 3303 return (EINVAL); 3304 3305 if (mst->sw->snd_tag_query == NULL) 3306 return (EOPNOTSUPP); 3307 3308 error = mst->sw->snd_tag_query(mst, ¶ms); 3309 if (error == 0 && p_txqueue_level != NULL) 3310 *p_txqueue_level = params.rate_limit.queue_level; 3311 return (error); 3312 } 3313 3314 /* 3315 * Allocate a new TX rate limit send tag from the network interface 3316 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3317 */ 3318 int 3319 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3320 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3321 3322 { 3323 union if_snd_tag_alloc_params params = { 3324 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3325 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3326 .rate_limit.hdr.flowid = flowid, 3327 .rate_limit.hdr.flowtype = flowtype, 3328 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3329 .rate_limit.max_rate = max_pacing_rate, 3330 .rate_limit.flags = M_NOWAIT, 3331 }; 3332 int error; 3333 3334 INP_WLOCK_ASSERT(inp); 3335 3336 /* 3337 * If there is already a send tag, or the INP is being torn 3338 * down, allocating a new send tag is not allowed. Else send 3339 * tags may leak. 
3340 */ 3341 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3342 return (EINVAL); 3343 3344 error = m_snd_tag_alloc(ifp, ¶ms, st); 3345 #ifdef INET 3346 if (error == 0) { 3347 counter_u64_add(rate_limit_set_ok, 1); 3348 counter_u64_add(rate_limit_active, 1); 3349 } else if (error != EOPNOTSUPP) 3350 counter_u64_add(rate_limit_alloc_fail, 1); 3351 #endif 3352 return (error); 3353 } 3354 3355 void 3356 in_pcbdetach_tag(struct m_snd_tag *mst) 3357 { 3358 3359 m_snd_tag_rele(mst); 3360 #ifdef INET 3361 counter_u64_add(rate_limit_active, -1); 3362 #endif 3363 } 3364 3365 /* 3366 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3367 * if any: 3368 */ 3369 void 3370 in_pcbdetach_txrtlmt(struct inpcb *inp) 3371 { 3372 struct m_snd_tag *mst; 3373 3374 INP_WLOCK_ASSERT(inp); 3375 3376 mst = inp->inp_snd_tag; 3377 inp->inp_snd_tag = NULL; 3378 3379 if (mst == NULL) 3380 return; 3381 3382 m_snd_tag_rele(mst); 3383 #ifdef INET 3384 counter_u64_add(rate_limit_active, -1); 3385 #endif 3386 } 3387 3388 int 3389 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3390 { 3391 int error; 3392 3393 /* 3394 * If the existing send tag is for the wrong interface due to 3395 * a route change, first drop the existing tag. Set the 3396 * CHANGED flag so that we will keep trying to allocate a new 3397 * tag if we fail to allocate one this time. 3398 */ 3399 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3400 in_pcbdetach_txrtlmt(inp); 3401 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3402 } 3403 3404 /* 3405 * NOTE: When attaching to a network interface a reference is 3406 * made to ensure the network interface doesn't go away until 3407 * all ratelimit connections are gone. The network interface 3408 * pointers compared below represent valid network interfaces, 3409 * except when comparing towards NULL. 
3410 */ 3411 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3412 error = 0; 3413 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3414 if (inp->inp_snd_tag != NULL) 3415 in_pcbdetach_txrtlmt(inp); 3416 error = 0; 3417 } else if (inp->inp_snd_tag == NULL) { 3418 /* 3419 * In order to utilize packet pacing with RSS, we need 3420 * to wait until there is a valid RSS hash before we 3421 * can proceed: 3422 */ 3423 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3424 error = EAGAIN; 3425 } else { 3426 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3427 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3428 } 3429 } else { 3430 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3431 } 3432 if (error == 0 || error == EOPNOTSUPP) 3433 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3434 3435 return (error); 3436 } 3437 3438 /* 3439 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3440 * is set in the fast path and will attach/detach/modify the TX rate 3441 * limit send tag based on the socket's so_max_pacing_rate value. 3442 */ 3443 void 3444 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3445 { 3446 struct socket *socket; 3447 uint32_t max_pacing_rate; 3448 bool did_upgrade; 3449 3450 if (inp == NULL) 3451 return; 3452 3453 socket = inp->inp_socket; 3454 if (socket == NULL) 3455 return; 3456 3457 if (!INP_WLOCKED(inp)) { 3458 /* 3459 * NOTE: If the write locking fails, we need to bail 3460 * out and use the non-ratelimited ring for the 3461 * transmit until there is a new chance to get the 3462 * write lock. 3463 */ 3464 if (!INP_TRY_UPGRADE(inp)) 3465 return; 3466 did_upgrade = 1; 3467 } else { 3468 did_upgrade = 0; 3469 } 3470 3471 /* 3472 * NOTE: The so_max_pacing_rate value is read unlocked, 3473 * because atomic updates are not required since the variable 3474 * is checked at every mbuf we send. It is assumed that the 3475 * variable read itself will be atomic. 
3476 */ 3477 max_pacing_rate = socket->so_max_pacing_rate; 3478 3479 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3480 3481 if (did_upgrade) 3482 INP_DOWNGRADE(inp); 3483 } 3484 3485 /* 3486 * Track route changes for TX rate limiting. 3487 */ 3488 void 3489 in_pcboutput_eagain(struct inpcb *inp) 3490 { 3491 bool did_upgrade; 3492 3493 if (inp == NULL) 3494 return; 3495 3496 if (inp->inp_snd_tag == NULL) 3497 return; 3498 3499 if (!INP_WLOCKED(inp)) { 3500 /* 3501 * NOTE: If the write locking fails, we need to bail 3502 * out and use the non-ratelimited ring for the 3503 * transmit until there is a new chance to get the 3504 * write lock. 3505 */ 3506 if (!INP_TRY_UPGRADE(inp)) 3507 return; 3508 did_upgrade = 1; 3509 } else { 3510 did_upgrade = 0; 3511 } 3512 3513 /* detach rate limiting */ 3514 in_pcbdetach_txrtlmt(inp); 3515 3516 /* make sure new mbuf send tag allocation is made */ 3517 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3518 3519 if (did_upgrade) 3520 INP_DOWNGRADE(inp); 3521 } 3522 3523 #ifdef INET 3524 static void 3525 rl_init(void *st) 3526 { 3527 rate_limit_new = counter_u64_alloc(M_WAITOK); 3528 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3529 rate_limit_active = counter_u64_alloc(M_WAITOK); 3530 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3531 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3532 } 3533 3534 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3535 #endif 3536 #endif /* RATELIMIT */ 3537