1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include <sys/cdefs.h> 40 #include "opt_ddb.h" 41 #include "opt_ipsec.h" 42 #include "opt_inet.h" 43 #include "opt_inet6.h" 44 #include "opt_ratelimit.h" 45 #include "opt_route.h" 46 #include "opt_rss.h" 47 48 #include <sys/param.h> 49 #include <sys/hash.h> 50 #include <sys/systm.h> 51 #include <sys/libkern.h> 52 #include <sys/lock.h> 53 #include <sys/malloc.h> 54 #include <sys/mbuf.h> 55 #include <sys/eventhandler.h> 56 #include <sys/domain.h> 57 #include <sys/proc.h> 58 #include <sys/protosw.h> 59 #include <sys/smp.h> 60 #include <sys/smr.h> 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <sys/sockio.h> 64 #include <sys/priv.h> 65 #include <sys/proc.h> 66 #include <sys/refcount.h> 67 #include <sys/jail.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 71 #ifdef DDB 72 #include <ddb/ddb.h> 73 #endif 74 75 #include <vm/uma.h> 76 #include <vm/vm.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_private.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #include <netinet/tcp.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/in6_pcb.h> 100 #include <netinet6/in6_var.h> 101 #include <netinet6/ip6_var.h> 102 #endif /* INET6 */ 103 #include <net/route/nhop.h> 104 #endif 105 106 #include <netipsec/ipsec_support.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #define INPCBLBGROUP_SIZMIN 8 111 #define INPCBLBGROUP_SIZMAX 256 112 113 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 114 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 115 116 /* 117 * These configure the range of local port addresses assigned to 118 * "unspecified" outgoing connections/packets/whatever. 119 */ 120 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 121 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 122 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 123 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 124 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 125 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 126 127 /* 128 * Reserved ports accessible only to root. There are significant 129 * security considerations that must be accounted for when changing these, 130 * but the security benefits can be great. Please be careful. 131 */ 132 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 133 VNET_DEFINE(int, ipport_reservedlow); 134 135 /* Enable random ephemeral port allocation by default. 
*/ 136 VNET_DEFINE(int, ipport_randomized) = 1; 137 138 #ifdef INET 139 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 140 struct in_addr faddr, u_int fport_arg, 141 struct in_addr laddr, u_int lport_arg, 142 int lookupflags, uint8_t numa_domain); 143 144 #define RANGECHK(var, min, max) \ 145 if ((var) < (min)) { (var) = (min); } \ 146 else if ((var) > (max)) { (var) = (max); } 147 148 static int 149 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 150 { 151 int error; 152 153 error = sysctl_handle_int(oidp, arg1, arg2, req); 154 if (error == 0) { 155 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 157 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 160 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 161 } 162 return (error); 163 } 164 165 #undef RANGECHK 166 167 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 168 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 169 "IP Ports"); 170 171 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 172 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 173 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 174 ""); 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 188 
CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 196 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 197 &VNET_NAME(ipport_reservedhigh), 0, ""); 198 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 199 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 200 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 201 CTLFLAG_VNET | CTLFLAG_RW, 202 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 203 204 #ifdef RATELIMIT 205 counter_u64_t rate_limit_new; 206 counter_u64_t rate_limit_chg; 207 counter_u64_t rate_limit_active; 208 counter_u64_t rate_limit_alloc_fail; 209 counter_u64_t rate_limit_set_ok; 210 211 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 212 "IP Rate Limiting"); 213 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 214 &rate_limit_active, "Active rate limited connections"); 215 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 216 &rate_limit_alloc_fail, "Rate limited connection failures"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 218 &rate_limit_set_ok, "Rate limited setting succeeded"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 220 &rate_limit_new, "Total Rate limit new attempts"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 222 &rate_limit_chg, "Total Rate limited change attempts"); 223 #endif /* RATELIMIT */ 224 225 #endif /* INET */ 226 227 VNET_DEFINE(uint32_t, in_pcbhashseed); 228 static void 229 in_pcbhashseed_init(void) 230 { 231 232 V_in_pcbhashseed = arc4random(); 233 } 234 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 235 in_pcbhashseed_init, NULL); 236 237 #ifdef INET 238 VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1; 239 #define V_connect_inaddr_wild VNET(connect_inaddr_wild) 240 SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild, 241 CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0, 242 "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)"); 243 #endif 244 245 static void in_pcbremhash(struct inpcb *); 246 247 /* 248 * in_pcb.c: manage the Protocol Control Blocks. 249 * 250 * NOTE: It is assumed that most of these functions will be called with 251 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 252 * functions often modify hash chains or addresses in pcbs. 253 */ 254 255 static struct inpcblbgroup * 256 in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port, 257 const union in_dependaddr *addr, int size, uint8_t numa_domain) 258 { 259 struct inpcblbgroup *grp; 260 size_t bytes; 261 262 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 263 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 264 if (grp == NULL) 265 return (NULL); 266 grp->il_cred = crhold(cred); 267 grp->il_vflag = vflag; 268 grp->il_lport = port; 269 grp->il_numa_domain = numa_domain; 270 grp->il_dependladdr = *addr; 271 grp->il_inpsiz = size; 272 return (grp); 273 } 274 275 static void 276 in_pcblbgroup_free_deferred(epoch_context_t ctx) 277 { 278 struct inpcblbgroup *grp; 279 280 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 281 crfree(grp->il_cred); 282 free(grp, M_PCB); 283 } 284 285 static void 286 in_pcblbgroup_free(struct inpcblbgroup *grp) 287 { 288 289 CK_LIST_REMOVE(grp, il_list); 290 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 291 } 292 293 static void 294 in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp) 295 { 296 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 297 ("invalid local group size %d and count %d", 
grp->il_inpsiz, 298 grp->il_inpcnt)); 299 INP_WLOCK_ASSERT(inp); 300 301 inp->inp_flags |= INP_INLBGROUP; 302 grp->il_inp[grp->il_inpcnt] = inp; 303 304 /* 305 * Synchronize with in_pcblookup_lbgroup(): make sure that we don't 306 * expose a null slot to the lookup path. 307 */ 308 atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1); 309 } 310 311 static struct inpcblbgroup * 312 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 313 struct inpcblbgroup *old_grp, int size) 314 { 315 struct inpcblbgroup *grp; 316 int i; 317 318 grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag, 319 old_grp->il_lport, &old_grp->il_dependladdr, size, 320 old_grp->il_numa_domain); 321 if (grp == NULL) 322 return (NULL); 323 324 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 325 ("invalid new local group size %d and old local group count %d", 326 grp->il_inpsiz, old_grp->il_inpcnt)); 327 328 for (i = 0; i < old_grp->il_inpcnt; ++i) 329 grp->il_inp[i] = old_grp->il_inp[i]; 330 grp->il_inpcnt = old_grp->il_inpcnt; 331 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 332 in_pcblbgroup_free(old_grp); 333 return (grp); 334 } 335 336 /* 337 * Add PCB to load balance group for SO_REUSEPORT_LB option. 338 */ 339 static int 340 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 341 { 342 const static struct timeval interval = { 60, 0 }; 343 static struct timeval lastprint; 344 struct inpcbinfo *pcbinfo; 345 struct inpcblbgrouphead *hdr; 346 struct inpcblbgroup *grp; 347 uint32_t idx; 348 349 pcbinfo = inp->inp_pcbinfo; 350 351 INP_WLOCK_ASSERT(inp); 352 INP_HASH_WLOCK_ASSERT(pcbinfo); 353 354 #ifdef INET6 355 /* 356 * Don't allow IPv4 mapped INET6 wild socket. 
357 */ 358 if ((inp->inp_vflag & INP_IPV4) && 359 inp->inp_laddr.s_addr == INADDR_ANY && 360 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 361 return (0); 362 } 363 #endif 364 365 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 366 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 367 CK_LIST_FOREACH(grp, hdr, il_list) { 368 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 369 grp->il_vflag == inp->inp_vflag && 370 grp->il_lport == inp->inp_lport && 371 grp->il_numa_domain == numa_domain && 372 memcmp(&grp->il_dependladdr, 373 &inp->inp_inc.inc_ie.ie_dependladdr, 374 sizeof(grp->il_dependladdr)) == 0) { 375 break; 376 } 377 } 378 if (grp == NULL) { 379 /* Create new load balance group. */ 380 grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag, 381 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 382 INPCBLBGROUP_SIZMIN, numa_domain); 383 if (grp == NULL) 384 return (ENOBUFS); 385 in_pcblbgroup_insert(grp, inp); 386 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 387 } else if (grp->il_inpcnt == grp->il_inpsiz) { 388 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 389 if (ratecheck(&lastprint, &interval)) 390 printf("lb group port %d, limit reached\n", 391 ntohs(grp->il_lport)); 392 return (0); 393 } 394 395 /* Expand this local group. */ 396 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 397 if (grp == NULL) 398 return (ENOBUFS); 399 in_pcblbgroup_insert(grp, inp); 400 } else { 401 in_pcblbgroup_insert(grp, inp); 402 } 403 return (0); 404 } 405 406 /* 407 * Remove PCB from load balance group. 
408 */ 409 static void 410 in_pcbremlbgrouphash(struct inpcb *inp) 411 { 412 struct inpcbinfo *pcbinfo; 413 struct inpcblbgrouphead *hdr; 414 struct inpcblbgroup *grp; 415 int i; 416 417 pcbinfo = inp->inp_pcbinfo; 418 419 INP_WLOCK_ASSERT(inp); 420 MPASS(inp->inp_flags & INP_INLBGROUP); 421 INP_HASH_WLOCK_ASSERT(pcbinfo); 422 423 hdr = &pcbinfo->ipi_lbgrouphashbase[ 424 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 425 CK_LIST_FOREACH(grp, hdr, il_list) { 426 for (i = 0; i < grp->il_inpcnt; ++i) { 427 if (grp->il_inp[i] != inp) 428 continue; 429 430 if (grp->il_inpcnt == 1) { 431 /* We are the last, free this local group. */ 432 in_pcblbgroup_free(grp); 433 } else { 434 KASSERT(grp->il_inpcnt >= 2, 435 ("invalid local group count %d", 436 grp->il_inpcnt)); 437 grp->il_inp[i] = 438 grp->il_inp[grp->il_inpcnt - 1]; 439 440 /* 441 * Synchronize with in_pcblookup_lbgroup(). 442 */ 443 atomic_store_rel_int(&grp->il_inpcnt, 444 grp->il_inpcnt - 1); 445 } 446 inp->inp_flags &= ~INP_INLBGROUP; 447 return; 448 } 449 } 450 KASSERT(0, ("%s: did not find %p", __func__, inp)); 451 } 452 453 int 454 in_pcblbgroup_numa(struct inpcb *inp, int arg) 455 { 456 struct inpcbinfo *pcbinfo; 457 struct inpcblbgrouphead *hdr; 458 struct inpcblbgroup *grp; 459 int err, i; 460 uint8_t numa_domain; 461 462 switch (arg) { 463 case TCP_REUSPORT_LB_NUMA_NODOM: 464 numa_domain = M_NODOM; 465 break; 466 case TCP_REUSPORT_LB_NUMA_CURDOM: 467 numa_domain = PCPU_GET(domain); 468 break; 469 default: 470 if (arg < 0 || arg >= vm_ndomains) 471 return (EINVAL); 472 numa_domain = arg; 473 } 474 475 err = 0; 476 pcbinfo = inp->inp_pcbinfo; 477 INP_WLOCK_ASSERT(inp); 478 INP_HASH_WLOCK(pcbinfo); 479 hdr = &pcbinfo->ipi_lbgrouphashbase[ 480 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 481 CK_LIST_FOREACH(grp, hdr, il_list) { 482 for (i = 0; i < grp->il_inpcnt; ++i) { 483 if (grp->il_inp[i] != inp) 484 continue; 485 486 if (grp->il_numa_domain == numa_domain) { 487 goto 
abort_with_hash_wlock; 488 } 489 490 /* Remove it from the old group. */ 491 in_pcbremlbgrouphash(inp); 492 493 /* Add it to the new group based on numa domain. */ 494 in_pcbinslbgrouphash(inp, numa_domain); 495 goto abort_with_hash_wlock; 496 } 497 } 498 err = ENOENT; 499 abort_with_hash_wlock: 500 INP_HASH_WUNLOCK(pcbinfo); 501 return (err); 502 } 503 504 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 505 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 506 507 /* 508 * Initialize an inpcbinfo - a per-VNET instance of connections db. 509 */ 510 void 511 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 512 u_int hash_nelements, u_int porthash_nelements) 513 { 514 515 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 516 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 517 NULL, MTX_DEF); 518 #ifdef VIMAGE 519 pcbinfo->ipi_vnet = curvnet; 520 #endif 521 CK_LIST_INIT(&pcbinfo->ipi_listhead); 522 pcbinfo->ipi_count = 0; 523 pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, 524 &pcbinfo->ipi_hashmask); 525 pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, 526 &pcbinfo->ipi_hashmask); 527 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 528 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 529 &pcbinfo->ipi_porthashmask); 530 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 531 &pcbinfo->ipi_lbgrouphashmask); 532 pcbinfo->ipi_zone = pcbstor->ips_zone; 533 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 534 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 535 } 536 537 /* 538 * Destroy an inpcbinfo. 
539 */ 540 void 541 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 542 { 543 544 KASSERT(pcbinfo->ipi_count == 0, 545 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 546 547 hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); 548 hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); 549 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 550 pcbinfo->ipi_porthashmask); 551 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 552 pcbinfo->ipi_lbgrouphashmask); 553 mtx_destroy(&pcbinfo->ipi_hash_lock); 554 mtx_destroy(&pcbinfo->ipi_lock); 555 } 556 557 /* 558 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 559 */ 560 static void inpcb_fini(void *, int); 561 void 562 in_pcbstorage_init(void *arg) 563 { 564 struct inpcbstorage *pcbstor = arg; 565 566 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 567 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 568 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 569 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 570 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 571 uma_zone_set_smr(pcbstor->ips_portzone, 572 uma_zone_get_smr(pcbstor->ips_zone)); 573 } 574 575 /* 576 * Destroy a pcbstorage - used by unloadable protocols. 577 */ 578 void 579 in_pcbstorage_destroy(void *arg) 580 { 581 struct inpcbstorage *pcbstor = arg; 582 583 uma_zdestroy(pcbstor->ips_zone); 584 uma_zdestroy(pcbstor->ips_portzone); 585 } 586 587 /* 588 * Allocate a PCB and associate it with the socket. 589 * On success return with the PCB locked. 
590 */ 591 int 592 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 593 { 594 struct inpcb *inp; 595 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 596 int error; 597 #endif 598 599 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 600 if (inp == NULL) 601 return (ENOBUFS); 602 bzero(&inp->inp_start_zero, inp_zero_size); 603 #ifdef NUMA 604 inp->inp_numa_domain = M_NODOM; 605 #endif 606 inp->inp_pcbinfo = pcbinfo; 607 inp->inp_socket = so; 608 inp->inp_cred = crhold(so->so_cred); 609 inp->inp_inc.inc_fibnum = so->so_fibnum; 610 #ifdef MAC 611 error = mac_inpcb_init(inp, M_NOWAIT); 612 if (error != 0) 613 goto out; 614 mac_inpcb_create(so, inp); 615 #endif 616 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 617 error = ipsec_init_pcbpolicy(inp); 618 if (error != 0) { 619 #ifdef MAC 620 mac_inpcb_destroy(inp); 621 #endif 622 goto out; 623 } 624 #endif /*IPSEC*/ 625 #ifdef INET6 626 if (INP_SOCKAF(so) == AF_INET6) { 627 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 628 if (V_ip6_v6only) 629 inp->inp_flags |= IN6P_IPV6_V6ONLY; 630 #ifdef INET 631 else 632 inp->inp_vflag |= INP_IPV4; 633 #endif 634 if (V_ip6_auto_flowlabel) 635 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 636 inp->in6p_hops = -1; /* use kernel default */ 637 } 638 #endif 639 #if defined(INET) && defined(INET6) 640 else 641 #endif 642 #ifdef INET 643 inp->inp_vflag |= INP_IPV4; 644 #endif 645 inp->inp_smr = SMR_SEQ_INVALID; 646 647 /* 648 * Routes in inpcb's can cache L2 as well; they are guaranteed 649 * to be cleaned up. 650 */ 651 inp->inp_route.ro_flags = RT_LLE_CACHE; 652 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 653 INP_WLOCK(inp); 654 INP_INFO_WLOCK(pcbinfo); 655 pcbinfo->ipi_count++; 656 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 657 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 658 INP_INFO_WUNLOCK(pcbinfo); 659 so->so_pcb = inp; 660 661 return (0); 662 663 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 664 out: 665 crfree(inp->inp_cred); 666 #ifdef INVARIANTS 667 inp->inp_cred = NULL; 668 #endif 669 uma_zfree_smr(pcbinfo->ipi_zone, inp); 670 return (error); 671 #endif 672 } 673 674 #ifdef INET 675 int 676 in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 677 { 678 int anonport, error; 679 680 KASSERT(sin == NULL || sin->sin_family == AF_INET, 681 ("%s: invalid address family for %p", __func__, sin)); 682 KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in), 683 ("%s: invalid address length for %p", __func__, sin)); 684 INP_WLOCK_ASSERT(inp); 685 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 686 687 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 688 return (EINVAL); 689 anonport = sin == NULL || sin->sin_port == 0; 690 error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr, 691 &inp->inp_lport, cred); 692 if (error) 693 return (error); 694 if (in_pcbinshash(inp) != 0) { 695 inp->inp_laddr.s_addr = INADDR_ANY; 696 inp->inp_lport = 0; 697 return (EAGAIN); 698 } 699 if (anonport) 700 inp->inp_flags |= INP_ANONPORT; 701 return (0); 702 } 703 #endif 704 705 #if defined(INET) || defined(INET6) 706 /* 707 * Assign a local port like in_pcb_lport(), but also used with connect() 708 * and a foreign address and port. If fsa is non-NULL, choose a local port 709 * that is unused with those, otherwise one that is completely unused. 710 * lsa can be NULL for IPv6. 
711 */ 712 int 713 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 714 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 715 { 716 struct inpcbinfo *pcbinfo; 717 struct inpcb *tmpinp; 718 unsigned short *lastport; 719 int count, error; 720 u_short aux, first, last, lport; 721 #ifdef INET 722 struct in_addr laddr, faddr; 723 #endif 724 #ifdef INET6 725 struct in6_addr *laddr6, *faddr6; 726 #endif 727 728 pcbinfo = inp->inp_pcbinfo; 729 730 /* 731 * Because no actual state changes occur here, a global write lock on 732 * the pcbinfo isn't required. 733 */ 734 INP_LOCK_ASSERT(inp); 735 INP_HASH_LOCK_ASSERT(pcbinfo); 736 737 if (inp->inp_flags & INP_HIGHPORT) { 738 first = V_ipport_hifirstauto; /* sysctl */ 739 last = V_ipport_hilastauto; 740 lastport = &pcbinfo->ipi_lasthi; 741 } else if (inp->inp_flags & INP_LOWPORT) { 742 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 743 if (error) 744 return (error); 745 first = V_ipport_lowfirstauto; /* 1023 */ 746 last = V_ipport_lowlastauto; /* 600 */ 747 lastport = &pcbinfo->ipi_lastlow; 748 } else { 749 first = V_ipport_firstauto; /* sysctl */ 750 last = V_ipport_lastauto; 751 lastport = &pcbinfo->ipi_lastport; 752 } 753 754 /* 755 * Instead of having two loops further down counting up or down 756 * make sure that first is always <= last and go with only one 757 * code path implementing all logic. 
758 */ 759 if (first > last) { 760 aux = first; 761 first = last; 762 last = aux; 763 } 764 765 #ifdef INET 766 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 767 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 768 if (lsa != NULL) 769 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 770 if (fsa != NULL) 771 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 772 } 773 #endif 774 #ifdef INET6 775 laddr6 = NULL; 776 if ((inp->inp_vflag & INP_IPV6) != 0) { 777 if (lsa != NULL) 778 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 779 if (fsa != NULL) 780 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 781 } 782 #endif 783 784 tmpinp = NULL; 785 lport = *lportp; 786 787 if (V_ipport_randomized) 788 *lastport = first + (arc4random() % (last - first)); 789 790 count = last - first; 791 792 do { 793 if (count-- < 0) /* completely used? */ 794 return (EADDRNOTAVAIL); 795 ++*lastport; 796 if (*lastport < first || *lastport > last) 797 *lastport = first; 798 lport = htons(*lastport); 799 800 if (fsa != NULL) { 801 #ifdef INET 802 if (lsa->sa_family == AF_INET) { 803 tmpinp = in_pcblookup_hash_locked(pcbinfo, 804 faddr, fport, laddr, lport, lookupflags, 805 M_NODOM); 806 } 807 #endif 808 #ifdef INET6 809 if (lsa->sa_family == AF_INET6) { 810 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 811 faddr6, fport, laddr6, lport, lookupflags, 812 M_NODOM); 813 } 814 #endif 815 } else { 816 #ifdef INET6 817 if ((inp->inp_vflag & INP_IPV6) != 0) { 818 tmpinp = in6_pcblookup_local(pcbinfo, 819 &inp->in6p_laddr, lport, lookupflags, cred); 820 #ifdef INET 821 if (tmpinp == NULL && 822 (inp->inp_vflag & INP_IPV4)) 823 tmpinp = in_pcblookup_local(pcbinfo, 824 laddr, lport, lookupflags, cred); 825 #endif 826 } 827 #endif 828 #if defined(INET) && defined(INET6) 829 else 830 #endif 831 #ifdef INET 832 tmpinp = in_pcblookup_local(pcbinfo, laddr, 833 lport, lookupflags, cred); 834 #endif 835 } 836 } while (tmpinp != NULL); 837 838 *lportp = lport; 839 840 return (0); 841 } 842 
843 /* 844 * Select a local port (number) to use. 845 */ 846 int 847 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 848 struct ucred *cred, int lookupflags) 849 { 850 struct sockaddr_in laddr; 851 852 if (laddrp) { 853 bzero(&laddr, sizeof(laddr)); 854 laddr.sin_family = AF_INET; 855 laddr.sin_addr = *laddrp; 856 } 857 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 858 NULL, lportp, NULL, 0, cred, lookupflags)); 859 } 860 #endif /* INET || INET6 */ 861 862 #ifdef INET 863 /* 864 * Determine whether the inpcb can be bound to the specified address/port tuple. 865 */ 866 static int 867 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 868 const u_short lport, int sooptions, int lookupflags, struct ucred *cred) 869 { 870 int reuseport, reuseport_lb; 871 872 INP_LOCK_ASSERT(inp); 873 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 874 875 reuseport = (sooptions & SO_REUSEPORT); 876 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 877 878 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 879 /* 880 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 881 * allow complete duplication of binding if 882 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 883 * and a multicast address is bound on both 884 * new and duplicated sockets. 885 */ 886 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 887 reuseport = SO_REUSEADDR | SO_REUSEPORT; 888 /* 889 * XXX: How to deal with SO_REUSEPORT_LB here? 890 * Treat same as SO_REUSEPORT for now. 891 */ 892 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 893 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 894 } else if (!in_nullhost(laddr)) { 895 struct sockaddr_in sin; 896 897 memset(&sin, 0, sizeof(sin)); 898 sin.sin_family = AF_INET; 899 sin.sin_len = sizeof(sin); 900 sin.sin_addr = laddr; 901 902 /* 903 * Is the address a local IP address? 904 * If INP_BINDANY is set, then the socket may be bound 905 * to any endpoint address, local or not. 
906 */ 907 if ((inp->inp_flags & INP_BINDANY) == 0 && 908 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 909 return (EADDRNOTAVAIL); 910 } 911 912 if (lport != 0) { 913 struct inpcb *t; 914 915 if (ntohs(lport) <= V_ipport_reservedhigh && 916 ntohs(lport) >= V_ipport_reservedlow && 917 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 918 return (EACCES); 919 920 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 921 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 922 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 923 INPLOOKUP_WILDCARD, cred); 924 if (t != NULL && 925 (inp->inp_socket->so_type != SOCK_STREAM || 926 in_nullhost(t->inp_faddr)) && 927 (!in_nullhost(laddr) || 928 !in_nullhost(t->inp_laddr)) && 929 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 930 return (EADDRINUSE); 931 } 932 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 933 lookupflags, cred); 934 if (t != NULL && ((reuseport | reuseport_lb) & 935 t->inp_socket->so_options) == 0) { 936 #ifdef INET6 937 if (!in_nullhost(laddr) || 938 !in_nullhost(t->inp_laddr) || 939 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 940 (t->inp_vflag & INP_IPV6PROTO) == 0) 941 #endif 942 return (EADDRINUSE); 943 } 944 } 945 return (0); 946 } 947 948 /* 949 * Set up a bind operation on a PCB, performing port allocation 950 * as required, but do not actually modify the PCB. Callers can 951 * either complete the bind by setting inp_laddr/inp_lport and 952 * calling in_pcbinshash(), or they can just use the resulting 953 * port and address to authorise the sending of a once-off packet. 954 * 955 * On error, the values of *laddrp and *lportp are not changed. 
956 */ 957 int 958 in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp, 959 u_short *lportp, struct ucred *cred) 960 { 961 struct socket *so = inp->inp_socket; 962 struct in_addr laddr; 963 u_short lport = 0; 964 int lookupflags, sooptions; 965 int error; 966 967 /* 968 * No state changes, so read locks are sufficient here. 969 */ 970 INP_LOCK_ASSERT(inp); 971 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 972 973 laddr.s_addr = *laddrp; 974 if (sin != NULL && laddr.s_addr != INADDR_ANY) 975 return (EINVAL); 976 977 lookupflags = 0; 978 sooptions = atomic_load_int(&so->so_options); 979 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0) 980 lookupflags = INPLOOKUP_WILDCARD; 981 if (sin == NULL) { 982 if ((error = prison_local_ip4(cred, &laddr)) != 0) 983 return (error); 984 } else { 985 KASSERT(sin->sin_family == AF_INET, 986 ("%s: invalid family for address %p", __func__, sin)); 987 KASSERT(sin->sin_len == sizeof(*sin), 988 ("%s: invalid length for address %p", __func__, sin)); 989 990 error = prison_local_ip4(cred, &sin->sin_addr); 991 if (error) 992 return (error); 993 if (sin->sin_port != *lportp) { 994 /* Don't allow the port to change. */ 995 if (*lportp != 0) 996 return (EINVAL); 997 lport = sin->sin_port; 998 } 999 laddr = sin->sin_addr; 1000 1001 /* See if this address/port combo is available. */ 1002 error = in_pcbbind_avail(inp, laddr, lport, sooptions, 1003 lookupflags, cred); 1004 if (error != 0) 1005 return (error); 1006 } 1007 if (*lportp != 0) 1008 lport = *lportp; 1009 if (lport == 0) { 1010 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1011 if (error != 0) 1012 return (error); 1013 } 1014 *laddrp = laddr.s_addr; 1015 *lportp = lport; 1016 return (0); 1017 } 1018 1019 /* 1020 * Connect from a socket to a specified address. 1021 * Both address and port must be specified in argument sin. 1022 * If don't have a local address for this socket yet, 1023 * then pick one. 
1024 */ 1025 int 1026 in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred) 1027 { 1028 u_short lport, fport; 1029 in_addr_t laddr, faddr; 1030 int anonport, error; 1031 1032 INP_WLOCK_ASSERT(inp); 1033 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1034 KASSERT(in_nullhost(inp->inp_faddr), 1035 ("%s: inp is already connected", __func__)); 1036 1037 lport = inp->inp_lport; 1038 laddr = inp->inp_laddr.s_addr; 1039 anonport = (lport == 0); 1040 error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport, 1041 cred); 1042 if (error) 1043 return (error); 1044 1045 inp->inp_faddr.s_addr = faddr; 1046 inp->inp_fport = fport; 1047 1048 /* Do the initial binding of the local address if required. */ 1049 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1050 inp->inp_lport = lport; 1051 inp->inp_laddr.s_addr = laddr; 1052 if (in_pcbinshash(inp) != 0) { 1053 inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = 1054 INADDR_ANY; 1055 inp->inp_lport = inp->inp_fport = 0; 1056 return (EAGAIN); 1057 } 1058 } else { 1059 inp->inp_lport = lport; 1060 inp->inp_laddr.s_addr = laddr; 1061 if ((inp->inp_flags & INP_INHASHLIST) != 0) 1062 in_pcbrehash(inp); 1063 else 1064 in_pcbinshash(inp); 1065 } 1066 1067 if (anonport) 1068 inp->inp_flags |= INP_ANONPORT; 1069 return (0); 1070 } 1071 1072 /* 1073 * Do proper source address selection on an unbound socket in case 1074 * of connect. Take jails into account as well. 1075 */ 1076 int 1077 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1078 struct ucred *cred) 1079 { 1080 struct ifaddr *ifa; 1081 struct sockaddr *sa; 1082 struct sockaddr_in *sin, dst; 1083 struct nhop_object *nh; 1084 int error; 1085 1086 NET_EPOCH_ASSERT(); 1087 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1088 1089 /* 1090 * Bypass source address selection and use the primary jail IP 1091 * if requested. 
1092 */ 1093 if (!prison_saddrsel_ip4(cred, laddr)) 1094 return (0); 1095 1096 error = 0; 1097 1098 nh = NULL; 1099 bzero(&dst, sizeof(dst)); 1100 sin = &dst; 1101 sin->sin_family = AF_INET; 1102 sin->sin_len = sizeof(struct sockaddr_in); 1103 sin->sin_addr.s_addr = faddr->s_addr; 1104 1105 /* 1106 * If route is known our src addr is taken from the i/f, 1107 * else punt. 1108 * 1109 * Find out route to destination. 1110 */ 1111 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1112 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1113 0, NHR_NONE, 0); 1114 1115 /* 1116 * If we found a route, use the address corresponding to 1117 * the outgoing interface. 1118 * 1119 * Otherwise assume faddr is reachable on a directly connected 1120 * network and try to find a corresponding interface to take 1121 * the source address from. 1122 */ 1123 if (nh == NULL || nh->nh_ifp == NULL) { 1124 struct in_ifaddr *ia; 1125 struct ifnet *ifp; 1126 1127 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1128 inp->inp_socket->so_fibnum)); 1129 if (ia == NULL) { 1130 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1131 inp->inp_socket->so_fibnum)); 1132 } 1133 if (ia == NULL) { 1134 error = ENETUNREACH; 1135 goto done; 1136 } 1137 1138 if (!prison_flag(cred, PR_IP4)) { 1139 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1140 goto done; 1141 } 1142 1143 ifp = ia->ia_ifp; 1144 ia = NULL; 1145 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1146 sa = ifa->ifa_addr; 1147 if (sa->sa_family != AF_INET) 1148 continue; 1149 sin = (struct sockaddr_in *)sa; 1150 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1151 ia = (struct in_ifaddr *)ifa; 1152 break; 1153 } 1154 } 1155 if (ia != NULL) { 1156 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1157 goto done; 1158 } 1159 1160 /* 3. As a last resort return the 'default' jail address. 
*/ 1161 error = prison_get_ip4(cred, laddr); 1162 goto done; 1163 } 1164 1165 /* 1166 * If the outgoing interface on the route found is not 1167 * a loopback interface, use the address from that interface. 1168 * In case of jails do those three steps: 1169 * 1. check if the interface address belongs to the jail. If so use it. 1170 * 2. check if we have any address on the outgoing interface 1171 * belonging to this jail. If so use it. 1172 * 3. as a last resort return the 'default' jail address. 1173 */ 1174 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1175 struct in_ifaddr *ia; 1176 struct ifnet *ifp; 1177 1178 /* If not jailed, use the default returned. */ 1179 if (!prison_flag(cred, PR_IP4)) { 1180 ia = (struct in_ifaddr *)nh->nh_ifa; 1181 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1182 goto done; 1183 } 1184 1185 /* Jailed. */ 1186 /* 1. Check if the iface address belongs to the jail. */ 1187 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1188 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1189 ia = (struct in_ifaddr *)nh->nh_ifa; 1190 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1191 goto done; 1192 } 1193 1194 /* 1195 * 2. Check if we have any address on the outgoing interface 1196 * belonging to this jail. 1197 */ 1198 ia = NULL; 1199 ifp = nh->nh_ifp; 1200 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1201 sa = ifa->ifa_addr; 1202 if (sa->sa_family != AF_INET) 1203 continue; 1204 sin = (struct sockaddr_in *)sa; 1205 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1206 ia = (struct in_ifaddr *)ifa; 1207 break; 1208 } 1209 } 1210 if (ia != NULL) { 1211 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1212 goto done; 1213 } 1214 1215 /* 3. As a last resort return the 'default' jail address. */ 1216 error = prison_get_ip4(cred, laddr); 1217 goto done; 1218 } 1219 1220 /* 1221 * The outgoing interface is marked with 'loopback net', so a route 1222 * to ourselves is here. 
1223 * Try to find the interface of the destination address and then 1224 * take the address from there. That interface is not necessarily 1225 * a loopback interface. 1226 * In case of jails, check that it is an address of the jail 1227 * and if we cannot find, fall back to the 'default' jail address. 1228 */ 1229 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1230 struct in_ifaddr *ia; 1231 1232 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1233 inp->inp_socket->so_fibnum)); 1234 if (ia == NULL) 1235 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1236 inp->inp_socket->so_fibnum)); 1237 if (ia == NULL) 1238 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1239 1240 if (!prison_flag(cred, PR_IP4)) { 1241 if (ia == NULL) { 1242 error = ENETUNREACH; 1243 goto done; 1244 } 1245 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1246 goto done; 1247 } 1248 1249 /* Jailed. */ 1250 if (ia != NULL) { 1251 struct ifnet *ifp; 1252 1253 ifp = ia->ia_ifp; 1254 ia = NULL; 1255 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1256 sa = ifa->ifa_addr; 1257 if (sa->sa_family != AF_INET) 1258 continue; 1259 sin = (struct sockaddr_in *)sa; 1260 if (prison_check_ip4(cred, 1261 &sin->sin_addr) == 0) { 1262 ia = (struct in_ifaddr *)ifa; 1263 break; 1264 } 1265 } 1266 if (ia != NULL) { 1267 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1268 goto done; 1269 } 1270 } 1271 1272 /* 3. As a last resort return the 'default' jail address. */ 1273 error = prison_get_ip4(cred, laddr); 1274 goto done; 1275 } 1276 1277 done: 1278 if (error == 0 && laddr->s_addr == INADDR_ANY) 1279 return (EHOSTUNREACH); 1280 return (error); 1281 } 1282 1283 /* 1284 * Set up for a connect from a socket to the specified address. 1285 * On entry, *laddrp and *lportp should contain the current local 1286 * address and port for the PCB; these are updated to the values 1287 * that should be placed in inp_laddr and inp_lport to complete 1288 * the connect. 
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port.  These are not updated in the error case.
 *
 * Returns 0 on success, otherwise an errno value: EADDRNOTAVAIL when sin
 * carries port 0 or the selected multicast interface has no usable address,
 * ENETUNREACH when the destination is INADDR_ANY and is not substituted,
 * EADDRINUSE when the resulting 4-tuple is already found in the hash, or
 * whatever prison_get_ip4(), in_pcbladdr() or in_pcb_lport_dest() report.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	/* A destination port is mandatory. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	/* Work on local copies; the out parameters are written only at the end. */
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	/*
	 * NOTE(review): this stores inp_flowid/inp_flowtype in the inpcb
	 * although the comment above says a read lock is sufficient -- confirm
	 * that all callers hold the inpcb write lock when ROUTE_MPATH is on.
	 */
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			/* prison_get_ip4() may fail or rewrite faddr. */
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	} else if (faddr.s_addr == INADDR_ANY) {
		/* Substitution is disabled or there is no address to use. */
		return (ENETUNREACH);
	}
	if (laddr.s_addr == INADDR_ANY) {
		/*
		 * No local address yet: select one.  The error is examined
		 * only after the multicast override below, which may clear
		 * or replace it.
		 */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				/*
				 * First address on that interface that the
				 * jail check accepts; ia is NULL if the loop
				 * runs to completion.
				 */
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/* Local port already fixed: the full 4-tuple must be unused. */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM) != NULL)
			return (EADDRINUSE);
	} else {
		struct sockaddr_in lsin, fsin;

		/* No local port yet: have one chosen for this destination. */
		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Disconnect an inpcb: remove it from the hash and clear its local address,
 * foreign address and foreign port.  The local port is left untouched.
 * The caller must hold the inpcb write lock and the hash write lock.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
#endif /* INET */

/*
 * inpcb hash lookups are protected by SMR section.
 *
 * Once desired pcb has been found, switching from SMR section to a pcb
 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
 * here because SMR is a critical section.
 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 */

/*
 * Acquire the inpcb lock: for read when lock is INPLOOKUP_RLOCKPCB,
 * for write otherwise.
 */
void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

/*
 * Release the inpcb lock previously taken with the same lock mode.
 */
void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

/*
 * Try to acquire the inpcb lock in the given mode without sleeping.
 * Returns the result of rw_try_rlock()/rw_try_wlock(); callers in this
 * file treat a non-zero value as "lock acquired".
 */
int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

/*
 * Switch from the SMR section to the inpcb lock.
 *
 * Must be called inside the pcbinfo's SMR section; the section has been
 * exited on every return path.  Returns true with the inpcb locked in mode
 * 'lock', or false with no lock held when the inpcb carries any of the
 * 'ignflags' flags, could not be referenced, or was freed while we waited
 * for its lock.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is available right away. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the inpcb with a reference while still inside the
	 * SMR section, then leave the section and block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		/*
		 * Drop our temporary reference.  A true return means it was
		 * the last one: the inpcb has been unlocked and freed.
		 */
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock certainly did not
		 * go through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and have another reference that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* The reference count already dropped to zero. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

/*
 * Public variant of _inp_smr_lock() used by lookups: both freed and
 * dropped inpcbs are rejected.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list.  This is violated by in_pcbrehash().
 */
/* First element of the all-pcbs list, or of exact-match hash slot 'hash'. */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of 'inp' on whichever of the two lists is being walked. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that 'inp' is locked in the mode the iterator was created with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() left the SMR section, so
				 * re-enter it and start over from the list
				 * head.  The for-loop increment then steps
				 * past that head element.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/*
		 * On success _inp_smr_lock() has already exited the SMR
		 * section; only the "nothing found" case still has to.
		 */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked by us and anchors the traversal. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' unlocks the old inpcb, returns NULL. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jumping to 'restart' should yield the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbounded?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		/* True means our reference was the last: inp is gone. */
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand over: release the previous inpcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	/* The caller must already hold a reference of its own. */
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 *
 * Returns false when other references remain; the inpcb then stays
 * read-locked.  Returns true when this was the last reference; the inpcb
 * has then been unlocked and handed to uma_zfree_smr() and must not be
 * touched again.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* The last reference may only go away after in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Write-locked counterpart of in_pcbrele_rlocked(): same contract, but the
 * inpcb is held (and, on the last reference, released) under the write lock.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Dispatch to in_pcbrele_rlocked() or in_pcbrele_wlocked() according to the
 * lock mode the caller holds.
 */
bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Unlink from the all-pcbs list and update the pcbinfo accounting. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark as freed and sever the socket <-> inpcb linkage both ways. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Saved in a local: freed below, after the inpcb reference is dropped. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop the reference.  If it was the last one, in_pcbrele_wlocked()
	 * has unlocked and freed the inpcb; otherwise we must unlock it
	 * ourselves.  The inpcb must not be dereferenced after this point.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 *
 * 'mem' is the inpcb being disposed of; 'size' is unused.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	/* Flag first, then unhash. */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}

#ifdef INET
/*
 * Common routines to return the socket addresses associated with inpcbs.
 */

/*
 * Store the local address and port of the socket's inpcb into 'sa' as a
 * struct sockaddr_in.  Always returns 0.
 */
int
in_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));

	/* The compound literal zeroes every member not named here. */
	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_lport,
		.sin_addr = inp->inp_laddr,
	};

	return (0);
}

/*
 * Store the foreign (peer) address and port of the socket's inpcb into 'sa'
 * as a struct sockaddr_in.  Always returns 0.
 */
int
in_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));

	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_fport,
		.sin_addr = inp->inp_faddr,
	};

	return (0);
}

/*
 * Iterator match function: accept IPv4 inpcbs that have multicast options
 * attached.  The second argument is unused.
 */
static bool
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
{

	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
		return (true);
	else
		return (false);
}

/*
 * For every IPv4 inpcb of 'pcbinfo' that has multicast options, forget 'ifp'
 * as the outgoing multicast interface and leave all groups that were joined
 * through 'ifp'.  The caller must hold the IN_MULTI lock.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	/* inp_next() returns each inpcb write-locked and unlocks the previous. */
	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		/*
		 * The walk is restarted from the head after every removal
		 * rather than continued from the removed filter.
		 */
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}

/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
 *
 * Without INPLOOKUP_WILDCARD only an unconnected inpcb whose local address
 * and port are equal to the arguments is returned.  With INPLOOKUP_WILDCARD
 * the inpcb on the port with the lowest "wildcard cost" is returned; see
 * the scoring in the loop below.  In both modes only inpcbs for which
 * prison_equal_ip4() holds against 'cred' are considered.
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
	/* Initial bound: one more than the highest cost the loop can assign. */
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs.  Look for best
			 * fit.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				/* Cost of this candidate; lower is better. */
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				/* A connected pcb costs one point. */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				/*
				 * One point when exactly one of the two local
				 * addresses is a wildcard; two different
				 * specific addresses do not match at all.
				 */
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				/* Keep the cheapest; cost 0 cannot be beaten. */
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * A group matches the requested NUMA domain when no particular domain was
 * requested (M_NODOM) or when the group is bound to exactly that domain.
 */
static bool
in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
{
	return (domain == M_NODOM || domain == grp->il_numa_domain);
}

/*
 * Select a load-balancing group bound to 'lport' that matches 'laddr' and
 * return one of its member inpcbs, chosen by INP_PCBLBGROUP_PKTHASH(faddr,
 * lport, fport) modulo the member count.  Returns NULL when no group
 * matches or the selected group has no members.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;
	struct inpcb *inp;
	u_int count;

	INP_HASH_LOCK_ASSERT(pcbinfo);
	NET_EPOCH_ASSERT();

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* A jailed group is usable only if laddr belongs to its jail. */
		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		/*
		 * Within each category the first group seen is kept until a
		 * later one also matches the NUMA domain.  jail_exact is the
		 * exception: it is overwritten by every candidate.
		 */
		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_numa_match(grp, domain))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_numa_match(grp, domain))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_wild = grp;
			}
		}
	}

	/* Order of preference among the categories. */
	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);

out:
	/*
	 * Synchronize with in_pcblbgroup_insert().
	 */
	count = atomic_load_acq_int(&grp->il_inpcnt);
	/* An empty group also avoids a modulo by zero below. */
	if (count == 0)
		return (NULL);
	inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count];
	KASSERT(inp != NULL, ("%s: inp == NULL", __func__));
	return (inp);
}

/*
 * True when 'inp' is an IPv4 inpcb whose full 4-tuple equals the arguments.
 */
static bool
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (inp->inp_faddr.s_addr == faddr.s_addr &&
	    inp->inp_laddr.s_addr == laddr.s_addr &&
	    inp->inp_fport == fport &&
	    inp->inp_lport == lport)
		return (true);
	return (false);
}

/*
 * Search the exact-match hash for an inpcb with the given 4-tuple.
 * Returns the first match or NULL; the inpcb is neither locked nor
 * referenced.
 */
static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}

/*
 * Result of in_pcblookup_wild_match(): no match, match through a wildcard
 * local address, or match on the specific local address.
 */
typedef enum {
	INPLOOKUP_MATCH_NONE = 0,
	INPLOOKUP_MATCH_WILD = 1,
	INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;

/*
 * Classify how an unconnected IPv4 inpcb matches the local address/port.
 * Connected inpcbs and inpcbs on a different port never match.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}

/* Sentinel return value: the lockless lookup must be redone serialized. */
#define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)

/*
 * Lockless wildcard lookup, run inside the pcbinfo's SMR section.
 *
 * Only the first candidate in the hash chain is tried.  Returns that inpcb
 * locked in mode 'lockflags' when it could be locked, still matches and
 * passes the jail check; NULL when the chain holds no candidate; and
 * INP_LOOKUP_AGAIN otherwise.  inp_smr_lock() leaves the SMR section on
 * success and on failure alike, so the walk cannot simply move on to the
 * next entry once a lock attempt has been made.
 */
static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Re-check under the lock: the unlocked read may be stale. */
			match = in_pcblookup_wild_match(inp, laddr, lport);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}

/*
 * Wildcard lookup under the hash lock.  Walks the whole hash chain and
 * returns the best candidate according to the order documented below, or
 * NULL.  The returned inpcb is neither locked nor referenced.
 */
static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 *      1. jailed, non-wild.
	 *      2. jailed, wild.
	 *      3. non-jailed, non-wild.
	 *      4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			/* laddr must be one of the jail's addresses. */
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			/*
			 * Once a non-jailed exact match is known, further
			 * non-jailed candidates cannot improve on it.
			 */
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			/* Jailed exact match is the best possible result. */
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/*
			 * XXX inp locking, NULL check
			 *
			 * A wildcard match on an IPv6-protocol inpcb is
			 * recorded here whether or not it is jailed, and is
			 * returned only as the very last resort.
			 */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
				if (injail)
					jail_wild = inp;
				else
					local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
2320 */ 2321 static struct inpcb * 2322 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2323 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2324 uint8_t numa_domain) 2325 { 2326 struct inpcb *inp; 2327 const u_short fport = fport_arg, lport = lport_arg; 2328 2329 KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0, 2330 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2331 KASSERT(faddr.s_addr != INADDR_ANY, 2332 ("%s: invalid foreign address", __func__)); 2333 KASSERT(laddr.s_addr != INADDR_ANY, 2334 ("%s: invalid local address", __func__)); 2335 INP_HASH_WLOCK_ASSERT(pcbinfo); 2336 2337 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2338 if (inp != NULL) 2339 return (inp); 2340 2341 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2342 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2343 &laddr, lport, numa_domain); 2344 if (inp == NULL) { 2345 inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr, 2346 lport); 2347 } 2348 } 2349 2350 return (inp); 2351 } 2352 2353 static struct inpcb * 2354 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2355 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2356 uint8_t numa_domain) 2357 { 2358 struct inpcb *inp; 2359 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2360 2361 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2362 ("%s: LOCKPCB not set", __func__)); 2363 2364 INP_HASH_WLOCK(pcbinfo); 2365 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2366 lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain); 2367 if (inp != NULL && !inp_trylock(inp, lockflags)) { 2368 in_pcbref(inp); 2369 INP_HASH_WUNLOCK(pcbinfo); 2370 inp_lock(inp, lockflags); 2371 if (in_pcbrele(inp, lockflags)) 2372 /* XXX-MJ or retry until we get a negative match? 
*/ 2373 inp = NULL; 2374 } else { 2375 INP_HASH_WUNLOCK(pcbinfo); 2376 } 2377 return (inp); 2378 } 2379 2380 static struct inpcb * 2381 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2382 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2383 uint8_t numa_domain) 2384 { 2385 struct inpcb *inp; 2386 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2387 const u_short fport = fport_arg, lport = lport_arg; 2388 2389 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2390 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2391 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2392 ("%s: LOCKPCB not set", __func__)); 2393 2394 smr_enter(pcbinfo->ipi_smr); 2395 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2396 if (inp != NULL) { 2397 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2398 /* 2399 * Revalidate the 4-tuple, the socket could have been 2400 * disconnected. 2401 */ 2402 if (__predict_true(in_pcblookup_exact_match(inp, 2403 faddr, fport, laddr, lport))) 2404 return (inp); 2405 inp_unlock(inp, lockflags); 2406 } 2407 2408 /* 2409 * We failed to lock the inpcb, or its connection state changed 2410 * out from under us. Fall back to a precise search. 
2411 */ 2412 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2413 lookupflags, numa_domain)); 2414 } 2415 2416 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2417 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2418 &laddr, lport, numa_domain); 2419 if (inp != NULL) { 2420 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2421 if (__predict_true(in_pcblookup_wild_match(inp, 2422 laddr, lport) != INPLOOKUP_MATCH_NONE)) 2423 return (inp); 2424 inp_unlock(inp, lockflags); 2425 } 2426 inp = INP_LOOKUP_AGAIN; 2427 } else { 2428 inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport, 2429 lockflags); 2430 } 2431 if (inp == INP_LOOKUP_AGAIN) { 2432 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, 2433 lport, lookupflags, numa_domain)); 2434 } 2435 } 2436 2437 if (inp == NULL) 2438 smr_exit(pcbinfo->ipi_smr); 2439 2440 return (inp); 2441 } 2442 2443 /* 2444 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2445 * from which a pre-calculated hash value may be extracted. 
2446 */ 2447 struct inpcb * 2448 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2449 struct in_addr laddr, u_int lport, int lookupflags, 2450 struct ifnet *ifp __unused) 2451 { 2452 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2453 lookupflags, M_NODOM)); 2454 } 2455 2456 struct inpcb * 2457 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2458 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2459 struct ifnet *ifp __unused, struct mbuf *m) 2460 { 2461 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2462 lookupflags, m->m_pkthdr.numa_domain)); 2463 } 2464 #endif /* INET */ 2465 2466 static bool 2467 in_pcbjailed(const struct inpcb *inp, unsigned int flag) 2468 { 2469 return (prison_flag(inp->inp_cred, flag) != 0); 2470 } 2471 2472 /* 2473 * Insert the PCB into a hash chain using ordering rules which ensure that 2474 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. 2475 * 2476 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs 2477 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs 2478 * always appear last no matter whether they are jailed. 
2479 */ 2480 static void 2481 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2482 { 2483 struct inpcb *last; 2484 bool bound, injail; 2485 2486 INP_LOCK_ASSERT(inp); 2487 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2488 2489 last = NULL; 2490 bound = inp->inp_laddr.s_addr != INADDR_ANY; 2491 if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { 2492 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2493 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2494 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2495 return; 2496 } 2497 } 2498 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2499 return; 2500 } 2501 2502 injail = in_pcbjailed(inp, PR_IP4); 2503 if (!injail) { 2504 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2505 if (!in_pcbjailed(last, PR_IP4)) 2506 break; 2507 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2508 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2509 return; 2510 } 2511 } 2512 } else if (!CK_LIST_EMPTY(pcbhash) && 2513 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { 2514 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2515 return; 2516 } 2517 if (!bound) { 2518 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2519 if (last->inp_laddr.s_addr == INADDR_ANY) 2520 break; 2521 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2522 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2523 return; 2524 } 2525 } 2526 } 2527 if (last == NULL) 2528 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2529 else 2530 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2531 } 2532 2533 #ifdef INET6 2534 /* 2535 * See the comment above _in_pcbinshash_wild(). 
2536 */ 2537 static void 2538 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2539 { 2540 struct inpcb *last; 2541 bool bound, injail; 2542 2543 INP_LOCK_ASSERT(inp); 2544 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2545 2546 last = NULL; 2547 bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); 2548 injail = in_pcbjailed(inp, PR_IP6); 2549 if (!injail) { 2550 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2551 if (!in_pcbjailed(last, PR_IP6)) 2552 break; 2553 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2554 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2555 return; 2556 } 2557 } 2558 } else if (!CK_LIST_EMPTY(pcbhash) && 2559 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { 2560 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2561 return; 2562 } 2563 if (!bound) { 2564 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2565 if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) 2566 break; 2567 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2568 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2569 return; 2570 } 2571 } 2572 } 2573 if (last == NULL) 2574 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2575 else 2576 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2577 } 2578 #endif 2579 2580 /* 2581 * Insert PCB onto various hash lists. 
2582 */ 2583 int 2584 in_pcbinshash(struct inpcb *inp) 2585 { 2586 struct inpcbhead *pcbhash; 2587 struct inpcbporthead *pcbporthash; 2588 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2589 struct inpcbport *phd; 2590 uint32_t hash; 2591 bool connected; 2592 2593 INP_WLOCK_ASSERT(inp); 2594 INP_HASH_WLOCK_ASSERT(pcbinfo); 2595 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2596 ("in_pcbinshash: INP_INHASHLIST")); 2597 2598 #ifdef INET6 2599 if (inp->inp_vflag & INP_IPV6) { 2600 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2601 inp->inp_fport, pcbinfo->ipi_hashmask); 2602 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2603 } else 2604 #endif 2605 { 2606 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2607 inp->inp_fport, pcbinfo->ipi_hashmask); 2608 connected = !in_nullhost(inp->inp_faddr); 2609 } 2610 2611 if (connected) 2612 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2613 else 2614 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2615 2616 pcbporthash = &pcbinfo->ipi_porthashbase[ 2617 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2618 2619 /* 2620 * Add entry to load balance group. 2621 * Only do this if SO_REUSEPORT_LB is set. 2622 */ 2623 if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2624 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2625 if (error != 0) 2626 return (error); 2627 } 2628 2629 /* 2630 * Go through port list and look for a head for this lport. 2631 */ 2632 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2633 if (phd->phd_port == inp->inp_lport) 2634 break; 2635 } 2636 2637 /* 2638 * If none exists, malloc one and tack it on. 
2639 */ 2640 if (phd == NULL) { 2641 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2642 if (phd == NULL) { 2643 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2644 in_pcbremlbgrouphash(inp); 2645 return (ENOMEM); 2646 } 2647 phd->phd_port = inp->inp_lport; 2648 CK_LIST_INIT(&phd->phd_pcblist); 2649 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2650 } 2651 inp->inp_phd = phd; 2652 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2653 2654 /* 2655 * The PCB may have been disconnected in the past. Before we can safely 2656 * make it visible in the hash table, we must wait for all readers which 2657 * may be traversing this PCB to finish. 2658 */ 2659 if (inp->inp_smr != SMR_SEQ_INVALID) { 2660 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2661 inp->inp_smr = SMR_SEQ_INVALID; 2662 } 2663 2664 if (connected) 2665 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2666 else { 2667 #ifdef INET6 2668 if ((inp->inp_vflag & INP_IPV6) != 0) 2669 _in6_pcbinshash_wild(pcbhash, inp); 2670 else 2671 #endif 2672 _in_pcbinshash_wild(pcbhash, inp); 2673 } 2674 inp->inp_flags |= INP_INHASHLIST; 2675 2676 return (0); 2677 } 2678 2679 void 2680 in_pcbremhash_locked(struct inpcb *inp) 2681 { 2682 struct inpcbport *phd = inp->inp_phd; 2683 2684 INP_WLOCK_ASSERT(inp); 2685 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2686 MPASS(inp->inp_flags & INP_INHASHLIST); 2687 2688 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2689 in_pcbremlbgrouphash(inp); 2690 #ifdef INET6 2691 if (inp->inp_vflag & INP_IPV6) { 2692 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2693 CK_LIST_REMOVE(inp, inp_hash_wild); 2694 else 2695 CK_LIST_REMOVE(inp, inp_hash_exact); 2696 } else 2697 #endif 2698 { 2699 if (in_nullhost(inp->inp_faddr)) 2700 CK_LIST_REMOVE(inp, inp_hash_wild); 2701 else 2702 CK_LIST_REMOVE(inp, inp_hash_exact); 2703 } 2704 CK_LIST_REMOVE(inp, inp_portlist); 2705 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2706 CK_LIST_REMOVE(phd, phd_hash); 2707 
uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2708 } 2709 inp->inp_flags &= ~INP_INHASHLIST; 2710 } 2711 2712 static void 2713 in_pcbremhash(struct inpcb *inp) 2714 { 2715 INP_HASH_WLOCK(inp->inp_pcbinfo); 2716 in_pcbremhash_locked(inp); 2717 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2718 } 2719 2720 /* 2721 * Move PCB to the proper hash bucket when { faddr, fport } have been 2722 * changed. NOTE: This does not handle the case of the lport changing (the 2723 * hashed port list would have to be updated as well), so the lport must 2724 * not change after in_pcbinshash() has been called. 2725 */ 2726 void 2727 in_pcbrehash(struct inpcb *inp) 2728 { 2729 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2730 struct inpcbhead *head; 2731 uint32_t hash; 2732 bool connected; 2733 2734 INP_WLOCK_ASSERT(inp); 2735 INP_HASH_WLOCK_ASSERT(pcbinfo); 2736 KASSERT(inp->inp_flags & INP_INHASHLIST, 2737 ("%s: !INP_INHASHLIST", __func__)); 2738 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2739 ("%s: inp was disconnected", __func__)); 2740 2741 #ifdef INET6 2742 if (inp->inp_vflag & INP_IPV6) { 2743 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2744 inp->inp_fport, pcbinfo->ipi_hashmask); 2745 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2746 } else 2747 #endif 2748 { 2749 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2750 inp->inp_fport, pcbinfo->ipi_hashmask); 2751 connected = !in_nullhost(inp->inp_faddr); 2752 } 2753 2754 /* 2755 * When rehashing, the caller must ensure that either the new or the old 2756 * foreign address was unspecified. 
2757 */ 2758 if (connected) 2759 CK_LIST_REMOVE(inp, inp_hash_wild); 2760 else 2761 CK_LIST_REMOVE(inp, inp_hash_exact); 2762 2763 if (connected) { 2764 head = &pcbinfo->ipi_hash_exact[hash]; 2765 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2766 } else { 2767 head = &pcbinfo->ipi_hash_wild[hash]; 2768 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2769 } 2770 } 2771 2772 /* 2773 * Check for alternatives when higher level complains 2774 * about service problems. For now, invalidate cached 2775 * routing information. If the route was created dynamically 2776 * (by a redirect), time to try a default gateway again. 2777 */ 2778 void 2779 in_losing(struct inpcb *inp) 2780 { 2781 2782 RO_INVALIDATE_CACHE(&inp->inp_route); 2783 return; 2784 } 2785 2786 /* 2787 * A set label operation has occurred at the socket layer, propagate the 2788 * label change into the in_pcb for the socket. 2789 */ 2790 void 2791 in_pcbsosetlabel(struct socket *so) 2792 { 2793 #ifdef MAC 2794 struct inpcb *inp; 2795 2796 inp = sotoinpcb(so); 2797 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2798 2799 INP_WLOCK(inp); 2800 SOCK_LOCK(so); 2801 mac_inpcb_sosetlabel(so, inp); 2802 SOCK_UNLOCK(so); 2803 INP_WUNLOCK(inp); 2804 #endif 2805 } 2806 2807 void 2808 inp_wlock(struct inpcb *inp) 2809 { 2810 2811 INP_WLOCK(inp); 2812 } 2813 2814 void 2815 inp_wunlock(struct inpcb *inp) 2816 { 2817 2818 INP_WUNLOCK(inp); 2819 } 2820 2821 void 2822 inp_rlock(struct inpcb *inp) 2823 { 2824 2825 INP_RLOCK(inp); 2826 } 2827 2828 void 2829 inp_runlock(struct inpcb *inp) 2830 { 2831 2832 INP_RUNLOCK(inp); 2833 } 2834 2835 #ifdef INVARIANT_SUPPORT 2836 void 2837 inp_lock_assert(struct inpcb *inp) 2838 { 2839 2840 INP_WLOCK_ASSERT(inp); 2841 } 2842 2843 void 2844 inp_unlock_assert(struct inpcb *inp) 2845 { 2846 2847 INP_UNLOCK_ASSERT(inp); 2848 } 2849 #endif 2850 2851 void 2852 inp_apply_all(struct inpcbinfo *pcbinfo, 2853 void (*func)(struct inpcb *, void *), void *arg) 2854 { 2855 struct 
inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2856 INPLOOKUP_WLOCKPCB); 2857 struct inpcb *inp; 2858 2859 while ((inp = inp_next(&inpi)) != NULL) 2860 func(inp, arg); 2861 } 2862 2863 struct socket * 2864 inp_inpcbtosocket(struct inpcb *inp) 2865 { 2866 2867 INP_WLOCK_ASSERT(inp); 2868 return (inp->inp_socket); 2869 } 2870 2871 void 2872 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2873 uint32_t *faddr, uint16_t *fp) 2874 { 2875 2876 INP_LOCK_ASSERT(inp); 2877 *laddr = inp->inp_laddr.s_addr; 2878 *faddr = inp->inp_faddr.s_addr; 2879 *lp = inp->inp_lport; 2880 *fp = inp->inp_fport; 2881 } 2882 2883 /* 2884 * Create an external-format (``xinpcb'') structure using the information in 2885 * the kernel-format in_pcb structure pointed to by inp. This is done to 2886 * reduce the spew of irrelevant information over this interface, to isolate 2887 * user code from changes in the kernel structure, and potentially to provide 2888 * information-hiding if we decide that some of this information should be 2889 * hidden from users. 
2890 */ 2891 void 2892 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2893 { 2894 2895 bzero(xi, sizeof(*xi)); 2896 xi->xi_len = sizeof(struct xinpcb); 2897 if (inp->inp_socket) 2898 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2899 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2900 xi->inp_gencnt = inp->inp_gencnt; 2901 xi->inp_flow = inp->inp_flow; 2902 xi->inp_flowid = inp->inp_flowid; 2903 xi->inp_flowtype = inp->inp_flowtype; 2904 xi->inp_flags = inp->inp_flags; 2905 xi->inp_flags2 = inp->inp_flags2; 2906 xi->in6p_cksum = inp->in6p_cksum; 2907 xi->in6p_hops = inp->in6p_hops; 2908 xi->inp_ip_tos = inp->inp_ip_tos; 2909 xi->inp_vflag = inp->inp_vflag; 2910 xi->inp_ip_ttl = inp->inp_ip_ttl; 2911 xi->inp_ip_p = inp->inp_ip_p; 2912 xi->inp_ip_minttl = inp->inp_ip_minttl; 2913 } 2914 2915 int 2916 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2917 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2918 { 2919 struct sockopt sopt; 2920 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2921 INPLOOKUP_WLOCKPCB); 2922 struct inpcb *inp; 2923 struct sockopt_parameters *params; 2924 struct socket *so; 2925 int error; 2926 char buf[1024]; 2927 2928 if (req->oldptr != NULL || req->oldlen != 0) 2929 return (EINVAL); 2930 if (req->newptr == NULL) 2931 return (EPERM); 2932 if (req->newlen > sizeof(buf)) 2933 return (ENOMEM); 2934 error = SYSCTL_IN(req, buf, req->newlen); 2935 if (error != 0) 2936 return (error); 2937 if (req->newlen < sizeof(struct sockopt_parameters)) 2938 return (EINVAL); 2939 params = (struct sockopt_parameters *)buf; 2940 sopt.sopt_level = params->sop_level; 2941 sopt.sopt_name = params->sop_optname; 2942 sopt.sopt_dir = SOPT_SET; 2943 sopt.sopt_val = params->sop_optval; 2944 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2945 sopt.sopt_td = NULL; 2946 #ifdef INET6 2947 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2948 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 
2949 params->sop_inc.inc6_laddr.s6_addr16[1] = 2950 htons(params->sop_inc.inc6_zoneid & 0xffff); 2951 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2952 params->sop_inc.inc6_faddr.s6_addr16[1] = 2953 htons(params->sop_inc.inc6_zoneid & 0xffff); 2954 } 2955 #endif 2956 if (params->sop_inc.inc_lport != htons(0) && 2957 params->sop_inc.inc_fport != htons(0)) { 2958 #ifdef INET6 2959 if (params->sop_inc.inc_flags & INC_ISIPV6) 2960 inpi.hash = INP6_PCBHASH( 2961 ¶ms->sop_inc.inc6_faddr, 2962 params->sop_inc.inc_lport, 2963 params->sop_inc.inc_fport, 2964 pcbinfo->ipi_hashmask); 2965 else 2966 #endif 2967 inpi.hash = INP_PCBHASH( 2968 ¶ms->sop_inc.inc_faddr, 2969 params->sop_inc.inc_lport, 2970 params->sop_inc.inc_fport, 2971 pcbinfo->ipi_hashmask); 2972 } 2973 while ((inp = inp_next(&inpi)) != NULL) 2974 if (inp->inp_gencnt == params->sop_id) { 2975 if (inp->inp_flags & INP_DROPPED) { 2976 INP_WUNLOCK(inp); 2977 return (ECONNRESET); 2978 } 2979 so = inp->inp_socket; 2980 KASSERT(so != NULL, ("inp_socket == NULL")); 2981 soref(so); 2982 if (params->sop_level == SOL_SOCKET) { 2983 INP_WUNLOCK(inp); 2984 error = sosetopt(so, &sopt); 2985 } else 2986 error = (*ctloutput_set)(inp, &sopt); 2987 sorele(so); 2988 break; 2989 } 2990 if (inp == NULL) 2991 error = ESRCH; 2992 return (error); 2993 } 2994 2995 #ifdef DDB 2996 static void 2997 db_print_indent(int indent) 2998 { 2999 int i; 3000 3001 for (i = 0; i < indent; i++) 3002 db_printf(" "); 3003 } 3004 3005 static void 3006 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 3007 { 3008 char faddr_str[48], laddr_str[48]; 3009 3010 db_print_indent(indent); 3011 db_printf("%s at %p\n", name, inc); 3012 3013 indent += 2; 3014 3015 #ifdef INET6 3016 if (inc->inc_flags & INC_ISIPV6) { 3017 /* IPv6. */ 3018 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3019 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3020 } else 3021 #endif 3022 { 3023 /* IPv4. 
*/ 3024 inet_ntoa_r(inc->inc_laddr, laddr_str); 3025 inet_ntoa_r(inc->inc_faddr, faddr_str); 3026 } 3027 db_print_indent(indent); 3028 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3029 ntohs(inc->inc_lport)); 3030 db_print_indent(indent); 3031 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3032 ntohs(inc->inc_fport)); 3033 } 3034 3035 static void 3036 db_print_inpflags(int inp_flags) 3037 { 3038 int comma; 3039 3040 comma = 0; 3041 if (inp_flags & INP_RECVOPTS) { 3042 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 3043 comma = 1; 3044 } 3045 if (inp_flags & INP_RECVRETOPTS) { 3046 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 3047 comma = 1; 3048 } 3049 if (inp_flags & INP_RECVDSTADDR) { 3050 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 3051 comma = 1; 3052 } 3053 if (inp_flags & INP_ORIGDSTADDR) { 3054 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 3055 comma = 1; 3056 } 3057 if (inp_flags & INP_HDRINCL) { 3058 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 3059 comma = 1; 3060 } 3061 if (inp_flags & INP_HIGHPORT) { 3062 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 3063 comma = 1; 3064 } 3065 if (inp_flags & INP_LOWPORT) { 3066 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 3067 comma = 1; 3068 } 3069 if (inp_flags & INP_ANONPORT) { 3070 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 3071 comma = 1; 3072 } 3073 if (inp_flags & INP_RECVIF) { 3074 db_printf("%sINP_RECVIF", comma ? ", " : ""); 3075 comma = 1; 3076 } 3077 if (inp_flags & INP_MTUDISC) { 3078 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 3079 comma = 1; 3080 } 3081 if (inp_flags & INP_RECVTTL) { 3082 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 3083 comma = 1; 3084 } 3085 if (inp_flags & INP_DONTFRAG) { 3086 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 3087 comma = 1; 3088 } 3089 if (inp_flags & INP_RECVTOS) { 3090 db_printf("%sINP_RECVTOS", comma ? 
", " : ""); 3091 comma = 1; 3092 } 3093 if (inp_flags & IN6P_IPV6_V6ONLY) { 3094 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3095 comma = 1; 3096 } 3097 if (inp_flags & IN6P_PKTINFO) { 3098 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3099 comma = 1; 3100 } 3101 if (inp_flags & IN6P_HOPLIMIT) { 3102 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3103 comma = 1; 3104 } 3105 if (inp_flags & IN6P_HOPOPTS) { 3106 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3107 comma = 1; 3108 } 3109 if (inp_flags & IN6P_DSTOPTS) { 3110 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3111 comma = 1; 3112 } 3113 if (inp_flags & IN6P_RTHDR) { 3114 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3115 comma = 1; 3116 } 3117 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3118 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3119 comma = 1; 3120 } 3121 if (inp_flags & IN6P_TCLASS) { 3122 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3123 comma = 1; 3124 } 3125 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3126 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3127 comma = 1; 3128 } 3129 if (inp_flags & INP_ONESBCAST) { 3130 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3131 comma = 1; 3132 } 3133 if (inp_flags & INP_DROPPED) { 3134 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3135 comma = 1; 3136 } 3137 if (inp_flags & INP_SOCKREF) { 3138 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3139 comma = 1; 3140 } 3141 if (inp_flags & IN6P_RFC2292) { 3142 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3143 comma = 1; 3144 } 3145 if (inp_flags & IN6P_MTU) { 3146 db_printf("IN6P_MTU%s", comma ? ", " : ""); 3147 comma = 1; 3148 } 3149 } 3150 3151 static void 3152 db_print_inpvflag(u_char inp_vflag) 3153 { 3154 int comma; 3155 3156 comma = 0; 3157 if (inp_vflag & INP_IPV4) { 3158 db_printf("%sINP_IPV4", comma ? ", " : ""); 3159 comma = 1; 3160 } 3161 if (inp_vflag & INP_IPV6) { 3162 db_printf("%sINP_IPV6", comma ? 
", " : ""); 3163 comma = 1; 3164 } 3165 if (inp_vflag & INP_IPV6PROTO) { 3166 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3167 comma = 1; 3168 } 3169 } 3170 3171 static void 3172 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3173 { 3174 3175 db_print_indent(indent); 3176 db_printf("%s at %p\n", name, inp); 3177 3178 indent += 2; 3179 3180 db_print_indent(indent); 3181 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3182 3183 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3184 3185 db_print_indent(indent); 3186 db_printf("inp_label: %p inp_flags: 0x%x (", 3187 inp->inp_label, inp->inp_flags); 3188 db_print_inpflags(inp->inp_flags); 3189 db_printf(")\n"); 3190 3191 db_print_indent(indent); 3192 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3193 inp->inp_vflag); 3194 db_print_inpvflag(inp->inp_vflag); 3195 db_printf(")\n"); 3196 3197 db_print_indent(indent); 3198 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3199 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3200 3201 db_print_indent(indent); 3202 #ifdef INET6 3203 if (inp->inp_vflag & INP_IPV6) { 3204 db_printf("in6p_options: %p in6p_outputopts: %p " 3205 "in6p_moptions: %p\n", inp->in6p_options, 3206 inp->in6p_outputopts, inp->in6p_moptions); 3207 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3208 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3209 inp->in6p_hops); 3210 } else 3211 #endif 3212 { 3213 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3214 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3215 inp->inp_options, inp->inp_moptions); 3216 } 3217 3218 db_print_indent(indent); 3219 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3220 (uintmax_t)inp->inp_gencnt); 3221 } 3222 3223 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3224 { 3225 struct inpcb *inp; 3226 3227 if (!have_addr) { 3228 db_printf("usage: show inpcb <addr>\n"); 3229 return; 3230 } 3231 inp = (struct inpcb *)addr; 3232 3233 db_print_inpcb(inp, "inpcb", 0); 3234 } 3235 #endif /* 
DDB */

#ifdef RATELIMIT
/*
 * Modify TX rate limit based on the existing "inp->inp_snd_tag",
 * if any.
 *
 * Returns EINVAL when the inpcb has no send tag, EOPNOTSUPP when the
 * send tag does not provide a snd_tag_modify method, and otherwise
 * the value returned by that method.
 */
int
in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate)
{
	union if_snd_tag_modify_params params = {
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_modify == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = mst->sw->snd_tag_modify(mst, &params);
	}
	return (error);
}

/*
 * Query existing TX rate limit based on the existing
 * "inp->inp_snd_tag", if any.
 *
 * On success the queried maximum rate is stored through
 * "p_max_pacing_rate", unless that pointer is NULL.  Returns EINVAL
 * when the inpcb has no send tag, EOPNOTSUPP when the send tag does
 * not provide a snd_tag_query method, and otherwise the value
 * returned by that method.
 */
int
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = mst->sw->snd_tag_query(mst, &params);
		if (error == 0 && p_max_pacing_rate != NULL)
			*p_max_pacing_rate = params.rate_limit.max_rate;
	}
	return (error);
}

/*
 * Query existing TX queue level based on the existing
 * "inp->inp_snd_tag", if any.
 *
 * On success the queried queue level is stored through
 * "p_txqueue_level", unless that pointer is NULL.  Error returns are
 * the same as for in_pcbquery_txrtlmt().
 */
int
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = mst->sw->snd_tag_query(mst, &params);
	if (error == 0 && p_txqueue_level != NULL)
		*p_txqueue_level = params.rate_limit.queue_level;
	return (error);
}

/*
 * Allocate a new TX rate limit send tag from the network interface
 * given by the "ifp" argument and save it in "*st" (callers in this
 * file pass "&inp->inp_snd_tag").
 *
 * A "max_pacing_rate" of -1U requests an unlimited send tag, any
 * other value a rate limited one.  The inpcb must be write locked.
 * Returns EINVAL if "*st" already holds a tag or the inpcb has been
 * dropped, otherwise the result of m_snd_tag_alloc().
 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate,
    struct m_snd_tag **st)
{
	union if_snd_tag_alloc_params params = {
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed. Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	/* EOPNOTSUPP is not accounted as an allocation failure. */
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

/*
 * Drop a reference on the send tag "mst" and, when INET is compiled
 * in, decrement the count of active rate limit tags.
 */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}

/*
 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
 * if any.  The inpcb must be write locked.
 */
void
in_pcbdetach_txrtlmt(struct inpcb *inp)
{
	struct m_snd_tag *mst;

	INP_WLOCK_ASSERT(inp);

	/* Clear the inpcb's pointer before releasing the reference. */
	mst = inp->inp_snd_tag;
	inp->inp_snd_tag = NULL;

	if (mst == NULL)
		return;

	in_pcbdetach_tag(mst);
}

/*
 * Bring the send tag of "inp" in line with "max_pacing_rate" for
 * transmission of "mb" on "ifp": detach a stale tag, attach a new
 * one or modify the existing one as needed.  The functions called
 * to attach and detach tags assert that the inpcb is write locked.
 *
 * INP_RATE_LIMIT_CHANGED is cleared when the result is 0 or
 * EOPNOTSUPP.  Returns EAGAIN when a tag has to be attached but the
 * mbuf carries no hash type yet.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp,
    struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone. The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		/* No pacing requested and nothing attached. */
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		/* TX rate limiting is not enabled on this interface. */
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp,
			    M_HASHTYPE_GET(mb), mb->m_pkthdr.flowid,
			    max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}

/*
 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
 * is set in the fast path and will attach/detach/modify the TX rate
 * limit send tag based on the socket's so_max_pacing_rate value.
 *
 * Nothing is done when "inp" is NULL, when it has no socket, or when
 * the inpcb is not write locked and the lock upgrade fails.
 */
void
in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb)
{
	struct socket *socket;
	uint32_t max_pacing_rate;
	bool did_upgrade;

	if (inp == NULL)
		return;

	socket = inp->inp_socket;
	if (socket == NULL)
		return;

	if (!INP_WLOCKED(inp)) {
		/*
		 * NOTE: If the write locking fails, we need to bail
		 * out and use the non-ratelimited ring for the
		 * transmit until there is a new chance to get the
		 * write lock.
		 */
		if (!INP_TRY_UPGRADE(inp))
			return;
		did_upgrade = true;
	} else {
		did_upgrade = false;
	}

	/*
	 * NOTE: The so_max_pacing_rate value is read unlocked,
	 * because atomic updates are not required since the variable
	 * is checked at every mbuf we send. It is assumed that the
	 * variable read itself will be atomic.
	 */
	max_pacing_rate = socket->so_max_pacing_rate;

	/*
	 * The result is deliberately ignored; the callee only clears
	 * INP_RATE_LIMIT_CHANGED when it returns 0 or EOPNOTSUPP.
	 */
	(void)in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate);

	/* Only undo a lock upgrade that was performed here. */
	if (did_upgrade)
		INP_DOWNGRADE(inp);
}

/*
 * Track route changes for TX rate limiting.
 *
 * Detaches the current send tag, if any, and sets
 * INP_RATE_LIMIT_CHANGED.  Nothing is done when "inp" is NULL, when
 * it has no send tag, or when the inpcb is not write locked and the
 * lock upgrade fails.
 */
void
in_pcboutput_eagain(struct inpcb *inp)
{
	bool did_upgrade;

	if (inp == NULL)
		return;

	if (inp->inp_snd_tag == NULL)
		return;

	if (!INP_WLOCKED(inp)) {
		/*
		 * NOTE: If the write locking fails, we need to bail
		 * out and use the non-ratelimited ring for the
		 * transmit until there is a new chance to get the
		 * write lock.
		 */
		if (!INP_TRY_UPGRADE(inp))
			return;
		did_upgrade = true;
	} else {
		did_upgrade = false;
	}

	/* detach rate limiting */
	in_pcbdetach_txrtlmt(inp);

	/* make sure new mbuf send tag allocation is made */
	inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;

	/* Only undo a lock upgrade that was performed here. */
	if (did_upgrade)
		INP_DOWNGRADE(inp);
}

#ifdef INET
/*
 * Allocate the rate limit statistics counters.  Registered below via
 * SYSINIT; the "st" argument is not used.
 */
static void
rl_init(void *st)
{
	rate_limit_new = counter_u64_alloc(M_WAITOK);
	rate_limit_chg = counter_u64_alloc(M_WAITOK);
	rate_limit_active = counter_u64_alloc(M_WAITOK);
	rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK);
	rate_limit_set_ok = counter_u64_alloc(M_WAITOK);
}

SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL);
#endif /* INET */
#endif /* RATELIMIT */