1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/hash.h> 53 #include <sys/systm.h> 54 #include <sys/libkern.h> 55 #include <sys/lock.h> 56 #include <sys/malloc.h> 57 #include <sys/mbuf.h> 58 #include <sys/eventhandler.h> 59 #include <sys/domain.h> 60 #include <sys/protosw.h> 61 #include <sys/smp.h> 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <sys/sockio.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/refcount.h> 68 #include <sys/jail.h> 69 #include <sys/kernel.h> 70 #include <sys/sysctl.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 #include <vm/vm.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #include <netinet/tcp.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #ifdef INET6 98 #include <netinet/ip6.h> 99 #include <netinet6/in6_pcb.h> 100 #include <netinet6/in6_var.h> 101 #include <netinet6/ip6_var.h> 102 #endif /* INET6 */ 103 #include <net/route/nhop.h> 104 #endif 105 106 #include <netipsec/ipsec_support.h> 107 108 #include <security/mac/mac_framework.h> 109 110 #define INPCBLBGROUP_SIZMIN 8 111 #define INPCBLBGROUP_SIZMAX 256 112 #define INP_FREED 0x00000200 /* See in_pcb.h. */ 113 114 /* 115 * These configure the range of local port addresses assigned to 116 * "unspecified" outgoing connections/packets/whatever. 
117 */ 118 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 119 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 120 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 121 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 122 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 123 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 124 125 /* 126 * Reserved ports accessible only to root. There are significant 127 * security considerations that must be accounted for when changing these, 128 * but the security benefits can be great. Please be careful. 129 */ 130 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 131 VNET_DEFINE(int, ipport_reservedlow); 132 133 /* Enable random ephemeral port allocation by default. */ 134 VNET_DEFINE(int, ipport_randomized) = 1; 135 136 #ifdef INET 137 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 138 struct in_addr faddr, u_int fport_arg, 139 struct in_addr laddr, u_int lport_arg, 140 int lookupflags, struct ifnet *ifp, 141 uint8_t numa_domain); 142 143 #define RANGECHK(var, min, max) \ 144 if ((var) < (min)) { (var) = (min); } \ 145 else if ((var) > (max)) { (var) = (max); } 146 147 static int 148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 149 { 150 int error; 151 152 error = sysctl_handle_int(oidp, arg1, arg2, req); 153 if (error == 0) { 154 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 155 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 157 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 160 } 161 return (error); 162 } 163 164 #undef RANGECHK 165 166 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 167 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 168 
"IP Ports"); 169 170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 171 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 172 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 173 ""); 174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 175 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 176 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 177 ""); 178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 179 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 180 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 181 ""); 182 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 183 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 184 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 185 ""); 186 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 187 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 188 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 189 ""); 190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 191 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 192 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 193 ""); 194 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 195 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 196 &VNET_NAME(ipport_reservedhigh), 0, ""); 197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 198 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 200 CTLFLAG_VNET | CTLFLAG_RW, 201 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 202 203 #ifdef RATELIMIT 204 counter_u64_t rate_limit_new; 205 counter_u64_t rate_limit_chg; 206 counter_u64_t rate_limit_active; 207 counter_u64_t rate_limit_alloc_fail; 208 counter_u64_t rate_limit_set_ok; 209 210 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 211 "IP Rate Limiting"); 
212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 213 &rate_limit_active, "Active rate limited connections"); 214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 215 &rate_limit_alloc_fail, "Rate limited connection failures"); 216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 217 &rate_limit_set_ok, "Rate limited setting succeeded"); 218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 219 &rate_limit_new, "Total Rate limit new attempts"); 220 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 221 &rate_limit_chg, "Total Rate limited change attempts"); 222 223 #endif /* RATELIMIT */ 224 225 #endif /* INET */ 226 227 VNET_DEFINE(uint32_t, in_pcbhashseed); 228 static void 229 in_pcbhashseed_init(void) 230 { 231 232 V_in_pcbhashseed = arc4random(); 233 } 234 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 235 in_pcbhashseed_init, 0); 236 237 static void in_pcbremhash(struct inpcb *); 238 239 /* 240 * in_pcb.c: manage the Protocol Control Blocks. 241 * 242 * NOTE: It is assumed that most of these functions will be called with 243 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 244 * functions often modify hash chains or addresses in pcbs. 
245 */ 246 247 static struct inpcblbgroup * 248 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, 249 u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, 250 uint8_t numa_domain) 251 { 252 struct inpcblbgroup *grp; 253 size_t bytes; 254 255 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 256 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 257 if (grp == NULL) 258 return (NULL); 259 grp->il_cred = crhold(cred); 260 grp->il_vflag = vflag; 261 grp->il_lport = port; 262 grp->il_numa_domain = numa_domain; 263 grp->il_dependladdr = *addr; 264 grp->il_inpsiz = size; 265 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 266 return (grp); 267 } 268 269 static void 270 in_pcblbgroup_free_deferred(epoch_context_t ctx) 271 { 272 struct inpcblbgroup *grp; 273 274 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 275 crfree(grp->il_cred); 276 free(grp, M_PCB); 277 } 278 279 static void 280 in_pcblbgroup_free(struct inpcblbgroup *grp) 281 { 282 283 CK_LIST_REMOVE(grp, il_list); 284 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 285 } 286 287 static struct inpcblbgroup * 288 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 289 struct inpcblbgroup *old_grp, int size) 290 { 291 struct inpcblbgroup *grp; 292 int i; 293 294 grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, 295 old_grp->il_lport, &old_grp->il_dependladdr, size, 296 old_grp->il_numa_domain); 297 if (grp == NULL) 298 return (NULL); 299 300 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 301 ("invalid new local group size %d and old local group count %d", 302 grp->il_inpsiz, old_grp->il_inpcnt)); 303 304 for (i = 0; i < old_grp->il_inpcnt; ++i) 305 grp->il_inp[i] = old_grp->il_inp[i]; 306 grp->il_inpcnt = old_grp->il_inpcnt; 307 in_pcblbgroup_free(old_grp); 308 return (grp); 309 } 310 311 /* 312 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 313 * and shrink group if possible. 
314 */ 315 static void 316 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 317 int i) 318 { 319 struct inpcblbgroup *grp, *new_grp; 320 321 grp = *grpp; 322 for (; i + 1 < grp->il_inpcnt; ++i) 323 grp->il_inp[i] = grp->il_inp[i + 1]; 324 grp->il_inpcnt--; 325 326 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 327 grp->il_inpcnt <= grp->il_inpsiz / 4) { 328 /* Shrink this group. */ 329 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 330 if (new_grp != NULL) 331 *grpp = new_grp; 332 } 333 } 334 335 /* 336 * Add PCB to load balance group for SO_REUSEPORT_LB option. 337 */ 338 static int 339 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 340 { 341 const static struct timeval interval = { 60, 0 }; 342 static struct timeval lastprint; 343 struct inpcbinfo *pcbinfo; 344 struct inpcblbgrouphead *hdr; 345 struct inpcblbgroup *grp; 346 uint32_t idx; 347 348 pcbinfo = inp->inp_pcbinfo; 349 350 INP_WLOCK_ASSERT(inp); 351 INP_HASH_WLOCK_ASSERT(pcbinfo); 352 353 #ifdef INET6 354 /* 355 * Don't allow IPv4 mapped INET6 wild socket. 356 */ 357 if ((inp->inp_vflag & INP_IPV4) && 358 inp->inp_laddr.s_addr == INADDR_ANY && 359 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 360 return (0); 361 } 362 #endif 363 364 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 365 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 366 CK_LIST_FOREACH(grp, hdr, il_list) { 367 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 368 grp->il_vflag == inp->inp_vflag && 369 grp->il_lport == inp->inp_lport && 370 grp->il_numa_domain == numa_domain && 371 memcmp(&grp->il_dependladdr, 372 &inp->inp_inc.inc_ie.ie_dependladdr, 373 sizeof(grp->il_dependladdr)) == 0) { 374 break; 375 } 376 } 377 if (grp == NULL) { 378 /* Create new load balance group. 
*/ 379 grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, 380 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 381 INPCBLBGROUP_SIZMIN, numa_domain); 382 if (grp == NULL) 383 return (ENOBUFS); 384 } else if (grp->il_inpcnt == grp->il_inpsiz) { 385 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 386 if (ratecheck(&lastprint, &interval)) 387 printf("lb group port %d, limit reached\n", 388 ntohs(grp->il_lport)); 389 return (0); 390 } 391 392 /* Expand this local group. */ 393 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 394 if (grp == NULL) 395 return (ENOBUFS); 396 } 397 398 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 399 ("invalid local group size %d and count %d", grp->il_inpsiz, 400 grp->il_inpcnt)); 401 402 grp->il_inp[grp->il_inpcnt] = inp; 403 grp->il_inpcnt++; 404 return (0); 405 } 406 407 /* 408 * Remove PCB from load balance group. 409 */ 410 static void 411 in_pcbremlbgrouphash(struct inpcb *inp) 412 { 413 struct inpcbinfo *pcbinfo; 414 struct inpcblbgrouphead *hdr; 415 struct inpcblbgroup *grp; 416 int i; 417 418 pcbinfo = inp->inp_pcbinfo; 419 420 INP_WLOCK_ASSERT(inp); 421 INP_HASH_WLOCK_ASSERT(pcbinfo); 422 423 hdr = &pcbinfo->ipi_lbgrouphashbase[ 424 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 425 CK_LIST_FOREACH(grp, hdr, il_list) { 426 for (i = 0; i < grp->il_inpcnt; ++i) { 427 if (grp->il_inp[i] != inp) 428 continue; 429 430 if (grp->il_inpcnt == 1) { 431 /* We are the last, free this local group. */ 432 in_pcblbgroup_free(grp); 433 } else { 434 /* Pull up inpcbs, shrink group if possible. 
*/ 435 in_pcblbgroup_reorder(hdr, &grp, i); 436 } 437 return; 438 } 439 } 440 } 441 442 int 443 in_pcblbgroup_numa(struct inpcb *inp, int arg) 444 { 445 struct inpcbinfo *pcbinfo; 446 struct inpcblbgrouphead *hdr; 447 struct inpcblbgroup *grp; 448 int err, i; 449 uint8_t numa_domain; 450 451 switch (arg) { 452 case TCP_REUSPORT_LB_NUMA_NODOM: 453 numa_domain = M_NODOM; 454 break; 455 case TCP_REUSPORT_LB_NUMA_CURDOM: 456 numa_domain = PCPU_GET(domain); 457 break; 458 default: 459 if (arg < 0 || arg >= vm_ndomains) 460 return (EINVAL); 461 numa_domain = arg; 462 } 463 464 err = 0; 465 pcbinfo = inp->inp_pcbinfo; 466 INP_WLOCK_ASSERT(inp); 467 INP_HASH_WLOCK(pcbinfo); 468 hdr = &pcbinfo->ipi_lbgrouphashbase[ 469 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 470 CK_LIST_FOREACH(grp, hdr, il_list) { 471 for (i = 0; i < grp->il_inpcnt; ++i) { 472 if (grp->il_inp[i] != inp) 473 continue; 474 475 if (grp->il_numa_domain == numa_domain) { 476 goto abort_with_hash_wlock; 477 } 478 479 /* Remove it from the old group. */ 480 in_pcbremlbgrouphash(inp); 481 482 /* Add it to the new group based on numa domain. */ 483 in_pcbinslbgrouphash(inp, numa_domain); 484 goto abort_with_hash_wlock; 485 } 486 } 487 err = ENOENT; 488 abort_with_hash_wlock: 489 INP_HASH_WUNLOCK(pcbinfo); 490 return (err); 491 } 492 493 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 494 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 495 496 /* 497 * Initialize an inpcbinfo - a per-VNET instance of connections db. 
498 */ 499 void 500 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 501 u_int hash_nelements, u_int porthash_nelements) 502 { 503 504 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 505 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 506 NULL, MTX_DEF); 507 #ifdef VIMAGE 508 pcbinfo->ipi_vnet = curvnet; 509 #endif 510 CK_LIST_INIT(&pcbinfo->ipi_listhead); 511 pcbinfo->ipi_count = 0; 512 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 513 &pcbinfo->ipi_hashmask); 514 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 515 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 516 &pcbinfo->ipi_porthashmask); 517 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 518 &pcbinfo->ipi_lbgrouphashmask); 519 pcbinfo->ipi_zone = pcbstor->ips_zone; 520 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 521 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 522 } 523 524 /* 525 * Destroy an inpcbinfo. 526 */ 527 void 528 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 529 { 530 531 KASSERT(pcbinfo->ipi_count == 0, 532 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 533 534 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 535 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 536 pcbinfo->ipi_porthashmask); 537 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 538 pcbinfo->ipi_lbgrouphashmask); 539 mtx_destroy(&pcbinfo->ipi_hash_lock); 540 mtx_destroy(&pcbinfo->ipi_lock); 541 } 542 543 /* 544 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 
545 */ 546 static void inpcb_dtor(void *, int, void *); 547 static void inpcb_fini(void *, int); 548 void 549 in_pcbstorage_init(void *arg) 550 { 551 struct inpcbstorage *pcbstor = arg; 552 553 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 554 pcbstor->ips_size, NULL, inpcb_dtor, pcbstor->ips_pcbinit, 555 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 556 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 557 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 558 uma_zone_set_smr(pcbstor->ips_portzone, 559 uma_zone_get_smr(pcbstor->ips_zone)); 560 } 561 562 /* 563 * Destroy a pcbstorage - used by unloadable protocols. 564 */ 565 void 566 in_pcbstorage_destroy(void *arg) 567 { 568 struct inpcbstorage *pcbstor = arg; 569 570 uma_zdestroy(pcbstor->ips_zone); 571 uma_zdestroy(pcbstor->ips_portzone); 572 } 573 574 /* 575 * Allocate a PCB and associate it with the socket. 576 * On success return with the PCB locked. 577 */ 578 int 579 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 580 { 581 struct inpcb *inp; 582 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 583 int error; 584 #endif 585 586 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 587 if (inp == NULL) 588 return (ENOBUFS); 589 bzero(&inp->inp_start_zero, inp_zero_size); 590 #ifdef NUMA 591 inp->inp_numa_domain = M_NODOM; 592 #endif 593 inp->inp_pcbinfo = pcbinfo; 594 inp->inp_socket = so; 595 inp->inp_cred = crhold(so->so_cred); 596 inp->inp_inc.inc_fibnum = so->so_fibnum; 597 #ifdef MAC 598 error = mac_inpcb_init(inp, M_NOWAIT); 599 if (error != 0) 600 goto out; 601 mac_inpcb_create(so, inp); 602 #endif 603 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 604 error = ipsec_init_pcbpolicy(inp); 605 if (error != 0) { 606 #ifdef MAC 607 mac_inpcb_destroy(inp); 608 #endif 609 goto out; 610 } 611 #endif /*IPSEC*/ 612 #ifdef INET6 613 if (INP_SOCKAF(so) == AF_INET6) { 614 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 615 if (V_ip6_v6only) 616 inp->inp_flags 
|= IN6P_IPV6_V6ONLY; 617 #ifdef INET 618 else 619 inp->inp_vflag |= INP_IPV4; 620 #endif 621 if (V_ip6_auto_flowlabel) 622 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 623 inp->in6p_hops = -1; /* use kernel default */ 624 } 625 #endif 626 #if defined(INET) && defined(INET6) 627 else 628 #endif 629 #ifdef INET 630 inp->inp_vflag |= INP_IPV4; 631 #endif 632 /* 633 * Routes in inpcb's can cache L2 as well; they are guaranteed 634 * to be cleaned up. 635 */ 636 inp->inp_route.ro_flags = RT_LLE_CACHE; 637 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ 638 INP_WLOCK(inp); 639 INP_INFO_WLOCK(pcbinfo); 640 pcbinfo->ipi_count++; 641 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 642 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 643 INP_INFO_WUNLOCK(pcbinfo); 644 so->so_pcb = inp; 645 646 return (0); 647 648 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 649 out: 650 uma_zfree_smr(pcbinfo->ipi_zone, inp); 651 return (error); 652 #endif 653 } 654 655 #ifdef INET 656 int 657 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 658 { 659 int anonport, error; 660 661 KASSERT(nam == NULL || nam->sa_family == AF_INET, 662 ("%s: invalid address family for %p", __func__, nam)); 663 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 664 ("%s: invalid address length for %p", __func__, nam)); 665 INP_WLOCK_ASSERT(inp); 666 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 667 668 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 669 return (EINVAL); 670 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 671 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 672 &inp->inp_lport, cred); 673 if (error) 674 return (error); 675 if (in_pcbinshash(inp) != 0) { 676 inp->inp_laddr.s_addr = INADDR_ANY; 677 inp->inp_lport = 0; 678 return (EAGAIN); 679 } 680 if (anonport) 681 inp->inp_flags |= INP_ANONPORT; 682 return (0); 683 } 684 #endif 685 686 #if defined(INET) || defined(INET6) 687 /* 688 * 
Assign a local port like in_pcb_lport(), but also used with connect() 689 * and a foreign address and port. If fsa is non-NULL, choose a local port 690 * that is unused with those, otherwise one that is completely unused. 691 * lsa can be NULL for IPv6. 692 */ 693 int 694 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 695 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 696 { 697 struct inpcbinfo *pcbinfo; 698 struct inpcb *tmpinp; 699 unsigned short *lastport; 700 int count, error; 701 u_short aux, first, last, lport; 702 #ifdef INET 703 struct in_addr laddr, faddr; 704 #endif 705 #ifdef INET6 706 struct in6_addr *laddr6, *faddr6; 707 #endif 708 709 pcbinfo = inp->inp_pcbinfo; 710 711 /* 712 * Because no actual state changes occur here, a global write lock on 713 * the pcbinfo isn't required. 714 */ 715 INP_LOCK_ASSERT(inp); 716 INP_HASH_LOCK_ASSERT(pcbinfo); 717 718 if (inp->inp_flags & INP_HIGHPORT) { 719 first = V_ipport_hifirstauto; /* sysctl */ 720 last = V_ipport_hilastauto; 721 lastport = &pcbinfo->ipi_lasthi; 722 } else if (inp->inp_flags & INP_LOWPORT) { 723 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 724 if (error) 725 return (error); 726 first = V_ipport_lowfirstauto; /* 1023 */ 727 last = V_ipport_lowlastauto; /* 600 */ 728 lastport = &pcbinfo->ipi_lastlow; 729 } else { 730 first = V_ipport_firstauto; /* sysctl */ 731 last = V_ipport_lastauto; 732 lastport = &pcbinfo->ipi_lastport; 733 } 734 735 /* 736 * Instead of having two loops further down counting up or down 737 * make sure that first is always <= last and go with only one 738 * code path implementing all logic. 
739 */ 740 if (first > last) { 741 aux = first; 742 first = last; 743 last = aux; 744 } 745 746 #ifdef INET 747 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 748 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 749 if (lsa != NULL) 750 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 751 if (fsa != NULL) 752 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 753 } 754 #endif 755 #ifdef INET6 756 laddr6 = NULL; 757 if ((inp->inp_vflag & INP_IPV6) != 0) { 758 if (lsa != NULL) 759 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 760 if (fsa != NULL) 761 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 762 } 763 #endif 764 765 tmpinp = NULL; 766 lport = *lportp; 767 768 if (V_ipport_randomized) 769 *lastport = first + (arc4random() % (last - first)); 770 771 count = last - first; 772 773 do { 774 if (count-- < 0) /* completely used? */ 775 return (EADDRNOTAVAIL); 776 ++*lastport; 777 if (*lastport < first || *lastport > last) 778 *lastport = first; 779 lport = htons(*lastport); 780 781 if (fsa != NULL) { 782 #ifdef INET 783 if (lsa->sa_family == AF_INET) { 784 tmpinp = in_pcblookup_hash_locked(pcbinfo, 785 faddr, fport, laddr, lport, lookupflags, 786 NULL, M_NODOM); 787 } 788 #endif 789 #ifdef INET6 790 if (lsa->sa_family == AF_INET6) { 791 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 792 faddr6, fport, laddr6, lport, lookupflags, 793 NULL, M_NODOM); 794 } 795 #endif 796 } else { 797 #ifdef INET6 798 if ((inp->inp_vflag & INP_IPV6) != 0) { 799 tmpinp = in6_pcblookup_local(pcbinfo, 800 &inp->in6p_laddr, lport, lookupflags, cred); 801 #ifdef INET 802 if (tmpinp == NULL && 803 (inp->inp_vflag & INP_IPV4)) 804 tmpinp = in_pcblookup_local(pcbinfo, 805 laddr, lport, lookupflags, cred); 806 #endif 807 } 808 #endif 809 #if defined(INET) && defined(INET6) 810 else 811 #endif 812 #ifdef INET 813 tmpinp = in_pcblookup_local(pcbinfo, laddr, 814 lport, lookupflags, cred); 815 #endif 816 } 817 } while (tmpinp != NULL); 818 819 *lportp = lport; 820 821 return 
(0); 822 } 823 824 /* 825 * Select a local port (number) to use. 826 */ 827 int 828 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 829 struct ucred *cred, int lookupflags) 830 { 831 struct sockaddr_in laddr; 832 833 if (laddrp) { 834 bzero(&laddr, sizeof(laddr)); 835 laddr.sin_family = AF_INET; 836 laddr.sin_addr = *laddrp; 837 } 838 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 839 NULL, lportp, NULL, 0, cred, lookupflags)); 840 } 841 842 /* 843 * Return cached socket options. 844 */ 845 int 846 inp_so_options(const struct inpcb *inp) 847 { 848 int so_options; 849 850 so_options = 0; 851 852 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 853 so_options |= SO_REUSEPORT_LB; 854 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 855 so_options |= SO_REUSEPORT; 856 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 857 so_options |= SO_REUSEADDR; 858 return (so_options); 859 } 860 #endif /* INET || INET6 */ 861 862 /* 863 * Check if a new BINDMULTI socket is allowed to be created. 864 * 865 * ni points to the new inp. 866 * oi points to the existing inp. 867 * 868 * This checks whether the existing inp also has BINDMULTI and 869 * whether the credentials match. 870 */ 871 int 872 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) 873 { 874 /* Check permissions match */ 875 if ((ni->inp_flags2 & INP_BINDMULTI) && 876 (ni->inp_cred->cr_uid != 877 oi->inp_cred->cr_uid)) 878 return (0); 879 880 /* Check the existing inp has BINDMULTI set */ 881 if ((ni->inp_flags2 & INP_BINDMULTI) && 882 ((oi->inp_flags2 & INP_BINDMULTI) == 0)) 883 return (0); 884 885 /* 886 * We're okay - either INP_BINDMULTI isn't set on ni, or 887 * it is and it matches the checks. 888 */ 889 return (1); 890 } 891 892 #ifdef INET 893 /* 894 * Set up a bind operation on a PCB, performing port allocation 895 * as required, but do not actually modify the PCB. 
Callers can 896 * either complete the bind by setting inp_laddr/inp_lport and 897 * calling in_pcbinshash(), or they can just use the resulting 898 * port and address to authorise the sending of a once-off packet. 899 * 900 * On error, the values of *laddrp and *lportp are not changed. 901 */ 902 int 903 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 904 u_short *lportp, struct ucred *cred) 905 { 906 struct socket *so = inp->inp_socket; 907 struct sockaddr_in *sin; 908 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 909 struct in_addr laddr; 910 u_short lport = 0; 911 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); 912 int error; 913 914 /* 915 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here 916 * so that we don't have to add to the (already messy) code below. 917 */ 918 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); 919 920 /* 921 * No state changes, so read locks are sufficient here. 922 */ 923 INP_LOCK_ASSERT(inp); 924 INP_HASH_LOCK_ASSERT(pcbinfo); 925 926 laddr.s_addr = *laddrp; 927 if (nam != NULL && laddr.s_addr != INADDR_ANY) 928 return (EINVAL); 929 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) 930 lookupflags = INPLOOKUP_WILDCARD; 931 if (nam == NULL) { 932 if ((error = prison_local_ip4(cred, &laddr)) != 0) 933 return (error); 934 } else { 935 sin = (struct sockaddr_in *)nam; 936 KASSERT(sin->sin_family == AF_INET, 937 ("%s: invalid family for address %p", __func__, sin)); 938 KASSERT(sin->sin_len == sizeof(*sin), 939 ("%s: invalid length for address %p", __func__, sin)); 940 941 error = prison_local_ip4(cred, &sin->sin_addr); 942 if (error) 943 return (error); 944 if (sin->sin_port != *lportp) { 945 /* Don't allow the port to change. */ 946 if (*lportp != 0) 947 return (EINVAL); 948 lport = sin->sin_port; 949 } 950 /* NB: lport is left as 0 if the port isn't being changed. 
*/ 951 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 952 /* 953 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 954 * allow complete duplication of binding if 955 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 956 * and a multicast address is bound on both 957 * new and duplicated sockets. 958 */ 959 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) 960 reuseport = SO_REUSEADDR|SO_REUSEPORT; 961 /* 962 * XXX: How to deal with SO_REUSEPORT_LB here? 963 * Treat same as SO_REUSEPORT for now. 964 */ 965 if ((so->so_options & 966 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) 967 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; 968 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 969 sin->sin_port = 0; /* yech... */ 970 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 971 /* 972 * Is the address a local IP address? 973 * If INP_BINDANY is set, then the socket may be bound 974 * to any endpoint address, local or not. 975 */ 976 if ((inp->inp_flags & INP_BINDANY) == 0 && 977 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 978 return (EADDRNOTAVAIL); 979 } 980 laddr = sin->sin_addr; 981 if (lport) { 982 struct inpcb *t; 983 984 /* GROSS */ 985 if (ntohs(lport) <= V_ipport_reservedhigh && 986 ntohs(lport) >= V_ipport_reservedlow && 987 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 988 return (EACCES); 989 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 990 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 991 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 992 lport, INPLOOKUP_WILDCARD, cred); 993 /* 994 * XXX 995 * This entire block sorely needs a rewrite. 
996 */ 997 if (t && 998 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 999 (so->so_type != SOCK_STREAM || 1000 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 1001 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 1002 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 1003 (t->inp_flags2 & INP_REUSEPORT) || 1004 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && 1005 (inp->inp_cred->cr_uid != 1006 t->inp_cred->cr_uid)) 1007 return (EADDRINUSE); 1008 1009 /* 1010 * If the socket is a BINDMULTI socket, then 1011 * the credentials need to match and the 1012 * original socket also has to have been bound 1013 * with BINDMULTI. 1014 */ 1015 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1016 return (EADDRINUSE); 1017 } 1018 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1019 lport, lookupflags, cred); 1020 if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1021 (reuseport & inp_so_options(t)) == 0 && 1022 (reuseport_lb & inp_so_options(t)) == 0) { 1023 #ifdef INET6 1024 if (ntohl(sin->sin_addr.s_addr) != 1025 INADDR_ANY || 1026 ntohl(t->inp_laddr.s_addr) != 1027 INADDR_ANY || 1028 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 1029 (t->inp_vflag & INP_IPV6PROTO) == 0) 1030 #endif 1031 return (EADDRINUSE); 1032 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1033 return (EADDRINUSE); 1034 } 1035 } 1036 } 1037 if (*lportp != 0) 1038 lport = *lportp; 1039 if (lport == 0) { 1040 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1041 if (error != 0) 1042 return (error); 1043 } 1044 *laddrp = laddr.s_addr; 1045 *lportp = lport; 1046 return (0); 1047 } 1048 1049 /* 1050 * Connect from a socket to a specified address. 1051 * Both address and port must be specified in argument sin. 1052 * If don't have a local address for this socket yet, 1053 * then pick one. 
1054 */ 1055 int 1056 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, 1057 bool rehash) 1058 { 1059 u_short lport, fport; 1060 in_addr_t laddr, faddr; 1061 int anonport, error; 1062 1063 INP_WLOCK_ASSERT(inp); 1064 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1065 1066 lport = inp->inp_lport; 1067 laddr = inp->inp_laddr.s_addr; 1068 anonport = (lport == 0); 1069 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 1070 NULL, cred); 1071 if (error) 1072 return (error); 1073 1074 /* Do the initial binding of the local address if required. */ 1075 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1076 KASSERT(rehash == true, 1077 ("Rehashing required for unbound inps")); 1078 inp->inp_lport = lport; 1079 inp->inp_laddr.s_addr = laddr; 1080 if (in_pcbinshash(inp) != 0) { 1081 inp->inp_laddr.s_addr = INADDR_ANY; 1082 inp->inp_lport = 0; 1083 return (EAGAIN); 1084 } 1085 } 1086 1087 /* Commit the remaining changes. */ 1088 inp->inp_lport = lport; 1089 inp->inp_laddr.s_addr = laddr; 1090 inp->inp_faddr.s_addr = faddr; 1091 inp->inp_fport = fport; 1092 if (rehash) { 1093 in_pcbrehash(inp); 1094 } else { 1095 in_pcbinshash(inp); 1096 } 1097 1098 if (anonport) 1099 inp->inp_flags |= INP_ANONPORT; 1100 return (0); 1101 } 1102 1103 /* 1104 * Do proper source address selection on an unbound socket in case 1105 * of connect. Take jails into account as well. 1106 */ 1107 int 1108 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1109 struct ucred *cred) 1110 { 1111 struct ifaddr *ifa; 1112 struct sockaddr *sa; 1113 struct sockaddr_in *sin, dst; 1114 struct nhop_object *nh; 1115 int error; 1116 1117 NET_EPOCH_ASSERT(); 1118 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1119 1120 /* 1121 * Bypass source address selection and use the primary jail IP 1122 * if requested. 
1123 */ 1124 if (!prison_saddrsel_ip4(cred, laddr)) 1125 return (0); 1126 1127 error = 0; 1128 1129 nh = NULL; 1130 bzero(&dst, sizeof(dst)); 1131 sin = &dst; 1132 sin->sin_family = AF_INET; 1133 sin->sin_len = sizeof(struct sockaddr_in); 1134 sin->sin_addr.s_addr = faddr->s_addr; 1135 1136 /* 1137 * If route is known our src addr is taken from the i/f, 1138 * else punt. 1139 * 1140 * Find out route to destination. 1141 */ 1142 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1143 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1144 0, NHR_NONE, 0); 1145 1146 /* 1147 * If we found a route, use the address corresponding to 1148 * the outgoing interface. 1149 * 1150 * Otherwise assume faddr is reachable on a directly connected 1151 * network and try to find a corresponding interface to take 1152 * the source address from. 1153 */ 1154 if (nh == NULL || nh->nh_ifp == NULL) { 1155 struct in_ifaddr *ia; 1156 struct ifnet *ifp; 1157 1158 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1159 inp->inp_socket->so_fibnum)); 1160 if (ia == NULL) { 1161 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1162 inp->inp_socket->so_fibnum)); 1163 } 1164 if (ia == NULL) { 1165 error = ENETUNREACH; 1166 goto done; 1167 } 1168 1169 if (!prison_flag(cred, PR_IP4)) { 1170 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1171 goto done; 1172 } 1173 1174 ifp = ia->ia_ifp; 1175 ia = NULL; 1176 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1177 sa = ifa->ifa_addr; 1178 if (sa->sa_family != AF_INET) 1179 continue; 1180 sin = (struct sockaddr_in *)sa; 1181 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1182 ia = (struct in_ifaddr *)ifa; 1183 break; 1184 } 1185 } 1186 if (ia != NULL) { 1187 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1188 goto done; 1189 } 1190 1191 /* 3. As a last resort return the 'default' jail address. 
*/ 1192 error = prison_get_ip4(cred, laddr); 1193 goto done; 1194 } 1195 1196 /* 1197 * If the outgoing interface on the route found is not 1198 * a loopback interface, use the address from that interface. 1199 * In case of jails do those three steps: 1200 * 1. check if the interface address belongs to the jail. If so use it. 1201 * 2. check if we have any address on the outgoing interface 1202 * belonging to this jail. If so use it. 1203 * 3. as a last resort return the 'default' jail address. 1204 */ 1205 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1206 struct in_ifaddr *ia; 1207 struct ifnet *ifp; 1208 1209 /* If not jailed, use the default returned. */ 1210 if (!prison_flag(cred, PR_IP4)) { 1211 ia = (struct in_ifaddr *)nh->nh_ifa; 1212 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1213 goto done; 1214 } 1215 1216 /* Jailed. */ 1217 /* 1. Check if the iface address belongs to the jail. */ 1218 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1219 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1220 ia = (struct in_ifaddr *)nh->nh_ifa; 1221 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1222 goto done; 1223 } 1224 1225 /* 1226 * 2. Check if we have any address on the outgoing interface 1227 * belonging to this jail. 1228 */ 1229 ia = NULL; 1230 ifp = nh->nh_ifp; 1231 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1232 sa = ifa->ifa_addr; 1233 if (sa->sa_family != AF_INET) 1234 continue; 1235 sin = (struct sockaddr_in *)sa; 1236 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1237 ia = (struct in_ifaddr *)ifa; 1238 break; 1239 } 1240 } 1241 if (ia != NULL) { 1242 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1243 goto done; 1244 } 1245 1246 /* 3. As a last resort return the 'default' jail address. */ 1247 error = prison_get_ip4(cred, laddr); 1248 goto done; 1249 } 1250 1251 /* 1252 * The outgoing interface is marked with 'loopback net', so a route 1253 * to ourselves is here. 
1254 * Try to find the interface of the destination address and then 1255 * take the address from there. That interface is not necessarily 1256 * a loopback interface. 1257 * In case of jails, check that it is an address of the jail 1258 * and if we cannot find, fall back to the 'default' jail address. 1259 */ 1260 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1261 struct in_ifaddr *ia; 1262 1263 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1264 inp->inp_socket->so_fibnum)); 1265 if (ia == NULL) 1266 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1267 inp->inp_socket->so_fibnum)); 1268 if (ia == NULL) 1269 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1270 1271 if (!prison_flag(cred, PR_IP4)) { 1272 if (ia == NULL) { 1273 error = ENETUNREACH; 1274 goto done; 1275 } 1276 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1277 goto done; 1278 } 1279 1280 /* Jailed. */ 1281 if (ia != NULL) { 1282 struct ifnet *ifp; 1283 1284 ifp = ia->ia_ifp; 1285 ia = NULL; 1286 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1287 sa = ifa->ifa_addr; 1288 if (sa->sa_family != AF_INET) 1289 continue; 1290 sin = (struct sockaddr_in *)sa; 1291 if (prison_check_ip4(cred, 1292 &sin->sin_addr) == 0) { 1293 ia = (struct in_ifaddr *)ifa; 1294 break; 1295 } 1296 } 1297 if (ia != NULL) { 1298 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1299 goto done; 1300 } 1301 } 1302 1303 /* 3. As a last resort return the 'default' jail address. */ 1304 error = prison_get_ip4(cred, laddr); 1305 goto done; 1306 } 1307 1308 done: 1309 return (error); 1310 } 1311 1312 /* 1313 * Set up for a connect from a socket to the specified address. 1314 * On entry, *laddrp and *lportp should contain the current local 1315 * address and port for the PCB; these are updated to the values 1316 * that should be placed in inp_laddr and inp_lport to complete 1317 * the connect. 1318 * 1319 * On success, *faddrp and *fportp will be set to the remote address 1320 * and port. These are not updated in the error case. 
 *
 * If the operation fails because the connection already exists,
 * *oinpp will be set to the PCB of that connection so that the
 * caller can decide to override it. In all other cases, *oinpp
 * is set to NULL.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	/* A destination port of zero is rejected. */
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	/* Work on local copies; outputs are written only on success. */
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/*
		 * NOTE(review): the hash is computed from laddr as passed in,
		 * i.e. before any source address selection below -- confirm
		 * this is intended.
		 */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	/* No local address yet: select one for this destination. */
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				/*
				 * Find an address on that interface which
				 * prison_check_ip4() accepts for cred.
				 */
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				/*
				 * The result here overrides whatever
				 * in_pcbladdr() returned above.
				 */
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/*
		 * Local port already known: fail if an inpcb with this
		 * exact 4-tuple is found, reporting it through *oinpp.
		 */
		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, NULL, M_NODOM);
		if (oinp != NULL) {
			if (oinpp != NULL)
				*oinpp = oinp;
			return (EADDRINUSE);
		}
	} else {
		struct sockaddr_in lsin, fsin;

		/*
		 * No local port yet: let in_pcb_lport_dest() choose one
		 * for this local/foreign address pair.
		 */
		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	/* Success: publish the chosen endpoints to the caller. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Clear the foreign address and port of an inpcb and rehash it.
 * The inpcb write lock and the pcbinfo hash write lock must be held
 * (both are asserted).
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
	in_pcbrehash(inp);
}
#endif /* INET */

/*
 * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
 * For most protocols, this will be invoked immediately prior to calling
 * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
 * socket, in which case in_pcbfree() is deferred.
 */
void
in_pcbdetach(struct inpcb *inp)
{

	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Break the link in both directions. */
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;
}

/*
 * inpcb hash lookups are protected by SMR section.
 *
 * Once desired pcb has been found, switching from SMR section to a pcb
 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
 * here because SMR is a critical section.
 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 */

/* Acquire the inpcb lock in read or write mode, as selected by 'lock'. */
static inline void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

/* Release the inpcb lock held in the mode selected by 'lock'. */
static inline void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

/*
 * Try to acquire the inpcb lock in the mode selected by 'lock' without
 * sleeping; returns the result of rw_try_rlock()/rw_try_wlock().
 */
static inline int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

/*
 * Drop a reference on a locked inpcb using the release routine that
 * matches the lock mode.  Returns true if the inpcb was freed (and thus
 * unlocked) by the release.
 */
static inline bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Leave the SMR section and return with 'inp' locked in the requested
 * mode.  Must be called inside the pcbinfo's SMR section (asserted); the
 * section is always exited before returning.
 *
 * Returns true with the inpcb locked, or false with no lock held if the
 * inpcb could not be referenced, was freed, or has any of 'ignflags' set
 * in inp_flags.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is available without blocking. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: pin the inpcb with a reference so that the SMR section
	 * can be exited before blocking on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		/* Ours was the last reference: the inpcb is gone. */
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock certainly did not
		 * go through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and have another reference that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Reference count already dropped to zero. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

/*
 * Switch from the SMR section to the inpcb lock for a looked-up inpcb.
 * See _inp_smr_lock() for the return convention.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/*
 * Traversal helpers: 'hash' is either INP_ALL_LIST, selecting the global
 * list linked through inp_list, or an index into ipi_hashbase, selecting
 * a hash chain linked through inp_hash.  II_LOCK_ASSERT() asserts that
 * the inpcb lock is held in the mode that matches 'lock'.
 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash))
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			/*
			 * _inp_smr_lock() exits the SMR section whether it
			 * succeeds or not.
			 */
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * Locking failed: re-enter SMR and restart
				 * from the list head.
				 * NOTE(review): the for-loop increment then
				 * advances past this new head without
				 * examining it -- confirm this is intended.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/* Still inside SMR only if nothing was locked. */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked and serves as the anchor. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' unlocks the anchor and stores NULL. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbounded?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			/* The candidate was freed by our release. */
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Hand over: unlock the previous inpcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	/* The caller must already hold a reference. */
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
1735 */ 1736 bool 1737 in_pcbrele_rlocked(struct inpcb *inp) 1738 { 1739 1740 INP_RLOCK_ASSERT(inp); 1741 1742 if (refcount_release(&inp->inp_refcount) == 0) 1743 return (false); 1744 1745 MPASS(inp->inp_flags & INP_FREED); 1746 MPASS(inp->inp_socket == NULL); 1747 MPASS(inp->inp_in_hpts == 0); 1748 INP_RUNLOCK(inp); 1749 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1750 return (true); 1751 } 1752 1753 bool 1754 in_pcbrele_wlocked(struct inpcb *inp) 1755 { 1756 1757 INP_WLOCK_ASSERT(inp); 1758 1759 if (refcount_release(&inp->inp_refcount) == 0) 1760 return (false); 1761 1762 MPASS(inp->inp_flags & INP_FREED); 1763 MPASS(inp->inp_socket == NULL); 1764 MPASS(inp->inp_in_hpts == 0); 1765 INP_WUNLOCK(inp); 1766 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1767 return (true); 1768 } 1769 1770 /* 1771 * Unconditionally schedule an inpcb to be freed by decrementing its 1772 * reference count, which should occur only after the inpcb has been detached 1773 * from its socket. If another thread holds a temporary reference (acquired 1774 * using in_pcbref()) then the free is deferred until that reference is 1775 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. 1776 * Almost all work, including removal from global lists, is done in this 1777 * context, where the pcbinfo lock is held. 
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/* Mark freed first, then unlink from the global list. */
	inp->inp_flags |= INP_FREED;
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	/* Saved now, since 'inp' may be freed by the release below. */
	imo = inp->inp_moptions;
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference.  in_pcbrele_wlocked() unlocks the inpcb only
	 * when it frees it; otherwise the unlock is done here.  'inp' must
	 * not be dereferenced past this point.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	/* Destruction is finalized in inpcb_dtor(). */
}

/*
 * Zone destructor for inpcbs: releases the credential reference held in
 * inp_cred.  The 'size' and 'arg' arguments are unused.
 */
static void
inpcb_dtor(void *mem, int size, void *arg)
{
	struct inpcb *inp = mem;

	crfree(inp->inp_cred);
#ifdef INVARIANTS
	/* Clear the pointer in INVARIANTS kernels. */
	inp->inp_cred = NULL;
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
1852 */ 1853 static void 1854 inpcb_fini(void *mem, int size) 1855 { 1856 struct inpcb *inp = mem; 1857 1858 INP_LOCK_DESTROY(inp); 1859 } 1860 1861 /* 1862 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 1863 * port reservation, and preventing it from being returned by inpcb lookups. 1864 * 1865 * It is used by TCP to mark an inpcb as unused and avoid future packet 1866 * delivery or event notification when a socket remains open but TCP has 1867 * closed. This might occur as a result of a shutdown()-initiated TCP close 1868 * or a RST on the wire, and allows the port binding to be reused while still 1869 * maintaining the invariant that so_pcb always points to a valid inpcb until 1870 * in_pcbdetach(). 1871 * 1872 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1873 * in_pcbnotifyall() and in_pcbpurgeif0()? 1874 */ 1875 void 1876 in_pcbdrop(struct inpcb *inp) 1877 { 1878 1879 INP_WLOCK_ASSERT(inp); 1880 #ifdef INVARIANTS 1881 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) 1882 MPASS(inp->inp_refcount > 1); 1883 #endif 1884 1885 inp->inp_flags |= INP_DROPPED; 1886 if (inp->inp_flags & INP_INHASHLIST) 1887 in_pcbremhash(inp); 1888 } 1889 1890 #ifdef INET 1891 /* 1892 * Common routines to return the socket addresses associated with inpcbs. 
1893 */ 1894 struct sockaddr * 1895 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1896 { 1897 struct sockaddr_in *sin; 1898 1899 sin = malloc(sizeof *sin, M_SONAME, 1900 M_WAITOK | M_ZERO); 1901 sin->sin_family = AF_INET; 1902 sin->sin_len = sizeof(*sin); 1903 sin->sin_addr = *addr_p; 1904 sin->sin_port = port; 1905 1906 return (struct sockaddr *)sin; 1907 } 1908 1909 int 1910 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1911 { 1912 struct inpcb *inp; 1913 struct in_addr addr; 1914 in_port_t port; 1915 1916 inp = sotoinpcb(so); 1917 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1918 1919 INP_RLOCK(inp); 1920 port = inp->inp_lport; 1921 addr = inp->inp_laddr; 1922 INP_RUNLOCK(inp); 1923 1924 *nam = in_sockaddr(port, &addr); 1925 return 0; 1926 } 1927 1928 int 1929 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 1930 { 1931 struct inpcb *inp; 1932 struct in_addr addr; 1933 in_port_t port; 1934 1935 inp = sotoinpcb(so); 1936 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1937 1938 INP_RLOCK(inp); 1939 port = inp->inp_fport; 1940 addr = inp->inp_faddr; 1941 INP_RUNLOCK(inp); 1942 1943 *nam = in_sockaddr(port, &addr); 1944 return 0; 1945 } 1946 1947 void 1948 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1949 struct inpcb *(*notify)(struct inpcb *, int)) 1950 { 1951 struct inpcb *inp, *inp_temp; 1952 1953 INP_INFO_WLOCK(pcbinfo); 1954 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 1955 INP_WLOCK(inp); 1956 #ifdef INET6 1957 if ((inp->inp_vflag & INP_IPV4) == 0) { 1958 INP_WUNLOCK(inp); 1959 continue; 1960 } 1961 #endif 1962 if (inp->inp_faddr.s_addr != faddr.s_addr || 1963 inp->inp_socket == NULL) { 1964 INP_WUNLOCK(inp); 1965 continue; 1966 } 1967 if ((*notify)(inp, errno)) 1968 INP_WUNLOCK(inp); 1969 } 1970 INP_INFO_WUNLOCK(pcbinfo); 1971 } 1972 1973 static bool 1974 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1975 { 1976 1977 if ((inp->inp_vflag & 
INP_IPV4) && inp->inp_moptions != NULL) 1978 return (true); 1979 else 1980 return (false); 1981 } 1982 1983 void 1984 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1985 { 1986 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1987 inp_v4_multi_match, NULL); 1988 struct inpcb *inp; 1989 struct in_multi *inm; 1990 struct in_mfilter *imf; 1991 struct ip_moptions *imo; 1992 1993 IN_MULTI_LOCK_ASSERT(); 1994 1995 while ((inp = inp_next(&inpi)) != NULL) { 1996 INP_WLOCK_ASSERT(inp); 1997 1998 imo = inp->inp_moptions; 1999 /* 2000 * Unselect the outgoing interface if it is being 2001 * detached. 2002 */ 2003 if (imo->imo_multicast_ifp == ifp) 2004 imo->imo_multicast_ifp = NULL; 2005 2006 /* 2007 * Drop multicast group membership if we joined 2008 * through the interface being detached. 2009 * 2010 * XXX This can all be deferred to an epoch_call 2011 */ 2012 restart: 2013 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2014 if ((inm = imf->imf_inm) == NULL) 2015 continue; 2016 if (inm->inm_ifp != ifp) 2017 continue; 2018 ip_mfilter_remove(&imo->imo_head, imf); 2019 in_leavegroup_locked(inm, NULL); 2020 ip_mfilter_free(imf); 2021 goto restart; 2022 } 2023 } 2024 } 2025 2026 /* 2027 * Lookup a PCB based on the local address and port. Caller must hold the 2028 * hash lock. No inpcb locks or references are acquired. 
2029 */ 2030 #define INP_LOOKUP_MAPPED_PCB_COST 3 2031 struct inpcb * 2032 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2033 u_short lport, int lookupflags, struct ucred *cred) 2034 { 2035 struct inpcb *inp; 2036 #ifdef INET6 2037 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2038 #else 2039 int matchwild = 3; 2040 #endif 2041 int wildcard; 2042 2043 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2044 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2045 INP_HASH_LOCK_ASSERT(pcbinfo); 2046 2047 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2048 struct inpcbhead *head; 2049 /* 2050 * Look for an unconnected (wildcard foreign addr) PCB that 2051 * matches the local address and port we're looking for. 2052 */ 2053 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2054 pcbinfo->ipi_hashmask)]; 2055 CK_LIST_FOREACH(inp, head, inp_hash) { 2056 #ifdef INET6 2057 /* XXX inp locking */ 2058 if ((inp->inp_vflag & INP_IPV4) == 0) 2059 continue; 2060 #endif 2061 if (inp->inp_faddr.s_addr == INADDR_ANY && 2062 inp->inp_laddr.s_addr == laddr.s_addr && 2063 inp->inp_lport == lport) { 2064 /* 2065 * Found? 2066 */ 2067 if (prison_equal_ip4(cred->cr_prison, 2068 inp->inp_cred->cr_prison)) 2069 return (inp); 2070 } 2071 } 2072 /* 2073 * Not found. 2074 */ 2075 return (NULL); 2076 } else { 2077 struct inpcbporthead *porthash; 2078 struct inpcbport *phd; 2079 struct inpcb *match = NULL; 2080 /* 2081 * Best fit PCB lookup. 2082 * 2083 * First see if this local port is in use by looking on the 2084 * port hash list. 2085 */ 2086 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2087 pcbinfo->ipi_porthashmask)]; 2088 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2089 if (phd->phd_port == lport) 2090 break; 2091 } 2092 if (phd != NULL) { 2093 /* 2094 * Port is in use by one or more PCBs. Look for best 2095 * fit. 
2096 */ 2097 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2098 wildcard = 0; 2099 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2100 cred->cr_prison)) 2101 continue; 2102 #ifdef INET6 2103 /* XXX inp locking */ 2104 if ((inp->inp_vflag & INP_IPV4) == 0) 2105 continue; 2106 /* 2107 * We never select the PCB that has 2108 * INP_IPV6 flag and is bound to :: if 2109 * we have another PCB which is bound 2110 * to 0.0.0.0. If a PCB has the 2111 * INP_IPV6 flag, then we set its cost 2112 * higher than IPv4 only PCBs. 2113 * 2114 * Note that the case only happens 2115 * when a socket is bound to ::, under 2116 * the condition that the use of the 2117 * mapped address is allowed. 2118 */ 2119 if ((inp->inp_vflag & INP_IPV6) != 0) 2120 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2121 #endif 2122 if (inp->inp_faddr.s_addr != INADDR_ANY) 2123 wildcard++; 2124 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2125 if (laddr.s_addr == INADDR_ANY) 2126 wildcard++; 2127 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2128 continue; 2129 } else { 2130 if (laddr.s_addr != INADDR_ANY) 2131 wildcard++; 2132 } 2133 if (wildcard < matchwild) { 2134 match = inp; 2135 matchwild = wildcard; 2136 if (matchwild == 0) 2137 break; 2138 } 2139 } 2140 } 2141 return (match); 2142 } 2143 } 2144 #undef INP_LOOKUP_MAPPED_PCB_COST 2145 2146 static bool 2147 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) 2148 { 2149 return (domain == M_NODOM || domain == grp->il_numa_domain); 2150 } 2151 2152 static struct inpcb * 2153 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2154 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 2155 uint16_t fport, int lookupflags, int domain) 2156 { 2157 const struct inpcblbgrouphead *hdr; 2158 struct inpcblbgroup *grp; 2159 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2160 2161 INP_HASH_LOCK_ASSERT(pcbinfo); 2162 2163 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2164 INP_PCBPORTHASH(lport, 
pcbinfo->ipi_lbgrouphashmask)]; 2165 2166 /* 2167 * Search for an LB group match based on the following criteria: 2168 * - prefer jailed groups to non-jailed groups 2169 * - prefer exact source address matches to wildcard matches 2170 * - prefer groups bound to the specified NUMA domain 2171 */ 2172 jail_exact = jail_wild = local_exact = local_wild = NULL; 2173 CK_LIST_FOREACH(grp, hdr, il_list) { 2174 bool injail; 2175 2176 #ifdef INET6 2177 if (!(grp->il_vflag & INP_IPV4)) 2178 continue; 2179 #endif 2180 if (grp->il_lport != lport) 2181 continue; 2182 2183 injail = prison_flag(grp->il_cred, PR_IP4) != 0; 2184 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2185 laddr) != 0) 2186 continue; 2187 2188 if (grp->il_laddr.s_addr == laddr->s_addr) { 2189 if (injail) { 2190 jail_exact = grp; 2191 if (in_pcblookup_lb_numa_match(grp, domain)) 2192 /* This is a perfect match. */ 2193 goto out; 2194 } else if (local_exact == NULL || 2195 in_pcblookup_lb_numa_match(grp, domain)) { 2196 local_exact = grp; 2197 } 2198 } else if (grp->il_laddr.s_addr == INADDR_ANY && 2199 (lookupflags & INPLOOKUP_WILDCARD) != 0) { 2200 if (injail) { 2201 if (jail_wild == NULL || 2202 in_pcblookup_lb_numa_match(grp, domain)) 2203 jail_wild = grp; 2204 } else if (local_wild == NULL || 2205 in_pcblookup_lb_numa_match(grp, domain)) { 2206 local_wild = grp; 2207 } 2208 } 2209 } 2210 2211 if (jail_exact != NULL) 2212 grp = jail_exact; 2213 else if (jail_wild != NULL) 2214 grp = jail_wild; 2215 else if (local_exact != NULL) 2216 grp = local_exact; 2217 else 2218 grp = local_wild; 2219 if (grp == NULL) 2220 return (NULL); 2221 out: 2222 return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % 2223 grp->il_inpcnt]); 2224 } 2225 2226 /* 2227 * Lookup PCB in hash list, using pcbinfo tables. 
This variation assumes 2228 * that the caller has either locked the hash list, which usually happens 2229 * for bind(2) operations, or is in SMR section, which happens when sorting 2230 * out incoming packets. 2231 */ 2232 static struct inpcb * 2233 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2234 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2235 struct ifnet *ifp, uint8_t numa_domain) 2236 { 2237 struct inpcbhead *head; 2238 struct inpcb *inp, *tmpinp; 2239 u_short fport = fport_arg, lport = lport_arg; 2240 2241 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2242 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2243 INP_HASH_LOCK_ASSERT(pcbinfo); 2244 2245 /* 2246 * First look for an exact match. 2247 */ 2248 tmpinp = NULL; 2249 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport, 2250 pcbinfo->ipi_hashmask)]; 2251 CK_LIST_FOREACH(inp, head, inp_hash) { 2252 #ifdef INET6 2253 /* XXX inp locking */ 2254 if ((inp->inp_vflag & INP_IPV4) == 0) 2255 continue; 2256 #endif 2257 if (inp->inp_faddr.s_addr == faddr.s_addr && 2258 inp->inp_laddr.s_addr == laddr.s_addr && 2259 inp->inp_fport == fport && 2260 inp->inp_lport == lport) { 2261 /* 2262 * XXX We should be able to directly return 2263 * the inp here, without any checks. 2264 * Well unless both bound with SO_REUSEPORT? 2265 */ 2266 if (prison_flag(inp->inp_cred, PR_IP4)) 2267 return (inp); 2268 if (tmpinp == NULL) 2269 tmpinp = inp; 2270 } 2271 } 2272 if (tmpinp != NULL) 2273 return (tmpinp); 2274 2275 /* 2276 * Then look for a wildcard match, if requested. 2277 */ 2278 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2279 struct inpcb *local_wild = NULL, *local_exact = NULL; 2280 #ifdef INET6 2281 struct inpcb *local_wild_mapped = NULL; 2282 #endif 2283 struct inpcb *jail_wild = NULL; 2284 int injail; 2285 2286 /* 2287 * First see if an LB group matches the request before scanning 2288 * all sockets on this port. 
2289 */ 2290 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, 2291 fport, lookupflags, numa_domain); 2292 if (inp != NULL) 2293 return (inp); 2294 2295 /* 2296 * Order of socket selection - we always prefer jails. 2297 * 1. jailed, non-wild. 2298 * 2. jailed, wild. 2299 * 3. non-jailed, non-wild. 2300 * 4. non-jailed, wild. 2301 */ 2302 2303 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2304 pcbinfo->ipi_hashmask)]; 2305 CK_LIST_FOREACH(inp, head, inp_hash) { 2306 #ifdef INET6 2307 /* XXX inp locking */ 2308 if ((inp->inp_vflag & INP_IPV4) == 0) 2309 continue; 2310 #endif 2311 if (inp->inp_faddr.s_addr != INADDR_ANY || 2312 inp->inp_lport != lport) 2313 continue; 2314 2315 injail = prison_flag(inp->inp_cred, PR_IP4); 2316 if (injail) { 2317 if (prison_check_ip4_locked( 2318 inp->inp_cred->cr_prison, &laddr) != 0) 2319 continue; 2320 } else { 2321 if (local_exact != NULL) 2322 continue; 2323 } 2324 2325 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2326 if (injail) 2327 return (inp); 2328 else 2329 local_exact = inp; 2330 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2331 #ifdef INET6 2332 /* XXX inp locking, NULL check */ 2333 if (inp->inp_vflag & INP_IPV6PROTO) 2334 local_wild_mapped = inp; 2335 else 2336 #endif 2337 if (injail) 2338 jail_wild = inp; 2339 else 2340 local_wild = inp; 2341 } 2342 } /* LIST_FOREACH */ 2343 if (jail_wild != NULL) 2344 return (jail_wild); 2345 if (local_exact != NULL) 2346 return (local_exact); 2347 if (local_wild != NULL) 2348 return (local_wild); 2349 #ifdef INET6 2350 if (local_wild_mapped != NULL) 2351 return (local_wild_mapped); 2352 #endif 2353 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ 2354 2355 return (NULL); 2356 } 2357 2358 /* 2359 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the 2360 * hash list lock, and will return the inpcb locked (i.e., requires 2361 * INPLOOKUP_LOCKPCB). 
2362 */ 2363 static struct inpcb * 2364 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2365 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2366 struct ifnet *ifp, uint8_t numa_domain) 2367 { 2368 struct inpcb *inp; 2369 2370 smr_enter(pcbinfo->ipi_smr); 2371 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2372 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2373 if (inp != NULL) { 2374 if (__predict_false(inp_smr_lock(inp, 2375 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2376 inp = NULL; 2377 } else 2378 smr_exit(pcbinfo->ipi_smr); 2379 2380 return (inp); 2381 } 2382 2383 /* 2384 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2385 * from which a pre-calculated hash value may be extracted. 2386 */ 2387 struct inpcb * 2388 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2389 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2390 { 2391 2392 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2393 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2394 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2395 ("%s: LOCKPCB not set", __func__)); 2396 2397 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2398 lookupflags, ifp, M_NODOM)); 2399 } 2400 2401 struct inpcb * 2402 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2403 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2404 struct ifnet *ifp, struct mbuf *m) 2405 { 2406 2407 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2408 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2409 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2410 ("%s: LOCKPCB not set", __func__)); 2411 2412 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2413 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2414 } 2415 #endif /* INET */ 2416 2417 /* 2418 * Insert PCB onto various hash lists. 
2419 */ 2420 int 2421 in_pcbinshash(struct inpcb *inp) 2422 { 2423 struct inpcbhead *pcbhash; 2424 struct inpcbporthead *pcbporthash; 2425 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2426 struct inpcbport *phd; 2427 2428 INP_WLOCK_ASSERT(inp); 2429 INP_HASH_WLOCK_ASSERT(pcbinfo); 2430 2431 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2432 ("in_pcbinshash: INP_INHASHLIST")); 2433 2434 #ifdef INET6 2435 if (inp->inp_vflag & INP_IPV6) 2436 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2437 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2438 else 2439 #endif 2440 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2441 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2442 2443 pcbporthash = &pcbinfo->ipi_porthashbase[ 2444 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2445 2446 /* 2447 * Add entry to load balance group. 2448 * Only do this if SO_REUSEPORT_LB is set. 2449 */ 2450 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) { 2451 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2452 if (error != 0) 2453 return (error); 2454 } 2455 2456 /* 2457 * Go through port list and look for a head for this lport. 2458 */ 2459 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2460 if (phd->phd_port == inp->inp_lport) 2461 break; 2462 } 2463 2464 /* 2465 * If none exists, malloc one and tack it on. 
2466 */ 2467 if (phd == NULL) { 2468 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2469 if (phd == NULL) { 2470 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2471 in_pcbremlbgrouphash(inp); 2472 return (ENOMEM); 2473 } 2474 phd->phd_port = inp->inp_lport; 2475 CK_LIST_INIT(&phd->phd_pcblist); 2476 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2477 } 2478 inp->inp_phd = phd; 2479 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2480 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2481 inp->inp_flags |= INP_INHASHLIST; 2482 2483 return (0); 2484 } 2485 2486 static void 2487 in_pcbremhash(struct inpcb *inp) 2488 { 2489 struct inpcbport *phd = inp->inp_phd; 2490 2491 INP_WLOCK_ASSERT(inp); 2492 MPASS(inp->inp_flags & INP_INHASHLIST); 2493 2494 INP_HASH_WLOCK(inp->inp_pcbinfo); 2495 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2496 in_pcbremlbgrouphash(inp); 2497 CK_LIST_REMOVE(inp, inp_hash); 2498 CK_LIST_REMOVE(inp, inp_portlist); 2499 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2500 CK_LIST_REMOVE(phd, phd_hash); 2501 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2502 } 2503 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2504 inp->inp_flags &= ~INP_INHASHLIST; 2505 } 2506 2507 /* 2508 * Move PCB to the proper hash bucket when { faddr, fport } have been 2509 * changed. NOTE: This does not handle the case of the lport changing (the 2510 * hashed port list would have to be updated as well), so the lport must 2511 * not change after in_pcbinshash() has been called. 2512 * 2513 * XXXGL: a race between this function and SMR-protected hash iterator 2514 * will lead to iterator traversing a possibly wrong hash list. However, 2515 * this race should have been here since change from rwlock to epoch. 
2516 */ 2517 void 2518 in_pcbrehash(struct inpcb *inp) 2519 { 2520 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2521 struct inpcbhead *head; 2522 2523 INP_WLOCK_ASSERT(inp); 2524 INP_HASH_WLOCK_ASSERT(pcbinfo); 2525 2526 KASSERT(inp->inp_flags & INP_INHASHLIST, 2527 ("in_pcbrehash: !INP_INHASHLIST")); 2528 2529 #ifdef INET6 2530 if (inp->inp_vflag & INP_IPV6) 2531 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2532 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2533 else 2534 #endif 2535 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2536 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2537 2538 CK_LIST_REMOVE(inp, inp_hash); 2539 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2540 } 2541 2542 /* 2543 * Check for alternatives when higher level complains 2544 * about service problems. For now, invalidate cached 2545 * routing information. If the route was created dynamically 2546 * (by a redirect), time to try a default gateway again. 2547 */ 2548 void 2549 in_losing(struct inpcb *inp) 2550 { 2551 2552 RO_INVALIDATE_CACHE(&inp->inp_route); 2553 return; 2554 } 2555 2556 /* 2557 * A set label operation has occurred at the socket layer, propagate the 2558 * label change into the in_pcb for the socket. 
2559 */ 2560 void 2561 in_pcbsosetlabel(struct socket *so) 2562 { 2563 #ifdef MAC 2564 struct inpcb *inp; 2565 2566 inp = sotoinpcb(so); 2567 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2568 2569 INP_WLOCK(inp); 2570 SOCK_LOCK(so); 2571 mac_inpcb_sosetlabel(so, inp); 2572 SOCK_UNLOCK(so); 2573 INP_WUNLOCK(inp); 2574 #endif 2575 } 2576 2577 void 2578 inp_wlock(struct inpcb *inp) 2579 { 2580 2581 INP_WLOCK(inp); 2582 } 2583 2584 void 2585 inp_wunlock(struct inpcb *inp) 2586 { 2587 2588 INP_WUNLOCK(inp); 2589 } 2590 2591 void 2592 inp_rlock(struct inpcb *inp) 2593 { 2594 2595 INP_RLOCK(inp); 2596 } 2597 2598 void 2599 inp_runlock(struct inpcb *inp) 2600 { 2601 2602 INP_RUNLOCK(inp); 2603 } 2604 2605 #ifdef INVARIANT_SUPPORT 2606 void 2607 inp_lock_assert(struct inpcb *inp) 2608 { 2609 2610 INP_WLOCK_ASSERT(inp); 2611 } 2612 2613 void 2614 inp_unlock_assert(struct inpcb *inp) 2615 { 2616 2617 INP_UNLOCK_ASSERT(inp); 2618 } 2619 #endif 2620 2621 void 2622 inp_apply_all(struct inpcbinfo *pcbinfo, 2623 void (*func)(struct inpcb *, void *), void *arg) 2624 { 2625 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2626 INPLOOKUP_WLOCKPCB); 2627 struct inpcb *inp; 2628 2629 while ((inp = inp_next(&inpi)) != NULL) 2630 func(inp, arg); 2631 } 2632 2633 struct socket * 2634 inp_inpcbtosocket(struct inpcb *inp) 2635 { 2636 2637 INP_WLOCK_ASSERT(inp); 2638 return (inp->inp_socket); 2639 } 2640 2641 struct tcpcb * 2642 inp_inpcbtotcpcb(struct inpcb *inp) 2643 { 2644 2645 INP_WLOCK_ASSERT(inp); 2646 return ((struct tcpcb *)inp->inp_ppcb); 2647 } 2648 2649 int 2650 inp_ip_tos_get(const struct inpcb *inp) 2651 { 2652 2653 return (inp->inp_ip_tos); 2654 } 2655 2656 void 2657 inp_ip_tos_set(struct inpcb *inp, int val) 2658 { 2659 2660 inp->inp_ip_tos = val; 2661 } 2662 2663 void 2664 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2665 uint32_t *faddr, uint16_t *fp) 2666 { 2667 2668 INP_LOCK_ASSERT(inp); 2669 *laddr = inp->inp_laddr.s_addr; 2670 
*faddr = inp->inp_faddr.s_addr; 2671 *lp = inp->inp_lport; 2672 *fp = inp->inp_fport; 2673 } 2674 2675 struct inpcb * 2676 so_sotoinpcb(struct socket *so) 2677 { 2678 2679 return (sotoinpcb(so)); 2680 } 2681 2682 /* 2683 * Create an external-format (``xinpcb'') structure using the information in 2684 * the kernel-format in_pcb structure pointed to by inp. This is done to 2685 * reduce the spew of irrelevant information over this interface, to isolate 2686 * user code from changes in the kernel structure, and potentially to provide 2687 * information-hiding if we decide that some of this information should be 2688 * hidden from users. 2689 */ 2690 void 2691 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2692 { 2693 2694 bzero(xi, sizeof(*xi)); 2695 xi->xi_len = sizeof(struct xinpcb); 2696 if (inp->inp_socket) 2697 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2698 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2699 xi->inp_gencnt = inp->inp_gencnt; 2700 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2701 xi->inp_flow = inp->inp_flow; 2702 xi->inp_flowid = inp->inp_flowid; 2703 xi->inp_flowtype = inp->inp_flowtype; 2704 xi->inp_flags = inp->inp_flags; 2705 xi->inp_flags2 = inp->inp_flags2; 2706 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2707 xi->in6p_cksum = inp->in6p_cksum; 2708 xi->in6p_hops = inp->in6p_hops; 2709 xi->inp_ip_tos = inp->inp_ip_tos; 2710 xi->inp_vflag = inp->inp_vflag; 2711 xi->inp_ip_ttl = inp->inp_ip_ttl; 2712 xi->inp_ip_p = inp->inp_ip_p; 2713 xi->inp_ip_minttl = inp->inp_ip_minttl; 2714 } 2715 2716 int 2717 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2718 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2719 { 2720 struct sockopt sopt; 2721 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2722 INPLOOKUP_WLOCKPCB); 2723 struct inpcb *inp; 2724 struct sockopt_parameters *params; 2725 struct socket *so; 2726 int error; 2727 char buf[1024]; 2728 2729 if (req->oldptr != NULL || 
req->oldlen != 0) 2730 return (EINVAL); 2731 if (req->newptr == NULL) 2732 return (EPERM); 2733 if (req->newlen > sizeof(buf)) 2734 return (ENOMEM); 2735 error = SYSCTL_IN(req, buf, req->newlen); 2736 if (error != 0) 2737 return (error); 2738 if (req->newlen < sizeof(struct sockopt_parameters)) 2739 return (EINVAL); 2740 params = (struct sockopt_parameters *)buf; 2741 sopt.sopt_level = params->sop_level; 2742 sopt.sopt_name = params->sop_optname; 2743 sopt.sopt_dir = SOPT_SET; 2744 sopt.sopt_val = params->sop_optval; 2745 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2746 sopt.sopt_td = NULL; 2747 #ifdef INET6 2748 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2749 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2750 params->sop_inc.inc6_laddr.s6_addr16[1] = 2751 htons(params->sop_inc.inc6_zoneid & 0xffff); 2752 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2753 params->sop_inc.inc6_faddr.s6_addr16[1] = 2754 htons(params->sop_inc.inc6_zoneid & 0xffff); 2755 } 2756 #endif 2757 if (params->sop_inc.inc_lport != htons(0)) { 2758 if (params->sop_inc.inc_fport == htons(0)) 2759 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, 2760 pcbinfo->ipi_hashmask); 2761 else 2762 #ifdef INET6 2763 if (params->sop_inc.inc_flags & INC_ISIPV6) 2764 inpi.hash = INP6_PCBHASH( 2765 ¶ms->sop_inc.inc6_faddr, 2766 params->sop_inc.inc_lport, 2767 params->sop_inc.inc_fport, 2768 pcbinfo->ipi_hashmask); 2769 else 2770 #endif 2771 inpi.hash = INP_PCBHASH( 2772 ¶ms->sop_inc.inc_faddr, 2773 params->sop_inc.inc_lport, 2774 params->sop_inc.inc_fport, 2775 pcbinfo->ipi_hashmask); 2776 } 2777 while ((inp = inp_next(&inpi)) != NULL) 2778 if (inp->inp_gencnt == params->sop_id) { 2779 if (inp->inp_flags & INP_DROPPED) { 2780 INP_WUNLOCK(inp); 2781 return (ECONNRESET); 2782 } 2783 so = inp->inp_socket; 2784 KASSERT(so != NULL, ("inp_socket == NULL")); 2785 soref(so); 2786 error = (*ctloutput_set)(inp, &sopt); 2787 sorele(so); 2788 break; 2789 } 2790 if (inp == 
NULL) 2791 error = ESRCH; 2792 return (error); 2793 } 2794 2795 #ifdef DDB 2796 static void 2797 db_print_indent(int indent) 2798 { 2799 int i; 2800 2801 for (i = 0; i < indent; i++) 2802 db_printf(" "); 2803 } 2804 2805 static void 2806 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2807 { 2808 char faddr_str[48], laddr_str[48]; 2809 2810 db_print_indent(indent); 2811 db_printf("%s at %p\n", name, inc); 2812 2813 indent += 2; 2814 2815 #ifdef INET6 2816 if (inc->inc_flags & INC_ISIPV6) { 2817 /* IPv6. */ 2818 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2819 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2820 } else 2821 #endif 2822 { 2823 /* IPv4. */ 2824 inet_ntoa_r(inc->inc_laddr, laddr_str); 2825 inet_ntoa_r(inc->inc_faddr, faddr_str); 2826 } 2827 db_print_indent(indent); 2828 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2829 ntohs(inc->inc_lport)); 2830 db_print_indent(indent); 2831 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2832 ntohs(inc->inc_fport)); 2833 } 2834 2835 static void 2836 db_print_inpflags(int inp_flags) 2837 { 2838 int comma; 2839 2840 comma = 0; 2841 if (inp_flags & INP_RECVOPTS) { 2842 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2843 comma = 1; 2844 } 2845 if (inp_flags & INP_RECVRETOPTS) { 2846 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2847 comma = 1; 2848 } 2849 if (inp_flags & INP_RECVDSTADDR) { 2850 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2851 comma = 1; 2852 } 2853 if (inp_flags & INP_ORIGDSTADDR) { 2854 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2855 comma = 1; 2856 } 2857 if (inp_flags & INP_HDRINCL) { 2858 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2859 comma = 1; 2860 } 2861 if (inp_flags & INP_HIGHPORT) { 2862 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2863 comma = 1; 2864 } 2865 if (inp_flags & INP_LOWPORT) { 2866 db_printf("%sINP_LOWPORT", comma ? 
", " : ""); 2867 comma = 1; 2868 } 2869 if (inp_flags & INP_ANONPORT) { 2870 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2871 comma = 1; 2872 } 2873 if (inp_flags & INP_RECVIF) { 2874 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2875 comma = 1; 2876 } 2877 if (inp_flags & INP_MTUDISC) { 2878 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2879 comma = 1; 2880 } 2881 if (inp_flags & INP_RECVTTL) { 2882 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2883 comma = 1; 2884 } 2885 if (inp_flags & INP_DONTFRAG) { 2886 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2887 comma = 1; 2888 } 2889 if (inp_flags & INP_RECVTOS) { 2890 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2891 comma = 1; 2892 } 2893 if (inp_flags & IN6P_IPV6_V6ONLY) { 2894 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2895 comma = 1; 2896 } 2897 if (inp_flags & IN6P_PKTINFO) { 2898 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2899 comma = 1; 2900 } 2901 if (inp_flags & IN6P_HOPLIMIT) { 2902 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2903 comma = 1; 2904 } 2905 if (inp_flags & IN6P_HOPOPTS) { 2906 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2907 comma = 1; 2908 } 2909 if (inp_flags & IN6P_DSTOPTS) { 2910 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2911 comma = 1; 2912 } 2913 if (inp_flags & IN6P_RTHDR) { 2914 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2915 comma = 1; 2916 } 2917 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2918 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2919 comma = 1; 2920 } 2921 if (inp_flags & IN6P_TCLASS) { 2922 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2923 comma = 1; 2924 } 2925 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2926 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2927 comma = 1; 2928 } 2929 if (inp_flags & INP_ONESBCAST) { 2930 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2931 comma = 1; 2932 } 2933 if (inp_flags & INP_DROPPED) { 2934 db_printf("%sINP_DROPPED", comma ? 
", " : ""); 2935 comma = 1; 2936 } 2937 if (inp_flags & INP_SOCKREF) { 2938 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2939 comma = 1; 2940 } 2941 if (inp_flags & IN6P_RFC2292) { 2942 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 2943 comma = 1; 2944 } 2945 if (inp_flags & IN6P_MTU) { 2946 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2947 comma = 1; 2948 } 2949 } 2950 2951 static void 2952 db_print_inpvflag(u_char inp_vflag) 2953 { 2954 int comma; 2955 2956 comma = 0; 2957 if (inp_vflag & INP_IPV4) { 2958 db_printf("%sINP_IPV4", comma ? ", " : ""); 2959 comma = 1; 2960 } 2961 if (inp_vflag & INP_IPV6) { 2962 db_printf("%sINP_IPV6", comma ? ", " : ""); 2963 comma = 1; 2964 } 2965 if (inp_vflag & INP_IPV6PROTO) { 2966 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 2967 comma = 1; 2968 } 2969 } 2970 2971 static void 2972 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2973 { 2974 2975 db_print_indent(indent); 2976 db_printf("%s at %p\n", name, inp); 2977 2978 indent += 2; 2979 2980 db_print_indent(indent); 2981 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2982 2983 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2984 2985 db_print_indent(indent); 2986 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2987 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 2988 2989 db_print_indent(indent); 2990 db_printf("inp_label: %p inp_flags: 0x%x (", 2991 inp->inp_label, inp->inp_flags); 2992 db_print_inpflags(inp->inp_flags); 2993 db_printf(")\n"); 2994 2995 db_print_indent(indent); 2996 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 2997 inp->inp_vflag); 2998 db_print_inpvflag(inp->inp_vflag); 2999 db_printf(")\n"); 3000 3001 db_print_indent(indent); 3002 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3003 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3004 3005 db_print_indent(indent); 3006 #ifdef INET6 3007 if (inp->inp_vflag & INP_IPV6) { 3008 db_printf("in6p_options: %p in6p_outputopts: %p " 3009 
"in6p_moptions: %p\n", inp->in6p_options, 3010 inp->in6p_outputopts, inp->in6p_moptions); 3011 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3012 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3013 inp->in6p_hops); 3014 } else 3015 #endif 3016 { 3017 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3018 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3019 inp->inp_options, inp->inp_moptions); 3020 } 3021 3022 db_print_indent(indent); 3023 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3024 (uintmax_t)inp->inp_gencnt); 3025 } 3026 3027 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3028 { 3029 struct inpcb *inp; 3030 3031 if (!have_addr) { 3032 db_printf("usage: show inpcb <addr>\n"); 3033 return; 3034 } 3035 inp = (struct inpcb *)addr; 3036 3037 db_print_inpcb(inp, "inpcb", 0); 3038 } 3039 #endif /* DDB */ 3040 3041 #ifdef RATELIMIT 3042 /* 3043 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3044 * if any. 3045 */ 3046 int 3047 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3048 { 3049 union if_snd_tag_modify_params params = { 3050 .rate_limit.max_rate = max_pacing_rate, 3051 .rate_limit.flags = M_NOWAIT, 3052 }; 3053 struct m_snd_tag *mst; 3054 int error; 3055 3056 mst = inp->inp_snd_tag; 3057 if (mst == NULL) 3058 return (EINVAL); 3059 3060 if (mst->sw->snd_tag_modify == NULL) { 3061 error = EOPNOTSUPP; 3062 } else { 3063 error = mst->sw->snd_tag_modify(mst, ¶ms); 3064 } 3065 return (error); 3066 } 3067 3068 /* 3069 * Query existing TX rate limit based on the existing 3070 * "inp->inp_snd_tag", if any. 
3071 */ 3072 int 3073 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3074 { 3075 union if_snd_tag_query_params params = { }; 3076 struct m_snd_tag *mst; 3077 int error; 3078 3079 mst = inp->inp_snd_tag; 3080 if (mst == NULL) 3081 return (EINVAL); 3082 3083 if (mst->sw->snd_tag_query == NULL) { 3084 error = EOPNOTSUPP; 3085 } else { 3086 error = mst->sw->snd_tag_query(mst, ¶ms); 3087 if (error == 0 && p_max_pacing_rate != NULL) 3088 *p_max_pacing_rate = params.rate_limit.max_rate; 3089 } 3090 return (error); 3091 } 3092 3093 /* 3094 * Query existing TX queue level based on the existing 3095 * "inp->inp_snd_tag", if any. 3096 */ 3097 int 3098 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3099 { 3100 union if_snd_tag_query_params params = { }; 3101 struct m_snd_tag *mst; 3102 int error; 3103 3104 mst = inp->inp_snd_tag; 3105 if (mst == NULL) 3106 return (EINVAL); 3107 3108 if (mst->sw->snd_tag_query == NULL) 3109 return (EOPNOTSUPP); 3110 3111 error = mst->sw->snd_tag_query(mst, ¶ms); 3112 if (error == 0 && p_txqueue_level != NULL) 3113 *p_txqueue_level = params.rate_limit.queue_level; 3114 return (error); 3115 } 3116 3117 /* 3118 * Allocate a new TX rate limit send tag from the network interface 3119 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3120 */ 3121 int 3122 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3123 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3124 3125 { 3126 union if_snd_tag_alloc_params params = { 3127 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 
3128 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3129 .rate_limit.hdr.flowid = flowid, 3130 .rate_limit.hdr.flowtype = flowtype, 3131 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3132 .rate_limit.max_rate = max_pacing_rate, 3133 .rate_limit.flags = M_NOWAIT, 3134 }; 3135 int error; 3136 3137 INP_WLOCK_ASSERT(inp); 3138 3139 /* 3140 * If there is already a send tag, or the INP is being torn 3141 * down, allocating a new send tag is not allowed. Else send 3142 * tags may leak. 3143 */ 3144 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3145 return (EINVAL); 3146 3147 error = m_snd_tag_alloc(ifp, ¶ms, st); 3148 #ifdef INET 3149 if (error == 0) { 3150 counter_u64_add(rate_limit_set_ok, 1); 3151 counter_u64_add(rate_limit_active, 1); 3152 } else if (error != EOPNOTSUPP) 3153 counter_u64_add(rate_limit_alloc_fail, 1); 3154 #endif 3155 return (error); 3156 } 3157 3158 void 3159 in_pcbdetach_tag(struct m_snd_tag *mst) 3160 { 3161 3162 m_snd_tag_rele(mst); 3163 #ifdef INET 3164 counter_u64_add(rate_limit_active, -1); 3165 #endif 3166 } 3167 3168 /* 3169 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3170 * if any: 3171 */ 3172 void 3173 in_pcbdetach_txrtlmt(struct inpcb *inp) 3174 { 3175 struct m_snd_tag *mst; 3176 3177 INP_WLOCK_ASSERT(inp); 3178 3179 mst = inp->inp_snd_tag; 3180 inp->inp_snd_tag = NULL; 3181 3182 if (mst == NULL) 3183 return; 3184 3185 m_snd_tag_rele(mst); 3186 #ifdef INET 3187 counter_u64_add(rate_limit_active, -1); 3188 #endif 3189 } 3190 3191 int 3192 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3193 { 3194 int error; 3195 3196 /* 3197 * If the existing send tag is for the wrong interface due to 3198 * a route change, first drop the existing tag. Set the 3199 * CHANGED flag so that we will keep trying to allocate a new 3200 * tag if we fail to allocate one this time. 
3201 */ 3202 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3203 in_pcbdetach_txrtlmt(inp); 3204 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3205 } 3206 3207 /* 3208 * NOTE: When attaching to a network interface a reference is 3209 * made to ensure the network interface doesn't go away until 3210 * all ratelimit connections are gone. The network interface 3211 * pointers compared below represent valid network interfaces, 3212 * except when comparing towards NULL. 3213 */ 3214 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3215 error = 0; 3216 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3217 if (inp->inp_snd_tag != NULL) 3218 in_pcbdetach_txrtlmt(inp); 3219 error = 0; 3220 } else if (inp->inp_snd_tag == NULL) { 3221 /* 3222 * In order to utilize packet pacing with RSS, we need 3223 * to wait until there is a valid RSS hash before we 3224 * can proceed: 3225 */ 3226 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3227 error = EAGAIN; 3228 } else { 3229 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3230 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3231 } 3232 } else { 3233 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3234 } 3235 if (error == 0 || error == EOPNOTSUPP) 3236 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3237 3238 return (error); 3239 } 3240 3241 /* 3242 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3243 * is set in the fast path and will attach/detach/modify the TX rate 3244 * limit send tag based on the socket's so_max_pacing_rate value. 
3245 */ 3246 void 3247 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3248 { 3249 struct socket *socket; 3250 uint32_t max_pacing_rate; 3251 bool did_upgrade; 3252 3253 if (inp == NULL) 3254 return; 3255 3256 socket = inp->inp_socket; 3257 if (socket == NULL) 3258 return; 3259 3260 if (!INP_WLOCKED(inp)) { 3261 /* 3262 * NOTE: If the write locking fails, we need to bail 3263 * out and use the non-ratelimited ring for the 3264 * transmit until there is a new chance to get the 3265 * write lock. 3266 */ 3267 if (!INP_TRY_UPGRADE(inp)) 3268 return; 3269 did_upgrade = 1; 3270 } else { 3271 did_upgrade = 0; 3272 } 3273 3274 /* 3275 * NOTE: The so_max_pacing_rate value is read unlocked, 3276 * because atomic updates are not required since the variable 3277 * is checked at every mbuf we send. It is assumed that the 3278 * variable read itself will be atomic. 3279 */ 3280 max_pacing_rate = socket->so_max_pacing_rate; 3281 3282 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3283 3284 if (did_upgrade) 3285 INP_DOWNGRADE(inp); 3286 } 3287 3288 /* 3289 * Track route changes for TX rate limiting. 3290 */ 3291 void 3292 in_pcboutput_eagain(struct inpcb *inp) 3293 { 3294 bool did_upgrade; 3295 3296 if (inp == NULL) 3297 return; 3298 3299 if (inp->inp_snd_tag == NULL) 3300 return; 3301 3302 if (!INP_WLOCKED(inp)) { 3303 /* 3304 * NOTE: If the write locking fails, we need to bail 3305 * out and use the non-ratelimited ring for the 3306 * transmit until there is a new chance to get the 3307 * write lock. 
3308 */ 3309 if (!INP_TRY_UPGRADE(inp)) 3310 return; 3311 did_upgrade = 1; 3312 } else { 3313 did_upgrade = 0; 3314 } 3315 3316 /* detach rate limiting */ 3317 in_pcbdetach_txrtlmt(inp); 3318 3319 /* make sure new mbuf send tag allocation is made */ 3320 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3321 3322 if (did_upgrade) 3323 INP_DOWNGRADE(inp); 3324 } 3325 3326 #ifdef INET 3327 static void 3328 rl_init(void *st) 3329 { 3330 rate_limit_new = counter_u64_alloc(M_WAITOK); 3331 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3332 rate_limit_active = counter_u64_alloc(M_WAITOK); 3333 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3334 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3335 } 3336 3337 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3338 #endif 3339 #endif /* RATELIMIT */ 3340