1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/hash.h> 53 #include <sys/systm.h> 54 #include <sys/libkern.h> 55 #include <sys/lock.h> 56 #include <sys/malloc.h> 57 #include <sys/mbuf.h> 58 #include <sys/eventhandler.h> 59 #include <sys/domain.h> 60 #include <sys/protosw.h> 61 #include <sys/smp.h> 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <sys/sockio.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/refcount.h> 68 #include <sys/jail.h> 69 #include <sys/kernel.h> 70 #include <sys/sysctl.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 #include <vm/vm.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #ifdef INET 92 #include <netinet/in_var.h> 93 #include <netinet/in_fib.h> 94 #endif 95 #include <netinet/ip_var.h> 96 #include <netinet/tcp_var.h> 97 #ifdef TCPHPTS 98 #include <netinet/tcp_hpts.h> 99 #endif 100 #include <netinet/udp.h> 101 #include <netinet/udp_var.h> 102 #ifdef INET6 103 #include <netinet/ip6.h> 104 #include <netinet6/in6_pcb.h> 105 #include <netinet6/in6_var.h> 106 #include <netinet6/ip6_var.h> 107 #endif /* INET6 */ 108 #include <net/route/nhop.h> 109 #endif 110 111 #include <netipsec/ipsec_support.h> 112 113 #include <security/mac/mac_framework.h> 114 115 #define INPCBLBGROUP_SIZMIN 8 116 #define INPCBLBGROUP_SIZMAX 256 117 #define INP_FREED 0x00000200 /* See in_pcb.h. */ 118 119 /* 120 * These configure the range of local port addresses assigned to 121 * "unspecified" outgoing connections/packets/whatever. 122 */ 123 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 124 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 125 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 126 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 127 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 128 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 129 130 /* 131 * Reserved ports accessible only to root. There are significant 132 * security considerations that must be accounted for when changing these, 133 * but the security benefits can be great. Please be careful. 134 */ 135 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 136 VNET_DEFINE(int, ipport_reservedlow); 137 138 /* Enable random ephemeral port allocation by default. */ 139 VNET_DEFINE(int, ipport_randomized) = 1; 140 141 #ifdef INET 142 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 143 struct in_addr faddr, u_int fport_arg, 144 struct in_addr laddr, u_int lport_arg, 145 int lookupflags, struct ifnet *ifp, 146 uint8_t numa_domain); 147 148 #define RANGECHK(var, min, max) \ 149 if ((var) < (min)) { (var) = (min); } \ 150 else if ((var) > (max)) { (var) = (max); } 151 152 static int 153 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 154 { 155 int error; 156 157 error = sysctl_handle_int(oidp, arg1, arg2, req); 158 if (error == 0) { 159 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 160 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 161 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 162 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 163 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 164 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 165 } 166 return (error); 167 } 168 169 #undef RANGECHK 170 171 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 172 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 173 "IP Ports"); 174 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 196 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 197 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 198 ""); 199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 200 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 201 &VNET_NAME(ipport_reservedhigh), 0, ""); 202 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 203 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 205 CTLFLAG_VNET | CTLFLAG_RW, 206 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 207 208 #ifdef RATELIMIT 209 counter_u64_t rate_limit_new; 210 counter_u64_t rate_limit_chg; 211 counter_u64_t rate_limit_active; 212 counter_u64_t rate_limit_alloc_fail; 213 counter_u64_t rate_limit_set_ok; 214 215 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 216 "IP Rate Limiting"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 218 &rate_limit_active, "Active rate limited connections"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 220 &rate_limit_alloc_fail, "Rate limited connection failures"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 222 &rate_limit_set_ok, "Rate limited setting succeeded"); 223 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 224 &rate_limit_new, "Total Rate limit new attempts"); 225 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 226 &rate_limit_chg, "Total Rate limited change attempts"); 227 228 #endif /* RATELIMIT */ 229 230 #endif /* INET */ 231 232 VNET_DEFINE(uint32_t, in_pcbhashseed); 233 static void 234 in_pcbhashseed_init(void) 235 { 236 237 V_in_pcbhashseed = arc4random(); 238 } 239 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 240 in_pcbhashseed_init, 0); 241 242 static void in_pcbremhash(struct inpcb *); 243 244 /* 245 * in_pcb.c: manage the Protocol Control Blocks. 246 * 247 * NOTE: It is assumed that most of these functions will be called with 248 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 249 * functions often modify hash chains or addresses in pcbs. 250 */ 251 252 static struct inpcblbgroup * 253 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, 254 u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, 255 uint8_t numa_domain) 256 { 257 struct inpcblbgroup *grp; 258 size_t bytes; 259 260 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 261 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 262 if (grp == NULL) 263 return (NULL); 264 grp->il_cred = crhold(cred); 265 grp->il_vflag = vflag; 266 grp->il_lport = port; 267 grp->il_numa_domain = numa_domain; 268 grp->il_dependladdr = *addr; 269 grp->il_inpsiz = size; 270 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 271 return (grp); 272 } 273 274 static void 275 in_pcblbgroup_free_deferred(epoch_context_t ctx) 276 { 277 struct inpcblbgroup *grp; 278 279 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 280 crfree(grp->il_cred); 281 free(grp, M_PCB); 282 } 283 284 static void 285 in_pcblbgroup_free(struct inpcblbgroup *grp) 286 { 287 288 CK_LIST_REMOVE(grp, il_list); 289 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 290 } 291 292 static struct inpcblbgroup * 293 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 294 struct inpcblbgroup *old_grp, int size) 295 { 296 struct inpcblbgroup *grp; 297 int i; 298 299 grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, 300 old_grp->il_lport, &old_grp->il_dependladdr, size, 301 old_grp->il_numa_domain); 302 if (grp == NULL) 303 return (NULL); 304 305 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 306 ("invalid new local group size %d and old local group count %d", 307 grp->il_inpsiz, old_grp->il_inpcnt)); 308 309 for (i = 0; i < old_grp->il_inpcnt; ++i) 310 grp->il_inp[i] = old_grp->il_inp[i]; 311 grp->il_inpcnt = old_grp->il_inpcnt; 312 in_pcblbgroup_free(old_grp); 313 return (grp); 314 } 315 316 /* 317 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 318 * and shrink group if possible. 319 */ 320 static void 321 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 322 int i) 323 { 324 struct inpcblbgroup *grp, *new_grp; 325 326 grp = *grpp; 327 for (; i + 1 < grp->il_inpcnt; ++i) 328 grp->il_inp[i] = grp->il_inp[i + 1]; 329 grp->il_inpcnt--; 330 331 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 332 grp->il_inpcnt <= grp->il_inpsiz / 4) { 333 /* Shrink this group. */ 334 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 335 if (new_grp != NULL) 336 *grpp = new_grp; 337 } 338 } 339 340 /* 341 * Add PCB to load balance group for SO_REUSEPORT_LB option. 342 */ 343 static int 344 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 345 { 346 const static struct timeval interval = { 60, 0 }; 347 static struct timeval lastprint; 348 struct inpcbinfo *pcbinfo; 349 struct inpcblbgrouphead *hdr; 350 struct inpcblbgroup *grp; 351 uint32_t idx; 352 353 pcbinfo = inp->inp_pcbinfo; 354 355 INP_WLOCK_ASSERT(inp); 356 INP_HASH_WLOCK_ASSERT(pcbinfo); 357 358 #ifdef INET6 359 /* 360 * Don't allow IPv4 mapped INET6 wild socket. 361 */ 362 if ((inp->inp_vflag & INP_IPV4) && 363 inp->inp_laddr.s_addr == INADDR_ANY && 364 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 365 return (0); 366 } 367 #endif 368 369 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 370 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 371 CK_LIST_FOREACH(grp, hdr, il_list) { 372 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 373 grp->il_vflag == inp->inp_vflag && 374 grp->il_lport == inp->inp_lport && 375 grp->il_numa_domain == numa_domain && 376 memcmp(&grp->il_dependladdr, 377 &inp->inp_inc.inc_ie.ie_dependladdr, 378 sizeof(grp->il_dependladdr)) == 0) { 379 break; 380 } 381 } 382 if (grp == NULL) { 383 /* Create new load balance group. */ 384 grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, 385 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 386 INPCBLBGROUP_SIZMIN, numa_domain); 387 if (grp == NULL) 388 return (ENOBUFS); 389 } else if (grp->il_inpcnt == grp->il_inpsiz) { 390 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 391 if (ratecheck(&lastprint, &interval)) 392 printf("lb group port %d, limit reached\n", 393 ntohs(grp->il_lport)); 394 return (0); 395 } 396 397 /* Expand this local group. */ 398 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 399 if (grp == NULL) 400 return (ENOBUFS); 401 } 402 403 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 404 ("invalid local group size %d and count %d", grp->il_inpsiz, 405 grp->il_inpcnt)); 406 407 grp->il_inp[grp->il_inpcnt] = inp; 408 grp->il_inpcnt++; 409 return (0); 410 } 411 412 /* 413 * Remove PCB from load balance group. 414 */ 415 static void 416 in_pcbremlbgrouphash(struct inpcb *inp) 417 { 418 struct inpcbinfo *pcbinfo; 419 struct inpcblbgrouphead *hdr; 420 struct inpcblbgroup *grp; 421 int i; 422 423 pcbinfo = inp->inp_pcbinfo; 424 425 INP_WLOCK_ASSERT(inp); 426 INP_HASH_WLOCK_ASSERT(pcbinfo); 427 428 hdr = &pcbinfo->ipi_lbgrouphashbase[ 429 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 430 CK_LIST_FOREACH(grp, hdr, il_list) { 431 for (i = 0; i < grp->il_inpcnt; ++i) { 432 if (grp->il_inp[i] != inp) 433 continue; 434 435 if (grp->il_inpcnt == 1) { 436 /* We are the last, free this local group. */ 437 in_pcblbgroup_free(grp); 438 } else { 439 /* Pull up inpcbs, shrink group if possible. */ 440 in_pcblbgroup_reorder(hdr, &grp, i); 441 } 442 return; 443 } 444 } 445 } 446 447 int 448 in_pcblbgroup_numa(struct inpcb *inp, int arg) 449 { 450 struct inpcbinfo *pcbinfo; 451 struct inpcblbgrouphead *hdr; 452 struct inpcblbgroup *grp; 453 int err, i; 454 uint8_t numa_domain; 455 456 switch (arg) { 457 case TCP_REUSPORT_LB_NUMA_NODOM: 458 numa_domain = M_NODOM; 459 break; 460 case TCP_REUSPORT_LB_NUMA_CURDOM: 461 numa_domain = PCPU_GET(domain); 462 break; 463 default: 464 if (arg < 0 || arg >= vm_ndomains) 465 return (EINVAL); 466 numa_domain = arg; 467 } 468 469 err = 0; 470 pcbinfo = inp->inp_pcbinfo; 471 INP_WLOCK_ASSERT(inp); 472 INP_HASH_WLOCK(pcbinfo); 473 hdr = &pcbinfo->ipi_lbgrouphashbase[ 474 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 475 CK_LIST_FOREACH(grp, hdr, il_list) { 476 for (i = 0; i < grp->il_inpcnt; ++i) { 477 if (grp->il_inp[i] != inp) 478 continue; 479 480 if (grp->il_numa_domain == numa_domain) { 481 goto abort_with_hash_wlock; 482 } 483 484 /* Remove it from the old group. */ 485 in_pcbremlbgrouphash(inp); 486 487 /* Add it to the new group based on numa domain. */ 488 in_pcbinslbgrouphash(inp, numa_domain); 489 goto abort_with_hash_wlock; 490 } 491 } 492 err = ENOENT; 493 abort_with_hash_wlock: 494 INP_HASH_WUNLOCK(pcbinfo); 495 return (err); 496 } 497 498 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 499 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 500 501 /* 502 * Initialize an inpcbinfo - a per-VNET instance of connections db. 503 */ 504 void 505 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 506 u_int hash_nelements, u_int porthash_nelements) 507 { 508 509 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 510 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 511 NULL, MTX_DEF); 512 #ifdef VIMAGE 513 pcbinfo->ipi_vnet = curvnet; 514 #endif 515 CK_LIST_INIT(&pcbinfo->ipi_listhead); 516 pcbinfo->ipi_count = 0; 517 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 518 &pcbinfo->ipi_hashmask); 519 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 520 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 521 &pcbinfo->ipi_porthashmask); 522 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 523 &pcbinfo->ipi_lbgrouphashmask); 524 pcbinfo->ipi_zone = pcbstor->ips_zone; 525 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 526 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 527 } 528 529 /* 530 * Destroy an inpcbinfo. 531 */ 532 void 533 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 534 { 535 536 KASSERT(pcbinfo->ipi_count == 0, 537 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 538 539 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 540 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 541 pcbinfo->ipi_porthashmask); 542 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 543 pcbinfo->ipi_lbgrouphashmask); 544 mtx_destroy(&pcbinfo->ipi_hash_lock); 545 mtx_destroy(&pcbinfo->ipi_lock); 546 } 547 548 /* 549 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 550 */ 551 static void inpcb_dtor(void *, int, void *); 552 static void inpcb_fini(void *, int); 553 void 554 in_pcbstorage_init(void *arg) 555 { 556 struct inpcbstorage *pcbstor = arg; 557 558 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 559 sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit, 560 inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); 561 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 562 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 563 uma_zone_set_smr(pcbstor->ips_portzone, 564 uma_zone_get_smr(pcbstor->ips_zone)); 565 } 566 567 /* 568 * Destroy a pcbstorage - used by unloadable protocols. 569 */ 570 void 571 in_pcbstorage_destroy(void *arg) 572 { 573 struct inpcbstorage *pcbstor = arg; 574 575 uma_zdestroy(pcbstor->ips_zone); 576 uma_zdestroy(pcbstor->ips_portzone); 577 } 578 579 /* 580 * Allocate a PCB and associate it with the socket. 581 * On success return with the PCB locked. 582 */ 583 int 584 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 585 { 586 struct inpcb *inp; 587 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 588 int error; 589 #endif 590 591 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 592 if (inp == NULL) 593 return (ENOBUFS); 594 bzero(&inp->inp_start_zero, inp_zero_size); 595 #ifdef NUMA 596 inp->inp_numa_domain = M_NODOM; 597 #endif 598 inp->inp_pcbinfo = pcbinfo; 599 inp->inp_socket = so; 600 inp->inp_cred = crhold(so->so_cred); 601 inp->inp_inc.inc_fibnum = so->so_fibnum; 602 #ifdef MAC 603 error = mac_inpcb_init(inp, M_NOWAIT); 604 if (error != 0) 605 goto out; 606 mac_inpcb_create(so, inp); 607 #endif 608 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 609 error = ipsec_init_pcbpolicy(inp); 610 if (error != 0) { 611 #ifdef MAC 612 mac_inpcb_destroy(inp); 613 #endif 614 goto out; 615 } 616 #endif /*IPSEC*/ 617 #ifdef INET6 618 if (INP_SOCKAF(so) == AF_INET6) { 619 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 620 if (V_ip6_v6only) 621 inp->inp_flags |= IN6P_IPV6_V6ONLY; 622 #ifdef INET 623 else 624 inp->inp_vflag |= INP_IPV4; 625 #endif 626 if (V_ip6_auto_flowlabel) 627 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 628 inp->in6p_hops = -1; /* use kernel default */ 629 } 630 #endif 631 #if defined(INET) && defined(INET6) 632 else 633 #endif 634 #ifdef INET 635 inp->inp_vflag |= INP_IPV4; 636 #endif 637 /* 638 * Routes in inpcb's can cache L2 as well; they are guaranteed 639 * to be cleaned up. 640 */ 641 inp->inp_route.ro_flags = RT_LLE_CACHE; 642 #ifdef TCPHPTS 643 /* 644 * If using hpts lets drop a random number in so 645 * not all new connections fall on the same CPU. 646 */ 647 inp->inp_hpts_cpu = hpts_random_cpu(inp); 648 #endif 649 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ 650 INP_WLOCK(inp); 651 INP_INFO_WLOCK(pcbinfo); 652 pcbinfo->ipi_count++; 653 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 654 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 655 INP_INFO_WUNLOCK(pcbinfo); 656 so->so_pcb = inp; 657 658 return (0); 659 660 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 661 out: 662 uma_zfree_smr(pcbinfo->ipi_zone, inp); 663 return (error); 664 #endif 665 } 666 667 #ifdef INET 668 int 669 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 670 { 671 int anonport, error; 672 673 KASSERT(nam == NULL || nam->sa_family == AF_INET, 674 ("%s: invalid address family for %p", __func__, nam)); 675 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 676 ("%s: invalid address length for %p", __func__, nam)); 677 INP_WLOCK_ASSERT(inp); 678 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 679 680 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 681 return (EINVAL); 682 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 683 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 684 &inp->inp_lport, cred); 685 if (error) 686 return (error); 687 if (in_pcbinshash(inp) != 0) { 688 inp->inp_laddr.s_addr = INADDR_ANY; 689 inp->inp_lport = 0; 690 return (EAGAIN); 691 } 692 if (anonport) 693 inp->inp_flags |= INP_ANONPORT; 694 return (0); 695 } 696 #endif 697 698 #if defined(INET) || defined(INET6) 699 /* 700 * Assign a local port like in_pcb_lport(), but also used with connect() 701 * and a foreign address and port. If fsa is non-NULL, choose a local port 702 * that is unused with those, otherwise one that is completely unused. 703 * lsa can be NULL for IPv6. 704 */ 705 int 706 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 707 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 708 { 709 struct inpcbinfo *pcbinfo; 710 struct inpcb *tmpinp; 711 unsigned short *lastport; 712 int count, error; 713 u_short aux, first, last, lport; 714 #ifdef INET 715 struct in_addr laddr, faddr; 716 #endif 717 #ifdef INET6 718 struct in6_addr *laddr6, *faddr6; 719 #endif 720 721 pcbinfo = inp->inp_pcbinfo; 722 723 /* 724 * Because no actual state changes occur here, a global write lock on 725 * the pcbinfo isn't required. 726 */ 727 INP_LOCK_ASSERT(inp); 728 INP_HASH_LOCK_ASSERT(pcbinfo); 729 730 if (inp->inp_flags & INP_HIGHPORT) { 731 first = V_ipport_hifirstauto; /* sysctl */ 732 last = V_ipport_hilastauto; 733 lastport = &pcbinfo->ipi_lasthi; 734 } else if (inp->inp_flags & INP_LOWPORT) { 735 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 736 if (error) 737 return (error); 738 first = V_ipport_lowfirstauto; /* 1023 */ 739 last = V_ipport_lowlastauto; /* 600 */ 740 lastport = &pcbinfo->ipi_lastlow; 741 } else { 742 first = V_ipport_firstauto; /* sysctl */ 743 last = V_ipport_lastauto; 744 lastport = &pcbinfo->ipi_lastport; 745 } 746 747 /* 748 * Instead of having two loops further down counting up or down 749 * make sure that first is always <= last and go with only one 750 * code path implementing all logic. 751 */ 752 if (first > last) { 753 aux = first; 754 first = last; 755 last = aux; 756 } 757 758 #ifdef INET 759 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 760 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 761 if (lsa != NULL) 762 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 763 if (fsa != NULL) 764 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 765 } 766 #endif 767 #ifdef INET6 768 laddr6 = NULL; 769 if ((inp->inp_vflag & INP_IPV6) != 0) { 770 if (lsa != NULL) 771 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 772 if (fsa != NULL) 773 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 774 } 775 #endif 776 777 tmpinp = NULL; 778 lport = *lportp; 779 780 if (V_ipport_randomized) 781 *lastport = first + (arc4random() % (last - first)); 782 783 count = last - first; 784 785 do { 786 if (count-- < 0) /* completely used? */ 787 return (EADDRNOTAVAIL); 788 ++*lastport; 789 if (*lastport < first || *lastport > last) 790 *lastport = first; 791 lport = htons(*lastport); 792 793 if (fsa != NULL) { 794 #ifdef INET 795 if (lsa->sa_family == AF_INET) { 796 tmpinp = in_pcblookup_hash_locked(pcbinfo, 797 faddr, fport, laddr, lport, lookupflags, 798 NULL, M_NODOM); 799 } 800 #endif 801 #ifdef INET6 802 if (lsa->sa_family == AF_INET6) { 803 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 804 faddr6, fport, laddr6, lport, lookupflags, 805 NULL, M_NODOM); 806 } 807 #endif 808 } else { 809 #ifdef INET6 810 if ((inp->inp_vflag & INP_IPV6) != 0) { 811 tmpinp = in6_pcblookup_local(pcbinfo, 812 &inp->in6p_laddr, lport, lookupflags, cred); 813 #ifdef INET 814 if (tmpinp == NULL && 815 (inp->inp_vflag & INP_IPV4)) 816 tmpinp = in_pcblookup_local(pcbinfo, 817 laddr, lport, lookupflags, cred); 818 #endif 819 } 820 #endif 821 #if defined(INET) && defined(INET6) 822 else 823 #endif 824 #ifdef INET 825 tmpinp = in_pcblookup_local(pcbinfo, laddr, 826 lport, lookupflags, cred); 827 #endif 828 } 829 } while (tmpinp != NULL); 830 831 *lportp = lport; 832 833 return (0); 834 } 835 836 /* 837 * Select a local port (number) to use. 838 */ 839 int 840 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 841 struct ucred *cred, int lookupflags) 842 { 843 struct sockaddr_in laddr; 844 845 if (laddrp) { 846 bzero(&laddr, sizeof(laddr)); 847 laddr.sin_family = AF_INET; 848 laddr.sin_addr = *laddrp; 849 } 850 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 851 NULL, lportp, NULL, 0, cred, lookupflags)); 852 } 853 854 /* 855 * Return cached socket options. 856 */ 857 int 858 inp_so_options(const struct inpcb *inp) 859 { 860 int so_options; 861 862 so_options = 0; 863 864 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 865 so_options |= SO_REUSEPORT_LB; 866 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 867 so_options |= SO_REUSEPORT; 868 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 869 so_options |= SO_REUSEADDR; 870 return (so_options); 871 } 872 #endif /* INET || INET6 */ 873 874 /* 875 * Check if a new BINDMULTI socket is allowed to be created. 876 * 877 * ni points to the new inp. 878 * oi points to the existing inp. 879 * 880 * This checks whether the existing inp also has BINDMULTI and 881 * whether the credentials match. 882 */ 883 int 884 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) 885 { 886 /* Check permissions match */ 887 if ((ni->inp_flags2 & INP_BINDMULTI) && 888 (ni->inp_cred->cr_uid != 889 oi->inp_cred->cr_uid)) 890 return (0); 891 892 /* Check the existing inp has BINDMULTI set */ 893 if ((ni->inp_flags2 & INP_BINDMULTI) && 894 ((oi->inp_flags2 & INP_BINDMULTI) == 0)) 895 return (0); 896 897 /* 898 * We're okay - either INP_BINDMULTI isn't set on ni, or 899 * it is and it matches the checks. 900 */ 901 return (1); 902 } 903 904 #ifdef INET 905 /* 906 * Set up a bind operation on a PCB, performing port allocation 907 * as required, but do not actually modify the PCB. Callers can 908 * either complete the bind by setting inp_laddr/inp_lport and 909 * calling in_pcbinshash(), or they can just use the resulting 910 * port and address to authorise the sending of a once-off packet. 911 * 912 * On error, the values of *laddrp and *lportp are not changed. 913 */ 914 int 915 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 916 u_short *lportp, struct ucred *cred) 917 { 918 struct socket *so = inp->inp_socket; 919 struct sockaddr_in *sin; 920 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 921 struct in_addr laddr; 922 u_short lport = 0; 923 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); 924 int error; 925 926 /* 927 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here 928 * so that we don't have to add to the (already messy) code below. 929 */ 930 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); 931 932 /* 933 * No state changes, so read locks are sufficient here. 934 */ 935 INP_LOCK_ASSERT(inp); 936 INP_HASH_LOCK_ASSERT(pcbinfo); 937 938 laddr.s_addr = *laddrp; 939 if (nam != NULL && laddr.s_addr != INADDR_ANY) 940 return (EINVAL); 941 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) 942 lookupflags = INPLOOKUP_WILDCARD; 943 if (nam == NULL) { 944 if ((error = prison_local_ip4(cred, &laddr)) != 0) 945 return (error); 946 } else { 947 sin = (struct sockaddr_in *)nam; 948 KASSERT(sin->sin_family == AF_INET, 949 ("%s: invalid family for address %p", __func__, sin)); 950 KASSERT(sin->sin_len == sizeof(*sin), 951 ("%s: invalid length for address %p", __func__, sin)); 952 953 error = prison_local_ip4(cred, &sin->sin_addr); 954 if (error) 955 return (error); 956 if (sin->sin_port != *lportp) { 957 /* Don't allow the port to change. */ 958 if (*lportp != 0) 959 return (EINVAL); 960 lport = sin->sin_port; 961 } 962 /* NB: lport is left as 0 if the port isn't being changed. */ 963 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 964 /* 965 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 966 * allow complete duplication of binding if 967 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 968 * and a multicast address is bound on both 969 * new and duplicated sockets. 970 */ 971 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) 972 reuseport = SO_REUSEADDR|SO_REUSEPORT; 973 /* 974 * XXX: How to deal with SO_REUSEPORT_LB here? 975 * Treat same as SO_REUSEPORT for now. 976 */ 977 if ((so->so_options & 978 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) 979 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; 980 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 981 sin->sin_port = 0; /* yech... */ 982 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 983 /* 984 * Is the address a local IP address? 985 * If INP_BINDANY is set, then the socket may be bound 986 * to any endpoint address, local or not. 987 */ 988 if ((inp->inp_flags & INP_BINDANY) == 0 && 989 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 990 return (EADDRNOTAVAIL); 991 } 992 laddr = sin->sin_addr; 993 if (lport) { 994 struct inpcb *t; 995 996 /* GROSS */ 997 if (ntohs(lport) <= V_ipport_reservedhigh && 998 ntohs(lport) >= V_ipport_reservedlow && 999 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 1000 return (EACCES); 1001 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 1002 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 1003 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1004 lport, INPLOOKUP_WILDCARD, cred); 1005 /* 1006 * XXX 1007 * This entire block sorely needs a rewrite. 1008 */ 1009 if (t && 1010 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1011 (so->so_type != SOCK_STREAM || 1012 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 1013 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 1014 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 1015 (t->inp_flags2 & INP_REUSEPORT) || 1016 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && 1017 (inp->inp_cred->cr_uid != 1018 t->inp_cred->cr_uid)) 1019 return (EADDRINUSE); 1020 1021 /* 1022 * If the socket is a BINDMULTI socket, then 1023 * the credentials need to match and the 1024 * original socket also has to have been bound 1025 * with BINDMULTI. 1026 */ 1027 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1028 return (EADDRINUSE); 1029 } 1030 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1031 lport, lookupflags, cred); 1032 if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1033 (reuseport & inp_so_options(t)) == 0 && 1034 (reuseport_lb & inp_so_options(t)) == 0) { 1035 #ifdef INET6 1036 if (ntohl(sin->sin_addr.s_addr) != 1037 INADDR_ANY || 1038 ntohl(t->inp_laddr.s_addr) != 1039 INADDR_ANY || 1040 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 1041 (t->inp_vflag & INP_IPV6PROTO) == 0) 1042 #endif 1043 return (EADDRINUSE); 1044 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1045 return (EADDRINUSE); 1046 } 1047 } 1048 } 1049 if (*lportp != 0) 1050 lport = *lportp; 1051 if (lport == 0) { 1052 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1053 if (error != 0) 1054 return (error); 1055 } 1056 *laddrp = laddr.s_addr; 1057 *lportp = lport; 1058 return (0); 1059 } 1060 1061 /* 1062 * Connect from a socket to a specified address. 1063 * Both address and port must be specified in argument sin. 1064 * If don't have a local address for this socket yet, 1065 * then pick one. 1066 */ 1067 int 1068 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, 1069 bool rehash) 1070 { 1071 u_short lport, fport; 1072 in_addr_t laddr, faddr; 1073 int anonport, error; 1074 1075 INP_WLOCK_ASSERT(inp); 1076 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1077 1078 lport = inp->inp_lport; 1079 laddr = inp->inp_laddr.s_addr; 1080 anonport = (lport == 0); 1081 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 1082 NULL, cred); 1083 if (error) 1084 return (error); 1085 1086 /* Do the initial binding of the local address if required. */ 1087 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1088 KASSERT(rehash == true, 1089 ("Rehashing required for unbound inps")); 1090 inp->inp_lport = lport; 1091 inp->inp_laddr.s_addr = laddr; 1092 if (in_pcbinshash(inp) != 0) { 1093 inp->inp_laddr.s_addr = INADDR_ANY; 1094 inp->inp_lport = 0; 1095 return (EAGAIN); 1096 } 1097 } 1098 1099 /* Commit the remaining changes. */ 1100 inp->inp_lport = lport; 1101 inp->inp_laddr.s_addr = laddr; 1102 inp->inp_faddr.s_addr = faddr; 1103 inp->inp_fport = fport; 1104 if (rehash) { 1105 in_pcbrehash(inp); 1106 } else { 1107 in_pcbinshash(inp); 1108 } 1109 1110 if (anonport) 1111 inp->inp_flags |= INP_ANONPORT; 1112 return (0); 1113 } 1114 1115 /* 1116 * Do proper source address selection on an unbound socket in case 1117 * of connect. Take jails into account as well. 1118 */ 1119 int 1120 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1121 struct ucred *cred) 1122 { 1123 struct ifaddr *ifa; 1124 struct sockaddr *sa; 1125 struct sockaddr_in *sin, dst; 1126 struct nhop_object *nh; 1127 int error; 1128 1129 NET_EPOCH_ASSERT(); 1130 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1131 1132 /* 1133 * Bypass source address selection and use the primary jail IP 1134 * if requested. 1135 */ 1136 if (!prison_saddrsel_ip4(cred, laddr)) 1137 return (0); 1138 1139 error = 0; 1140 1141 nh = NULL; 1142 bzero(&dst, sizeof(dst)); 1143 sin = &dst; 1144 sin->sin_family = AF_INET; 1145 sin->sin_len = sizeof(struct sockaddr_in); 1146 sin->sin_addr.s_addr = faddr->s_addr; 1147 1148 /* 1149 * If route is known our src addr is taken from the i/f, 1150 * else punt. 1151 * 1152 * Find out route to destination. 1153 */ 1154 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1155 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1156 0, NHR_NONE, 0); 1157 1158 /* 1159 * If we found a route, use the address corresponding to 1160 * the outgoing interface. 1161 * 1162 * Otherwise assume faddr is reachable on a directly connected 1163 * network and try to find a corresponding interface to take 1164 * the source address from. 1165 */ 1166 if (nh == NULL || nh->nh_ifp == NULL) { 1167 struct in_ifaddr *ia; 1168 struct ifnet *ifp; 1169 1170 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1171 inp->inp_socket->so_fibnum)); 1172 if (ia == NULL) { 1173 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1174 inp->inp_socket->so_fibnum)); 1175 } 1176 if (ia == NULL) { 1177 error = ENETUNREACH; 1178 goto done; 1179 } 1180 1181 if (!prison_flag(cred, PR_IP4)) { 1182 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1183 goto done; 1184 } 1185 1186 ifp = ia->ia_ifp; 1187 ia = NULL; 1188 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1189 sa = ifa->ifa_addr; 1190 if (sa->sa_family != AF_INET) 1191 continue; 1192 sin = (struct sockaddr_in *)sa; 1193 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1194 ia = (struct in_ifaddr *)ifa; 1195 break; 1196 } 1197 } 1198 if (ia != NULL) { 1199 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1200 goto done; 1201 } 1202 1203 /* 3. As a last resort return the 'default' jail address. */ 1204 error = prison_get_ip4(cred, laddr); 1205 goto done; 1206 } 1207 1208 /* 1209 * If the outgoing interface on the route found is not 1210 * a loopback interface, use the address from that interface. 1211 * In case of jails do those three steps: 1212 * 1. check if the interface address belongs to the jail. If so use it. 1213 * 2. check if we have any address on the outgoing interface 1214 * belonging to this jail. If so use it. 1215 * 3. as a last resort return the 'default' jail address. 1216 */ 1217 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1218 struct in_ifaddr *ia; 1219 struct ifnet *ifp; 1220 1221 /* If not jailed, use the default returned. */ 1222 if (!prison_flag(cred, PR_IP4)) { 1223 ia = (struct in_ifaddr *)nh->nh_ifa; 1224 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1225 goto done; 1226 } 1227 1228 /* Jailed. */ 1229 /* 1. Check if the iface address belongs to the jail. */ 1230 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1231 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1232 ia = (struct in_ifaddr *)nh->nh_ifa; 1233 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1234 goto done; 1235 } 1236 1237 /* 1238 * 2. Check if we have any address on the outgoing interface 1239 * belonging to this jail. 1240 */ 1241 ia = NULL; 1242 ifp = nh->nh_ifp; 1243 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1244 sa = ifa->ifa_addr; 1245 if (sa->sa_family != AF_INET) 1246 continue; 1247 sin = (struct sockaddr_in *)sa; 1248 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1249 ia = (struct in_ifaddr *)ifa; 1250 break; 1251 } 1252 } 1253 if (ia != NULL) { 1254 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1255 goto done; 1256 } 1257 1258 /* 3. As a last resort return the 'default' jail address. */ 1259 error = prison_get_ip4(cred, laddr); 1260 goto done; 1261 } 1262 1263 /* 1264 * The outgoing interface is marked with 'loopback net', so a route 1265 * to ourselves is here. 1266 * Try to find the interface of the destination address and then 1267 * take the address from there. That interface is not necessarily 1268 * a loopback interface. 1269 * In case of jails, check that it is an address of the jail 1270 * and if we cannot find, fall back to the 'default' jail address. 1271 */ 1272 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1273 struct in_ifaddr *ia; 1274 1275 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1276 inp->inp_socket->so_fibnum)); 1277 if (ia == NULL) 1278 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1279 inp->inp_socket->so_fibnum)); 1280 if (ia == NULL) 1281 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1282 1283 if (!prison_flag(cred, PR_IP4)) { 1284 if (ia == NULL) { 1285 error = ENETUNREACH; 1286 goto done; 1287 } 1288 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1289 goto done; 1290 } 1291 1292 /* Jailed. */ 1293 if (ia != NULL) { 1294 struct ifnet *ifp; 1295 1296 ifp = ia->ia_ifp; 1297 ia = NULL; 1298 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1299 sa = ifa->ifa_addr; 1300 if (sa->sa_family != AF_INET) 1301 continue; 1302 sin = (struct sockaddr_in *)sa; 1303 if (prison_check_ip4(cred, 1304 &sin->sin_addr) == 0) { 1305 ia = (struct in_ifaddr *)ifa; 1306 break; 1307 } 1308 } 1309 if (ia != NULL) { 1310 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1311 goto done; 1312 } 1313 } 1314 1315 /* 3. As a last resort return the 'default' jail address. */ 1316 error = prison_get_ip4(cred, laddr); 1317 goto done; 1318 } 1319 1320 done: 1321 return (error); 1322 } 1323 1324 /* 1325 * Set up for a connect from a socket to the specified address. 1326 * On entry, *laddrp and *lportp should contain the current local 1327 * address and port for the PCB; these are updated to the values 1328 * that should be placed in inp_laddr and inp_lport to complete 1329 * the connect. 1330 * 1331 * On success, *faddrp and *fportp will be set to the remote address 1332 * and port. These are not updated in the error case. 1333 * 1334 * If the operation fails because the connection already exists, 1335 * *oinpp will be set to the PCB of that connection so that the 1336 * caller can decide to override it. In all other cases, *oinpp 1337 * is set to NULL. 1338 */ 1339 int 1340 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, 1341 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, 1342 struct inpcb **oinpp, struct ucred *cred) 1343 { 1344 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1345 struct in_ifaddr *ia; 1346 struct inpcb *oinp; 1347 struct in_addr laddr, faddr; 1348 u_short lport, fport; 1349 int error; 1350 1351 KASSERT(sin->sin_family == AF_INET, 1352 ("%s: invalid address family for %p", __func__, sin)); 1353 KASSERT(sin->sin_len == sizeof(*sin), 1354 ("%s: invalid address length for %p", __func__, sin)); 1355 1356 /* 1357 * Because a global state change doesn't actually occur here, a read 1358 * lock is sufficient. 1359 */ 1360 NET_EPOCH_ASSERT(); 1361 INP_LOCK_ASSERT(inp); 1362 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1363 1364 if (oinpp != NULL) 1365 *oinpp = NULL; 1366 if (sin->sin_port == 0) 1367 return (EADDRNOTAVAIL); 1368 laddr.s_addr = *laddrp; 1369 lport = *lportp; 1370 faddr = sin->sin_addr; 1371 fport = sin->sin_port; 1372 #ifdef ROUTE_MPATH 1373 if (CALC_FLOWID_OUTBOUND) { 1374 uint32_t hash_val, hash_type; 1375 1376 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, 1377 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1378 1379 inp->inp_flowid = hash_val; 1380 inp->inp_flowtype = hash_type; 1381 } 1382 #endif 1383 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { 1384 /* 1385 * If the destination address is INADDR_ANY, 1386 * use the primary local address. 1387 * If the supplied address is INADDR_BROADCAST, 1388 * and the primary interface supports broadcast, 1389 * choose the broadcast address for that interface. 1390 */ 1391 if (faddr.s_addr == INADDR_ANY) { 1392 faddr = 1393 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1394 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1395 return (error); 1396 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { 1397 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & 1398 IFF_BROADCAST) 1399 faddr = satosin(&CK_STAILQ_FIRST( 1400 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1401 } 1402 } 1403 if (laddr.s_addr == INADDR_ANY) { 1404 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1405 /* 1406 * If the destination address is multicast and an outgoing 1407 * interface has been set as a multicast option, prefer the 1408 * address of that interface as our source address. 1409 */ 1410 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 1411 inp->inp_moptions != NULL) { 1412 struct ip_moptions *imo; 1413 struct ifnet *ifp; 1414 1415 imo = inp->inp_moptions; 1416 if (imo->imo_multicast_ifp != NULL) { 1417 ifp = imo->imo_multicast_ifp; 1418 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1419 if (ia->ia_ifp == ifp && 1420 prison_check_ip4(cred, 1421 &ia->ia_addr.sin_addr) == 0) 1422 break; 1423 } 1424 if (ia == NULL) 1425 error = EADDRNOTAVAIL; 1426 else { 1427 laddr = ia->ia_addr.sin_addr; 1428 error = 0; 1429 } 1430 } 1431 } 1432 if (error) 1433 return (error); 1434 } 1435 1436 if (lport != 0) { 1437 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1438 fport, laddr, lport, 0, NULL, M_NODOM); 1439 if (oinp != NULL) { 1440 if (oinpp != NULL) 1441 *oinpp = oinp; 1442 return (EADDRINUSE); 1443 } 1444 } else { 1445 struct sockaddr_in lsin, fsin; 1446 1447 bzero(&lsin, sizeof(lsin)); 1448 bzero(&fsin, sizeof(fsin)); 1449 lsin.sin_family = AF_INET; 1450 lsin.sin_addr = laddr; 1451 fsin.sin_family = AF_INET; 1452 fsin.sin_addr = faddr; 1453 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, 1454 &lport, (struct sockaddr *)& fsin, fport, cred, 1455 INPLOOKUP_WILDCARD); 1456 if (error) 1457 return (error); 1458 } 1459 *laddrp = laddr.s_addr; 1460 *lportp = lport; 1461 *faddrp = faddr.s_addr; 1462 *fportp = fport; 1463 return (0); 1464 } 1465 1466 void 1467 in_pcbdisconnect(struct inpcb *inp) 1468 { 1469 1470 INP_WLOCK_ASSERT(inp); 1471 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1472 1473 inp->inp_faddr.s_addr = INADDR_ANY; 1474 inp->inp_fport = 0; 1475 in_pcbrehash(inp); 1476 } 1477 #endif /* INET */ 1478 1479 /* 1480 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. 1481 * For most protocols, this will be invoked immediately prior to calling 1482 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the 1483 * socket, in which case in_pcbfree() is deferred. 1484 */ 1485 void 1486 in_pcbdetach(struct inpcb *inp) 1487 { 1488 1489 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1490 1491 #ifdef RATELIMIT 1492 if (inp->inp_snd_tag != NULL) 1493 in_pcbdetach_txrtlmt(inp); 1494 #endif 1495 inp->inp_socket->so_pcb = NULL; 1496 inp->inp_socket = NULL; 1497 } 1498 1499 /* 1500 * inpcb hash lookups are protected by SMR section. 1501 * 1502 * Once desired pcb has been found, switching from SMR section to a pcb 1503 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1504 * here because SMR is a critical section. 1505 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1506 */ 1507 static inline void 1508 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1509 { 1510 1511 lock == INPLOOKUP_RLOCKPCB ? 1512 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1513 } 1514 1515 static inline void 1516 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1517 { 1518 1519 lock == INPLOOKUP_RLOCKPCB ? 1520 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1521 } 1522 1523 static inline int 1524 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1525 { 1526 1527 return (lock == INPLOOKUP_RLOCKPCB ? 1528 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1529 } 1530 1531 static inline bool 1532 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1533 { 1534 1535 return (lock == INPLOOKUP_RLOCKPCB ? 1536 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); 1537 } 1538 1539 static inline bool 1540 _inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags) 1541 { 1542 1543 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1544 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1545 1546 if (__predict_true(inp_trylock(inp, lock))) { 1547 if (__predict_false(inp->inp_flags & ignflags)) { 1548 smr_exit(inp->inp_pcbinfo->ipi_smr); 1549 inp_unlock(inp, lock); 1550 return (false); 1551 } 1552 smr_exit(inp->inp_pcbinfo->ipi_smr); 1553 return (true); 1554 } 1555 1556 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1557 smr_exit(inp->inp_pcbinfo->ipi_smr); 1558 inp_lock(inp, lock); 1559 if (__predict_false(in_pcbrele(inp, lock))) 1560 return (false); 1561 /* 1562 * inp acquired through refcount & lock for sure didn't went 1563 * through uma_zfree(). However, it may have already went 1564 * through in_pcbfree() and has another reference, that 1565 * prevented its release by our in_pcbrele(). 1566 */ 1567 if (__predict_false(inp->inp_flags & ignflags)) { 1568 inp_unlock(inp, lock); 1569 return (false); 1570 } 1571 return (true); 1572 } else { 1573 smr_exit(inp->inp_pcbinfo->ipi_smr); 1574 return (false); 1575 } 1576 } 1577 1578 bool 1579 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1580 { 1581 1582 /* 1583 * in_pcblookup() family of functions ignore not only freed entries, 1584 * that may be found due to lockless access to the hash, but dropped 1585 * entries, too. 1586 */ 1587 return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED)); 1588 } 1589 1590 /* 1591 * inp_next() - inpcb hash/list traversal iterator 1592 * 1593 * Requires initialized struct inpcb_iterator for context. 1594 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1595 * 1596 * - Iterator can have either write-lock or read-lock semantics, that can not 1597 * be changed later. 1598 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through 1599 * a single hash slot. Note: only rip_input() does the latter. 1600 * - Iterator may have optional bool matching function. The matching function 1601 * will be executed for each inpcb in the SMR context, so it can not acquire 1602 * locks and can safely access only immutable fields of inpcb. 1603 * 1604 * A fresh initialized iterator has NULL inpcb in its context and that 1605 * means that inp_next() call would return the very first inpcb on the list 1606 * locked with desired semantic. In all following calls the context pointer 1607 * shall hold the current inpcb pointer. The KPI user is not supposed to 1608 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL 1609 * and write NULL to its context. After end of traversal an iterator can be 1610 * reused. 1611 * 1612 * List traversals have the following features/constraints: 1613 * - New entries won't be seen, as they are always added to the head of a list. 1614 * - Removed entries won't stop traversal as long as they are not added to 1615 * a different list. This is violated by in_pcbrehash(). 1616 */ 1617 #define II_LIST_FIRST(ipi, hash) \ 1618 (((hash) == INP_ALL_LIST) ? \ 1619 CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ 1620 CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) 1621 #define II_LIST_NEXT(inp, hash) \ 1622 (((hash) == INP_ALL_LIST) ? \ 1623 CK_LIST_NEXT((inp), inp_list) : \ 1624 CK_LIST_NEXT((inp), inp_hash)) 1625 #define II_LOCK_ASSERT(inp, lock) \ 1626 rw_assert(&(inp)->inp_lock, \ 1627 (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) 1628 struct inpcb * 1629 inp_next(struct inpcb_iterator *ii) 1630 { 1631 const struct inpcbinfo *ipi = ii->ipi; 1632 inp_match_t *match = ii->match; 1633 void *ctx = ii->ctx; 1634 inp_lookup_t lock = ii->lock; 1635 int hash = ii->hash; 1636 struct inpcb *inp; 1637 1638 if (ii->inp == NULL) { /* First call. */ 1639 smr_enter(ipi->ipi_smr); 1640 /* This is unrolled CK_LIST_FOREACH(). */ 1641 for (inp = II_LIST_FIRST(ipi, hash); 1642 inp != NULL; 1643 inp = II_LIST_NEXT(inp, hash)) { 1644 if (match != NULL && (match)(inp, ctx) == false) 1645 continue; 1646 if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED))) 1647 break; 1648 else { 1649 smr_enter(ipi->ipi_smr); 1650 MPASS(inp != II_LIST_FIRST(ipi, hash)); 1651 inp = II_LIST_FIRST(ipi, hash); 1652 if (inp == NULL) 1653 break; 1654 } 1655 } 1656 1657 if (inp == NULL) 1658 smr_exit(ipi->ipi_smr); 1659 else 1660 ii->inp = inp; 1661 1662 return (inp); 1663 } 1664 1665 /* Not a first call. */ 1666 smr_enter(ipi->ipi_smr); 1667 restart: 1668 inp = ii->inp; 1669 II_LOCK_ASSERT(inp, lock); 1670 next: 1671 inp = II_LIST_NEXT(inp, hash); 1672 if (inp == NULL) { 1673 smr_exit(ipi->ipi_smr); 1674 goto found; 1675 } 1676 1677 if (match != NULL && (match)(inp, ctx) == false) 1678 goto next; 1679 1680 if (__predict_true(inp_trylock(inp, lock))) { 1681 if (__predict_false(inp->inp_flags & INP_FREED)) { 1682 /* 1683 * Entries are never inserted in middle of a list, thus 1684 * as long as we are in SMR, we can continue traversal. 1685 * Jump to 'restart' should yield in the same result, 1686 * but could produce unnecessary looping. Could this 1687 * looping be unbound? 1688 */ 1689 inp_unlock(inp, lock); 1690 goto next; 1691 } else { 1692 smr_exit(ipi->ipi_smr); 1693 goto found; 1694 } 1695 } 1696 1697 /* 1698 * Can't obtain lock immediately, thus going hard. Once we exit the 1699 * SMR section we can no longer jump to 'next', and our only stable 1700 * anchoring point is ii->inp, which we keep locked for this case, so 1701 * we jump to 'restart'. 1702 */ 1703 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1704 smr_exit(ipi->ipi_smr); 1705 inp_lock(inp, lock); 1706 if (__predict_false(in_pcbrele(inp, lock))) { 1707 smr_enter(ipi->ipi_smr); 1708 goto restart; 1709 } 1710 /* 1711 * See comment in inp_smr_lock(). 1712 */ 1713 if (__predict_false(inp->inp_flags & INP_FREED)) { 1714 inp_unlock(inp, lock); 1715 smr_enter(ipi->ipi_smr); 1716 goto restart; 1717 } 1718 } else 1719 goto next; 1720 1721 found: 1722 inp_unlock(ii->inp, lock); 1723 ii->inp = inp; 1724 1725 return (ii->inp); 1726 } 1727 1728 /* 1729 * in_pcbref() bumps the reference count on an inpcb in order to maintain 1730 * stability of an inpcb pointer despite the inpcb lock being released or 1731 * SMR section exited. 1732 * 1733 * To free a reference later in_pcbrele_(r|w)locked() must be performed. 1734 */ 1735 void 1736 in_pcbref(struct inpcb *inp) 1737 { 1738 u_int old __diagused; 1739 1740 old = refcount_acquire(&inp->inp_refcount); 1741 KASSERT(old > 0, ("%s: refcount 0", __func__)); 1742 } 1743 1744 /* 1745 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially 1746 * freeing the pcb, if the reference was very last. 1747 */ 1748 bool 1749 in_pcbrele_rlocked(struct inpcb *inp) 1750 { 1751 1752 INP_RLOCK_ASSERT(inp); 1753 1754 if (refcount_release(&inp->inp_refcount) == 0) 1755 return (false); 1756 1757 MPASS(inp->inp_flags & INP_FREED); 1758 MPASS(inp->inp_socket == NULL); 1759 MPASS(inp->inp_in_hpts == 0); 1760 INP_RUNLOCK(inp); 1761 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1762 return (true); 1763 } 1764 1765 bool 1766 in_pcbrele_wlocked(struct inpcb *inp) 1767 { 1768 1769 INP_WLOCK_ASSERT(inp); 1770 1771 if (refcount_release(&inp->inp_refcount) == 0) 1772 return (false); 1773 1774 MPASS(inp->inp_flags & INP_FREED); 1775 MPASS(inp->inp_socket == NULL); 1776 MPASS(inp->inp_in_hpts == 0); 1777 INP_WUNLOCK(inp); 1778 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1779 return (true); 1780 } 1781 1782 /* 1783 * Unconditionally schedule an inpcb to be freed by decrementing its 1784 * reference count, which should occur only after the inpcb has been detached 1785 * from its socket. If another thread holds a temporary reference (acquired 1786 * using in_pcbref()) then the free is deferred until that reference is 1787 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. 1788 * Almost all work, including removal from global lists, is done in this 1789 * context, where the pcbinfo lock is held. 1790 */ 1791 void 1792 in_pcbfree(struct inpcb *inp) 1793 { 1794 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1795 #ifdef INET 1796 struct ip_moptions *imo; 1797 #endif 1798 #ifdef INET6 1799 struct ip6_moptions *im6o; 1800 #endif 1801 1802 INP_WLOCK_ASSERT(inp); 1803 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); 1804 KASSERT((inp->inp_flags & INP_FREED) == 0, 1805 ("%s: called twice for pcb %p", __func__, inp)); 1806 1807 inp->inp_flags |= INP_FREED; 1808 INP_INFO_WLOCK(pcbinfo); 1809 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 1810 pcbinfo->ipi_count--; 1811 CK_LIST_REMOVE(inp, inp_list); 1812 INP_INFO_WUNLOCK(pcbinfo); 1813 1814 if (inp->inp_flags & INP_INHASHLIST) 1815 in_pcbremhash(inp); 1816 1817 RO_INVALIDATE_CACHE(&inp->inp_route); 1818 #ifdef MAC 1819 mac_inpcb_destroy(inp); 1820 #endif 1821 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 1822 if (inp->inp_sp != NULL) 1823 ipsec_delete_pcbpolicy(inp); 1824 #endif 1825 #ifdef INET 1826 if (inp->inp_options) 1827 (void)m_free(inp->inp_options); 1828 imo = inp->inp_moptions; 1829 #endif 1830 #ifdef INET6 1831 if (inp->inp_vflag & INP_IPV6PROTO) { 1832 ip6_freepcbopts(inp->in6p_outputopts); 1833 im6o = inp->in6p_moptions; 1834 } else 1835 im6o = NULL; 1836 #endif 1837 1838 if (__predict_false(in_pcbrele_wlocked(inp) == false)) { 1839 INP_WUNLOCK(inp); 1840 } 1841 #ifdef INET6 1842 ip6_freemoptions(im6o); 1843 #endif 1844 #ifdef INET 1845 inp_freemoptions(imo); 1846 #endif 1847 /* Destruction is finalized in inpcb_dtor(). */ 1848 } 1849 1850 static void 1851 inpcb_dtor(void *mem, int size, void *arg) 1852 { 1853 struct inpcb *inp = mem; 1854 1855 crfree(inp->inp_cred); 1856 #ifdef INVARIANTS 1857 inp->inp_cred = NULL; 1858 #endif 1859 } 1860 1861 /* 1862 * Different protocols initialize their inpcbs differently - giving 1863 * different name to the lock. But they all are disposed the same. 1864 */ 1865 static void 1866 inpcb_fini(void *mem, int size) 1867 { 1868 struct inpcb *inp = mem; 1869 1870 INP_LOCK_DESTROY(inp); 1871 } 1872 1873 /* 1874 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 1875 * port reservation, and preventing it from being returned by inpcb lookups. 1876 * 1877 * It is used by TCP to mark an inpcb as unused and avoid future packet 1878 * delivery or event notification when a socket remains open but TCP has 1879 * closed. This might occur as a result of a shutdown()-initiated TCP close 1880 * or a RST on the wire, and allows the port binding to be reused while still 1881 * maintaining the invariant that so_pcb always points to a valid inpcb until 1882 * in_pcbdetach(). 1883 * 1884 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1885 * in_pcbnotifyall() and in_pcbpurgeif0()? 1886 */ 1887 void 1888 in_pcbdrop(struct inpcb *inp) 1889 { 1890 1891 INP_WLOCK_ASSERT(inp); 1892 #ifdef INVARIANTS 1893 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) 1894 MPASS(inp->inp_refcount > 1); 1895 #endif 1896 1897 inp->inp_flags |= INP_DROPPED; 1898 if (inp->inp_flags & INP_INHASHLIST) 1899 in_pcbremhash(inp); 1900 } 1901 1902 #ifdef INET 1903 /* 1904 * Common routines to return the socket addresses associated with inpcbs. 1905 */ 1906 struct sockaddr * 1907 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1908 { 1909 struct sockaddr_in *sin; 1910 1911 sin = malloc(sizeof *sin, M_SONAME, 1912 M_WAITOK | M_ZERO); 1913 sin->sin_family = AF_INET; 1914 sin->sin_len = sizeof(*sin); 1915 sin->sin_addr = *addr_p; 1916 sin->sin_port = port; 1917 1918 return (struct sockaddr *)sin; 1919 } 1920 1921 int 1922 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1923 { 1924 struct inpcb *inp; 1925 struct in_addr addr; 1926 in_port_t port; 1927 1928 inp = sotoinpcb(so); 1929 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1930 1931 INP_RLOCK(inp); 1932 port = inp->inp_lport; 1933 addr = inp->inp_laddr; 1934 INP_RUNLOCK(inp); 1935 1936 *nam = in_sockaddr(port, &addr); 1937 return 0; 1938 } 1939 1940 int 1941 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 1942 { 1943 struct inpcb *inp; 1944 struct in_addr addr; 1945 in_port_t port; 1946 1947 inp = sotoinpcb(so); 1948 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1949 1950 INP_RLOCK(inp); 1951 port = inp->inp_fport; 1952 addr = inp->inp_faddr; 1953 INP_RUNLOCK(inp); 1954 1955 *nam = in_sockaddr(port, &addr); 1956 return 0; 1957 } 1958 1959 void 1960 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1961 struct inpcb *(*notify)(struct inpcb *, int)) 1962 { 1963 struct inpcb *inp, *inp_temp; 1964 1965 INP_INFO_WLOCK(pcbinfo); 1966 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 1967 INP_WLOCK(inp); 1968 #ifdef INET6 1969 if ((inp->inp_vflag & INP_IPV4) == 0) { 1970 INP_WUNLOCK(inp); 1971 continue; 1972 } 1973 #endif 1974 if (inp->inp_faddr.s_addr != faddr.s_addr || 1975 inp->inp_socket == NULL) { 1976 INP_WUNLOCK(inp); 1977 continue; 1978 } 1979 if ((*notify)(inp, errno)) 1980 INP_WUNLOCK(inp); 1981 } 1982 INP_INFO_WUNLOCK(pcbinfo); 1983 } 1984 1985 static bool 1986 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1987 { 1988 1989 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1990 return (true); 1991 else 1992 return (false); 1993 } 1994 1995 void 1996 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1997 { 1998 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1999 inp_v4_multi_match, NULL); 2000 struct inpcb *inp; 2001 struct in_multi *inm; 2002 struct in_mfilter *imf; 2003 struct ip_moptions *imo; 2004 2005 IN_MULTI_LOCK_ASSERT(); 2006 2007 while ((inp = inp_next(&inpi)) != NULL) { 2008 INP_WLOCK_ASSERT(inp); 2009 2010 imo = inp->inp_moptions; 2011 /* 2012 * Unselect the outgoing interface if it is being 2013 * detached. 2014 */ 2015 if (imo->imo_multicast_ifp == ifp) 2016 imo->imo_multicast_ifp = NULL; 2017 2018 /* 2019 * Drop multicast group membership if we joined 2020 * through the interface being detached. 2021 * 2022 * XXX This can all be deferred to an epoch_call 2023 */ 2024 restart: 2025 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2026 if ((inm = imf->imf_inm) == NULL) 2027 continue; 2028 if (inm->inm_ifp != ifp) 2029 continue; 2030 ip_mfilter_remove(&imo->imo_head, imf); 2031 in_leavegroup_locked(inm, NULL); 2032 ip_mfilter_free(imf); 2033 goto restart; 2034 } 2035 } 2036 } 2037 2038 /* 2039 * Lookup a PCB based on the local address and port. Caller must hold the 2040 * hash lock. No inpcb locks or references are acquired. 2041 */ 2042 #define INP_LOOKUP_MAPPED_PCB_COST 3 2043 struct inpcb * 2044 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2045 u_short lport, int lookupflags, struct ucred *cred) 2046 { 2047 struct inpcb *inp; 2048 #ifdef INET6 2049 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2050 #else 2051 int matchwild = 3; 2052 #endif 2053 int wildcard; 2054 2055 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2056 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2057 INP_HASH_LOCK_ASSERT(pcbinfo); 2058 2059 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2060 struct inpcbhead *head; 2061 /* 2062 * Look for an unconnected (wildcard foreign addr) PCB that 2063 * matches the local address and port we're looking for. 2064 */ 2065 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2066 pcbinfo->ipi_hashmask)]; 2067 CK_LIST_FOREACH(inp, head, inp_hash) { 2068 #ifdef INET6 2069 /* XXX inp locking */ 2070 if ((inp->inp_vflag & INP_IPV4) == 0) 2071 continue; 2072 #endif 2073 if (inp->inp_faddr.s_addr == INADDR_ANY && 2074 inp->inp_laddr.s_addr == laddr.s_addr && 2075 inp->inp_lport == lport) { 2076 /* 2077 * Found? 2078 */ 2079 if (prison_equal_ip4(cred->cr_prison, 2080 inp->inp_cred->cr_prison)) 2081 return (inp); 2082 } 2083 } 2084 /* 2085 * Not found. 2086 */ 2087 return (NULL); 2088 } else { 2089 struct inpcbporthead *porthash; 2090 struct inpcbport *phd; 2091 struct inpcb *match = NULL; 2092 /* 2093 * Best fit PCB lookup. 2094 * 2095 * First see if this local port is in use by looking on the 2096 * port hash list. 2097 */ 2098 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2099 pcbinfo->ipi_porthashmask)]; 2100 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2101 if (phd->phd_port == lport) 2102 break; 2103 } 2104 if (phd != NULL) { 2105 /* 2106 * Port is in use by one or more PCBs. Look for best 2107 * fit. 2108 */ 2109 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2110 wildcard = 0; 2111 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2112 cred->cr_prison)) 2113 continue; 2114 #ifdef INET6 2115 /* XXX inp locking */ 2116 if ((inp->inp_vflag & INP_IPV4) == 0) 2117 continue; 2118 /* 2119 * We never select the PCB that has 2120 * INP_IPV6 flag and is bound to :: if 2121 * we have another PCB which is bound 2122 * to 0.0.0.0. If a PCB has the 2123 * INP_IPV6 flag, then we set its cost 2124 * higher than IPv4 only PCBs. 2125 * 2126 * Note that the case only happens 2127 * when a socket is bound to ::, under 2128 * the condition that the use of the 2129 * mapped address is allowed. 2130 */ 2131 if ((inp->inp_vflag & INP_IPV6) != 0) 2132 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2133 #endif 2134 if (inp->inp_faddr.s_addr != INADDR_ANY) 2135 wildcard++; 2136 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2137 if (laddr.s_addr == INADDR_ANY) 2138 wildcard++; 2139 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2140 continue; 2141 } else { 2142 if (laddr.s_addr != INADDR_ANY) 2143 wildcard++; 2144 } 2145 if (wildcard < matchwild) { 2146 match = inp; 2147 matchwild = wildcard; 2148 if (matchwild == 0) 2149 break; 2150 } 2151 } 2152 } 2153 return (match); 2154 } 2155 } 2156 #undef INP_LOOKUP_MAPPED_PCB_COST 2157 2158 static bool 2159 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) 2160 { 2161 return (domain == M_NODOM || domain == grp->il_numa_domain); 2162 } 2163 2164 static struct inpcb * 2165 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2166 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 2167 uint16_t fport, int lookupflags, int domain) 2168 { 2169 const struct inpcblbgrouphead *hdr; 2170 struct inpcblbgroup *grp; 2171 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2172 2173 INP_HASH_LOCK_ASSERT(pcbinfo); 2174 2175 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2176 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 2177 2178 /* 2179 * Search for an LB group match based on the following criteria: 2180 * - prefer jailed groups to non-jailed groups 2181 * - prefer exact source address matches to wildcard matches 2182 * - prefer groups bound to the specified NUMA domain 2183 */ 2184 jail_exact = jail_wild = local_exact = local_wild = NULL; 2185 CK_LIST_FOREACH(grp, hdr, il_list) { 2186 bool injail; 2187 2188 #ifdef INET6 2189 if (!(grp->il_vflag & INP_IPV4)) 2190 continue; 2191 #endif 2192 if (grp->il_lport != lport) 2193 continue; 2194 2195 injail = prison_flag(grp->il_cred, PR_IP4) != 0; 2196 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2197 laddr) != 0) 2198 continue; 2199 2200 if (grp->il_laddr.s_addr == laddr->s_addr) { 2201 if (injail) { 2202 jail_exact = grp; 2203 if (in_pcblookup_lb_numa_match(grp, domain)) 2204 /* This is a perfect match. */ 2205 goto out; 2206 } else if (local_exact == NULL || 2207 in_pcblookup_lb_numa_match(grp, domain)) { 2208 local_exact = grp; 2209 } 2210 } else if (grp->il_laddr.s_addr == INADDR_ANY && 2211 (lookupflags & INPLOOKUP_WILDCARD) != 0) { 2212 if (injail) { 2213 if (jail_wild == NULL || 2214 in_pcblookup_lb_numa_match(grp, domain)) 2215 jail_wild = grp; 2216 } else if (local_wild == NULL || 2217 in_pcblookup_lb_numa_match(grp, domain)) { 2218 local_wild = grp; 2219 } 2220 } 2221 } 2222 2223 if (jail_exact != NULL) 2224 grp = jail_exact; 2225 else if (jail_wild != NULL) 2226 grp = jail_wild; 2227 else if (local_exact != NULL) 2228 grp = local_exact; 2229 else 2230 grp = local_wild; 2231 if (grp == NULL) 2232 return (NULL); 2233 out: 2234 return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % 2235 grp->il_inpcnt]); 2236 } 2237 2238 /* 2239 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2240 * that the caller has either locked the hash list, which usually happens 2241 * for bind(2) operations, or is in SMR section, which happens when sorting 2242 * out incoming packets. 2243 */ 2244 static struct inpcb * 2245 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2246 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2247 struct ifnet *ifp, uint8_t numa_domain) 2248 { 2249 struct inpcbhead *head; 2250 struct inpcb *inp, *tmpinp; 2251 u_short fport = fport_arg, lport = lport_arg; 2252 2253 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2254 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2255 INP_HASH_LOCK_ASSERT(pcbinfo); 2256 2257 /* 2258 * First look for an exact match. 2259 */ 2260 tmpinp = NULL; 2261 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport, 2262 pcbinfo->ipi_hashmask)]; 2263 CK_LIST_FOREACH(inp, head, inp_hash) { 2264 #ifdef INET6 2265 /* XXX inp locking */ 2266 if ((inp->inp_vflag & INP_IPV4) == 0) 2267 continue; 2268 #endif 2269 if (inp->inp_faddr.s_addr == faddr.s_addr && 2270 inp->inp_laddr.s_addr == laddr.s_addr && 2271 inp->inp_fport == fport && 2272 inp->inp_lport == lport) { 2273 /* 2274 * XXX We should be able to directly return 2275 * the inp here, without any checks. 2276 * Well unless both bound with SO_REUSEPORT? 2277 */ 2278 if (prison_flag(inp->inp_cred, PR_IP4)) 2279 return (inp); 2280 if (tmpinp == NULL) 2281 tmpinp = inp; 2282 } 2283 } 2284 if (tmpinp != NULL) 2285 return (tmpinp); 2286 2287 /* 2288 * Then look for a wildcard match, if requested. 2289 */ 2290 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2291 struct inpcb *local_wild = NULL, *local_exact = NULL; 2292 #ifdef INET6 2293 struct inpcb *local_wild_mapped = NULL; 2294 #endif 2295 struct inpcb *jail_wild = NULL; 2296 int injail; 2297 2298 /* 2299 * First see if an LB group matches the request before scanning 2300 * all sockets on this port. 2301 */ 2302 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, 2303 fport, lookupflags, numa_domain); 2304 if (inp != NULL) 2305 return (inp); 2306 2307 /* 2308 * Order of socket selection - we always prefer jails. 2309 * 1. jailed, non-wild. 2310 * 2. jailed, wild. 2311 * 3. non-jailed, non-wild. 2312 * 4. non-jailed, wild. 2313 */ 2314 2315 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2316 pcbinfo->ipi_hashmask)]; 2317 CK_LIST_FOREACH(inp, head, inp_hash) { 2318 #ifdef INET6 2319 /* XXX inp locking */ 2320 if ((inp->inp_vflag & INP_IPV4) == 0) 2321 continue; 2322 #endif 2323 if (inp->inp_faddr.s_addr != INADDR_ANY || 2324 inp->inp_lport != lport) 2325 continue; 2326 2327 injail = prison_flag(inp->inp_cred, PR_IP4); 2328 if (injail) { 2329 if (prison_check_ip4_locked( 2330 inp->inp_cred->cr_prison, &laddr) != 0) 2331 continue; 2332 } else { 2333 if (local_exact != NULL) 2334 continue; 2335 } 2336 2337 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2338 if (injail) 2339 return (inp); 2340 else 2341 local_exact = inp; 2342 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2343 #ifdef INET6 2344 /* XXX inp locking, NULL check */ 2345 if (inp->inp_vflag & INP_IPV6PROTO) 2346 local_wild_mapped = inp; 2347 else 2348 #endif 2349 if (injail) 2350 jail_wild = inp; 2351 else 2352 local_wild = inp; 2353 } 2354 } /* LIST_FOREACH */ 2355 if (jail_wild != NULL) 2356 return (jail_wild); 2357 if (local_exact != NULL) 2358 return (local_exact); 2359 if (local_wild != NULL) 2360 return (local_wild); 2361 #ifdef INET6 2362 if (local_wild_mapped != NULL) 2363 return (local_wild_mapped); 2364 #endif 2365 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ 2366 2367 return (NULL); 2368 } 2369 2370 /* 2371 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the 2372 * hash list lock, and will return the inpcb locked (i.e., requires 2373 * INPLOOKUP_LOCKPCB). 2374 */ 2375 static struct inpcb * 2376 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2377 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2378 struct ifnet *ifp, uint8_t numa_domain) 2379 { 2380 struct inpcb *inp; 2381 2382 smr_enter(pcbinfo->ipi_smr); 2383 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2384 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2385 if (inp != NULL) { 2386 if (__predict_false(inp_smr_lock(inp, 2387 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2388 inp = NULL; 2389 } else 2390 smr_exit(pcbinfo->ipi_smr); 2391 2392 return (inp); 2393 } 2394 2395 /* 2396 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2397 * from which a pre-calculated hash value may be extracted. 2398 */ 2399 struct inpcb * 2400 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2401 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2402 { 2403 2404 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2405 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2406 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2407 ("%s: LOCKPCB not set", __func__)); 2408 2409 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2410 lookupflags, ifp, M_NODOM)); 2411 } 2412 2413 struct inpcb * 2414 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2415 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2416 struct ifnet *ifp, struct mbuf *m) 2417 { 2418 2419 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2420 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2421 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2422 ("%s: LOCKPCB not set", __func__)); 2423 2424 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2425 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2426 } 2427 #endif /* INET */ 2428 2429 /* 2430 * Insert PCB onto various hash lists. 2431 */ 2432 int 2433 in_pcbinshash(struct inpcb *inp) 2434 { 2435 struct inpcbhead *pcbhash; 2436 struct inpcbporthead *pcbporthash; 2437 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2438 struct inpcbport *phd; 2439 2440 INP_WLOCK_ASSERT(inp); 2441 INP_HASH_WLOCK_ASSERT(pcbinfo); 2442 2443 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2444 ("in_pcbinshash: INP_INHASHLIST")); 2445 2446 #ifdef INET6 2447 if (inp->inp_vflag & INP_IPV6) 2448 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2449 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2450 else 2451 #endif 2452 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2453 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2454 2455 pcbporthash = &pcbinfo->ipi_porthashbase[ 2456 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2457 2458 /* 2459 * Add entry to load balance group. 2460 * Only do this if SO_REUSEPORT_LB is set. 2461 */ 2462 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) { 2463 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2464 if (error != 0) 2465 return (error); 2466 } 2467 2468 /* 2469 * Go through port list and look for a head for this lport. 2470 */ 2471 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2472 if (phd->phd_port == inp->inp_lport) 2473 break; 2474 } 2475 2476 /* 2477 * If none exists, malloc one and tack it on. 2478 */ 2479 if (phd == NULL) { 2480 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2481 if (phd == NULL) { 2482 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2483 in_pcbremlbgrouphash(inp); 2484 return (ENOMEM); 2485 } 2486 phd->phd_port = inp->inp_lport; 2487 CK_LIST_INIT(&phd->phd_pcblist); 2488 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2489 } 2490 inp->inp_phd = phd; 2491 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2492 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2493 inp->inp_flags |= INP_INHASHLIST; 2494 2495 return (0); 2496 } 2497 2498 static void 2499 in_pcbremhash(struct inpcb *inp) 2500 { 2501 struct inpcbport *phd = inp->inp_phd; 2502 2503 INP_WLOCK_ASSERT(inp); 2504 MPASS(inp->inp_flags & INP_INHASHLIST); 2505 2506 INP_HASH_WLOCK(inp->inp_pcbinfo); 2507 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2508 in_pcbremlbgrouphash(inp); 2509 CK_LIST_REMOVE(inp, inp_hash); 2510 CK_LIST_REMOVE(inp, inp_portlist); 2511 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2512 CK_LIST_REMOVE(phd, phd_hash); 2513 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2514 } 2515 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2516 inp->inp_flags &= ~INP_INHASHLIST; 2517 } 2518 2519 /* 2520 * Move PCB to the proper hash bucket when { faddr, fport } have been 2521 * changed. NOTE: This does not handle the case of the lport changing (the 2522 * hashed port list would have to be updated as well), so the lport must 2523 * not change after in_pcbinshash() has been called. 2524 * 2525 * XXXGL: a race between this function and SMR-protected hash iterator 2526 * will lead to iterator traversing a possibly wrong hash list. However, 2527 * this race should have been here since change from rwlock to epoch. 2528 */ 2529 void 2530 in_pcbrehash(struct inpcb *inp) 2531 { 2532 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2533 struct inpcbhead *head; 2534 2535 INP_WLOCK_ASSERT(inp); 2536 INP_HASH_WLOCK_ASSERT(pcbinfo); 2537 2538 KASSERT(inp->inp_flags & INP_INHASHLIST, 2539 ("in_pcbrehash: !INP_INHASHLIST")); 2540 2541 #ifdef INET6 2542 if (inp->inp_vflag & INP_IPV6) 2543 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2544 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2545 else 2546 #endif 2547 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2548 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2549 2550 CK_LIST_REMOVE(inp, inp_hash); 2551 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2552 } 2553 2554 /* 2555 * Check for alternatives when higher level complains 2556 * about service problems. For now, invalidate cached 2557 * routing information. If the route was created dynamically 2558 * (by a redirect), time to try a default gateway again. 2559 */ 2560 void 2561 in_losing(struct inpcb *inp) 2562 { 2563 2564 RO_INVALIDATE_CACHE(&inp->inp_route); 2565 return; 2566 } 2567 2568 /* 2569 * A set label operation has occurred at the socket layer, propagate the 2570 * label change into the in_pcb for the socket. 2571 */ 2572 void 2573 in_pcbsosetlabel(struct socket *so) 2574 { 2575 #ifdef MAC 2576 struct inpcb *inp; 2577 2578 inp = sotoinpcb(so); 2579 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2580 2581 INP_WLOCK(inp); 2582 SOCK_LOCK(so); 2583 mac_inpcb_sosetlabel(so, inp); 2584 SOCK_UNLOCK(so); 2585 INP_WUNLOCK(inp); 2586 #endif 2587 } 2588 2589 void 2590 inp_wlock(struct inpcb *inp) 2591 { 2592 2593 INP_WLOCK(inp); 2594 } 2595 2596 void 2597 inp_wunlock(struct inpcb *inp) 2598 { 2599 2600 INP_WUNLOCK(inp); 2601 } 2602 2603 void 2604 inp_rlock(struct inpcb *inp) 2605 { 2606 2607 INP_RLOCK(inp); 2608 } 2609 2610 void 2611 inp_runlock(struct inpcb *inp) 2612 { 2613 2614 INP_RUNLOCK(inp); 2615 } 2616 2617 #ifdef INVARIANT_SUPPORT 2618 void 2619 inp_lock_assert(struct inpcb *inp) 2620 { 2621 2622 INP_WLOCK_ASSERT(inp); 2623 } 2624 2625 void 2626 inp_unlock_assert(struct inpcb *inp) 2627 { 2628 2629 INP_UNLOCK_ASSERT(inp); 2630 } 2631 #endif 2632 2633 void 2634 inp_apply_all(struct inpcbinfo *pcbinfo, 2635 void (*func)(struct inpcb *, void *), void *arg) 2636 { 2637 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2638 INPLOOKUP_WLOCKPCB); 2639 struct inpcb *inp; 2640 2641 while ((inp = inp_next(&inpi)) != NULL) 2642 func(inp, arg); 2643 } 2644 2645 struct socket * 2646 inp_inpcbtosocket(struct inpcb *inp) 2647 { 2648 2649 INP_WLOCK_ASSERT(inp); 2650 return (inp->inp_socket); 2651 } 2652 2653 struct tcpcb * 2654 inp_inpcbtotcpcb(struct inpcb *inp) 2655 { 2656 2657 INP_WLOCK_ASSERT(inp); 2658 return ((struct tcpcb *)inp->inp_ppcb); 2659 } 2660 2661 int 2662 inp_ip_tos_get(const struct inpcb *inp) 2663 { 2664 2665 return (inp->inp_ip_tos); 2666 } 2667 2668 void 2669 inp_ip_tos_set(struct inpcb *inp, int val) 2670 { 2671 2672 inp->inp_ip_tos = val; 2673 } 2674 2675 void 2676 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2677 uint32_t *faddr, uint16_t *fp) 2678 { 2679 2680 INP_LOCK_ASSERT(inp); 2681 *laddr = inp->inp_laddr.s_addr; 2682 *faddr = inp->inp_faddr.s_addr; 2683 *lp = inp->inp_lport; 2684 *fp = inp->inp_fport; 2685 } 2686 2687 struct inpcb * 2688 so_sotoinpcb(struct socket *so) 2689 { 2690 2691 return (sotoinpcb(so)); 2692 } 2693 2694 /* 2695 * Create an external-format (``xinpcb'') structure using the information in 2696 * the kernel-format in_pcb structure pointed to by inp. This is done to 2697 * reduce the spew of irrelevant information over this interface, to isolate 2698 * user code from changes in the kernel structure, and potentially to provide 2699 * information-hiding if we decide that some of this information should be 2700 * hidden from users. 2701 */ 2702 void 2703 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2704 { 2705 2706 bzero(xi, sizeof(*xi)); 2707 xi->xi_len = sizeof(struct xinpcb); 2708 if (inp->inp_socket) 2709 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2710 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2711 xi->inp_gencnt = inp->inp_gencnt; 2712 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2713 xi->inp_flow = inp->inp_flow; 2714 xi->inp_flowid = inp->inp_flowid; 2715 xi->inp_flowtype = inp->inp_flowtype; 2716 xi->inp_flags = inp->inp_flags; 2717 xi->inp_flags2 = inp->inp_flags2; 2718 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2719 xi->in6p_cksum = inp->in6p_cksum; 2720 xi->in6p_hops = inp->in6p_hops; 2721 xi->inp_ip_tos = inp->inp_ip_tos; 2722 xi->inp_vflag = inp->inp_vflag; 2723 xi->inp_ip_ttl = inp->inp_ip_ttl; 2724 xi->inp_ip_p = inp->inp_ip_p; 2725 xi->inp_ip_minttl = inp->inp_ip_minttl; 2726 } 2727 2728 int 2729 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2730 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2731 { 2732 struct sockopt sopt; 2733 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2734 INPLOOKUP_WLOCKPCB); 2735 struct inpcb *inp; 2736 struct sockopt_parameters *params; 2737 struct socket *so; 2738 int error; 2739 char buf[1024]; 2740 2741 if (req->oldptr != NULL || req->oldlen != 0) 2742 return (EINVAL); 2743 if (req->newptr == NULL) 2744 return (EPERM); 2745 if (req->newlen > sizeof(buf)) 2746 return (ENOMEM); 2747 error = SYSCTL_IN(req, buf, req->newlen); 2748 if (error != 0) 2749 return (error); 2750 if (req->newlen < sizeof(struct sockopt_parameters)) 2751 return (EINVAL); 2752 params = (struct sockopt_parameters *)buf; 2753 sopt.sopt_level = params->sop_level; 2754 sopt.sopt_name = params->sop_optname; 2755 sopt.sopt_dir = SOPT_SET; 2756 sopt.sopt_val = params->sop_optval; 2757 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2758 sopt.sopt_td = NULL; 2759 #ifdef INET6 2760 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2761 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2762 params->sop_inc.inc6_laddr.s6_addr16[1] = 2763 htons(params->sop_inc.inc6_zoneid & 0xffff); 2764 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2765 params->sop_inc.inc6_faddr.s6_addr16[1] = 2766 htons(params->sop_inc.inc6_zoneid & 0xffff); 2767 } 2768 #endif 2769 if (params->sop_inc.inc_lport != htons(0)) { 2770 if (params->sop_inc.inc_fport == htons(0)) 2771 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, 2772 pcbinfo->ipi_hashmask); 2773 else 2774 #ifdef INET6 2775 if (params->sop_inc.inc_flags & INC_ISIPV6) 2776 inpi.hash = INP6_PCBHASH( 2777 ¶ms->sop_inc.inc6_faddr, 2778 params->sop_inc.inc_lport, 2779 params->sop_inc.inc_fport, 2780 pcbinfo->ipi_hashmask); 2781 else 2782 #endif 2783 inpi.hash = INP_PCBHASH( 2784 ¶ms->sop_inc.inc_faddr, 2785 params->sop_inc.inc_lport, 2786 params->sop_inc.inc_fport, 2787 pcbinfo->ipi_hashmask); 2788 } 2789 while ((inp = inp_next(&inpi)) != NULL) 2790 if (inp->inp_gencnt == params->sop_id) { 2791 if (inp->inp_flags & INP_DROPPED) { 2792 INP_WUNLOCK(inp); 2793 return (ECONNRESET); 2794 } 2795 so = inp->inp_socket; 2796 KASSERT(so != NULL, ("inp_socket == NULL")); 2797 soref(so); 2798 error = (*ctloutput_set)(inp, &sopt); 2799 sorele(so); 2800 break; 2801 } 2802 if (inp == NULL) 2803 error = ESRCH; 2804 return (error); 2805 } 2806 2807 #ifdef DDB 2808 static void 2809 db_print_indent(int indent) 2810 { 2811 int i; 2812 2813 for (i = 0; i < indent; i++) 2814 db_printf(" "); 2815 } 2816 2817 static void 2818 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2819 { 2820 char faddr_str[48], laddr_str[48]; 2821 2822 db_print_indent(indent); 2823 db_printf("%s at %p\n", name, inc); 2824 2825 indent += 2; 2826 2827 #ifdef INET6 2828 if (inc->inc_flags & INC_ISIPV6) { 2829 /* IPv6. */ 2830 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2831 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2832 } else 2833 #endif 2834 { 2835 /* IPv4. */ 2836 inet_ntoa_r(inc->inc_laddr, laddr_str); 2837 inet_ntoa_r(inc->inc_faddr, faddr_str); 2838 } 2839 db_print_indent(indent); 2840 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2841 ntohs(inc->inc_lport)); 2842 db_print_indent(indent); 2843 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2844 ntohs(inc->inc_fport)); 2845 } 2846 2847 static void 2848 db_print_inpflags(int inp_flags) 2849 { 2850 int comma; 2851 2852 comma = 0; 2853 if (inp_flags & INP_RECVOPTS) { 2854 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2855 comma = 1; 2856 } 2857 if (inp_flags & INP_RECVRETOPTS) { 2858 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2859 comma = 1; 2860 } 2861 if (inp_flags & INP_RECVDSTADDR) { 2862 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2863 comma = 1; 2864 } 2865 if (inp_flags & INP_ORIGDSTADDR) { 2866 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2867 comma = 1; 2868 } 2869 if (inp_flags & INP_HDRINCL) { 2870 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2871 comma = 1; 2872 } 2873 if (inp_flags & INP_HIGHPORT) { 2874 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2875 comma = 1; 2876 } 2877 if (inp_flags & INP_LOWPORT) { 2878 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 2879 comma = 1; 2880 } 2881 if (inp_flags & INP_ANONPORT) { 2882 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2883 comma = 1; 2884 } 2885 if (inp_flags & INP_RECVIF) { 2886 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2887 comma = 1; 2888 } 2889 if (inp_flags & INP_MTUDISC) { 2890 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2891 comma = 1; 2892 } 2893 if (inp_flags & INP_RECVTTL) { 2894 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2895 comma = 1; 2896 } 2897 if (inp_flags & INP_DONTFRAG) { 2898 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2899 comma = 1; 2900 } 2901 if (inp_flags & INP_RECVTOS) { 2902 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2903 comma = 1; 2904 } 2905 if (inp_flags & IN6P_IPV6_V6ONLY) { 2906 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2907 comma = 1; 2908 } 2909 if (inp_flags & IN6P_PKTINFO) { 2910 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2911 comma = 1; 2912 } 2913 if (inp_flags & IN6P_HOPLIMIT) { 2914 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2915 comma = 1; 2916 } 2917 if (inp_flags & IN6P_HOPOPTS) { 2918 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2919 comma = 1; 2920 } 2921 if (inp_flags & IN6P_DSTOPTS) { 2922 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2923 comma = 1; 2924 } 2925 if (inp_flags & IN6P_RTHDR) { 2926 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2927 comma = 1; 2928 } 2929 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2930 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2931 comma = 1; 2932 } 2933 if (inp_flags & IN6P_TCLASS) { 2934 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2935 comma = 1; 2936 } 2937 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2938 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2939 comma = 1; 2940 } 2941 if (inp_flags & INP_ONESBCAST) { 2942 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2943 comma = 1; 2944 } 2945 if (inp_flags & INP_DROPPED) { 2946 db_printf("%sINP_DROPPED", comma ? ", " : ""); 2947 comma = 1; 2948 } 2949 if (inp_flags & INP_SOCKREF) { 2950 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2951 comma = 1; 2952 } 2953 if (inp_flags & IN6P_RFC2292) { 2954 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 2955 comma = 1; 2956 } 2957 if (inp_flags & IN6P_MTU) { 2958 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2959 comma = 1; 2960 } 2961 } 2962 2963 static void 2964 db_print_inpvflag(u_char inp_vflag) 2965 { 2966 int comma; 2967 2968 comma = 0; 2969 if (inp_vflag & INP_IPV4) { 2970 db_printf("%sINP_IPV4", comma ? ", " : ""); 2971 comma = 1; 2972 } 2973 if (inp_vflag & INP_IPV6) { 2974 db_printf("%sINP_IPV6", comma ? ", " : ""); 2975 comma = 1; 2976 } 2977 if (inp_vflag & INP_IPV6PROTO) { 2978 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 2979 comma = 1; 2980 } 2981 } 2982 2983 static void 2984 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2985 { 2986 2987 db_print_indent(indent); 2988 db_printf("%s at %p\n", name, inp); 2989 2990 indent += 2; 2991 2992 db_print_indent(indent); 2993 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2994 2995 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2996 2997 db_print_indent(indent); 2998 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2999 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 3000 3001 db_print_indent(indent); 3002 db_printf("inp_label: %p inp_flags: 0x%x (", 3003 inp->inp_label, inp->inp_flags); 3004 db_print_inpflags(inp->inp_flags); 3005 db_printf(")\n"); 3006 3007 db_print_indent(indent); 3008 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3009 inp->inp_vflag); 3010 db_print_inpvflag(inp->inp_vflag); 3011 db_printf(")\n"); 3012 3013 db_print_indent(indent); 3014 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3015 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3016 3017 db_print_indent(indent); 3018 #ifdef INET6 3019 if (inp->inp_vflag & INP_IPV6) { 3020 db_printf("in6p_options: %p in6p_outputopts: %p " 3021 "in6p_moptions: %p\n", inp->in6p_options, 3022 inp->in6p_outputopts, inp->in6p_moptions); 3023 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3024 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3025 inp->in6p_hops); 3026 } else 3027 #endif 3028 { 3029 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3030 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3031 inp->inp_options, inp->inp_moptions); 3032 } 3033 3034 db_print_indent(indent); 3035 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3036 (uintmax_t)inp->inp_gencnt); 3037 } 3038 3039 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3040 { 3041 struct inpcb *inp; 3042 3043 if (!have_addr) { 3044 db_printf("usage: show inpcb <addr>\n"); 3045 return; 3046 } 3047 inp = (struct inpcb *)addr; 3048 3049 db_print_inpcb(inp, "inpcb", 0); 3050 } 3051 #endif /* DDB */ 3052 3053 #ifdef RATELIMIT 3054 /* 3055 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3056 * if any. 3057 */ 3058 int 3059 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3060 { 3061 union if_snd_tag_modify_params params = { 3062 .rate_limit.max_rate = max_pacing_rate, 3063 .rate_limit.flags = M_NOWAIT, 3064 }; 3065 struct m_snd_tag *mst; 3066 int error; 3067 3068 mst = inp->inp_snd_tag; 3069 if (mst == NULL) 3070 return (EINVAL); 3071 3072 if (mst->sw->snd_tag_modify == NULL) { 3073 error = EOPNOTSUPP; 3074 } else { 3075 error = mst->sw->snd_tag_modify(mst, ¶ms); 3076 } 3077 return (error); 3078 } 3079 3080 /* 3081 * Query existing TX rate limit based on the existing 3082 * "inp->inp_snd_tag", if any. 3083 */ 3084 int 3085 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3086 { 3087 union if_snd_tag_query_params params = { }; 3088 struct m_snd_tag *mst; 3089 int error; 3090 3091 mst = inp->inp_snd_tag; 3092 if (mst == NULL) 3093 return (EINVAL); 3094 3095 if (mst->sw->snd_tag_query == NULL) { 3096 error = EOPNOTSUPP; 3097 } else { 3098 error = mst->sw->snd_tag_query(mst, ¶ms); 3099 if (error == 0 && p_max_pacing_rate != NULL) 3100 *p_max_pacing_rate = params.rate_limit.max_rate; 3101 } 3102 return (error); 3103 } 3104 3105 /* 3106 * Query existing TX queue level based on the existing 3107 * "inp->inp_snd_tag", if any. 3108 */ 3109 int 3110 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3111 { 3112 union if_snd_tag_query_params params = { }; 3113 struct m_snd_tag *mst; 3114 int error; 3115 3116 mst = inp->inp_snd_tag; 3117 if (mst == NULL) 3118 return (EINVAL); 3119 3120 if (mst->sw->snd_tag_query == NULL) 3121 return (EOPNOTSUPP); 3122 3123 error = mst->sw->snd_tag_query(mst, ¶ms); 3124 if (error == 0 && p_txqueue_level != NULL) 3125 *p_txqueue_level = params.rate_limit.queue_level; 3126 return (error); 3127 } 3128 3129 /* 3130 * Allocate a new TX rate limit send tag from the network interface 3131 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3132 */ 3133 int 3134 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3135 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3136 3137 { 3138 union if_snd_tag_alloc_params params = { 3139 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3140 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3141 .rate_limit.hdr.flowid = flowid, 3142 .rate_limit.hdr.flowtype = flowtype, 3143 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3144 .rate_limit.max_rate = max_pacing_rate, 3145 .rate_limit.flags = M_NOWAIT, 3146 }; 3147 int error; 3148 3149 INP_WLOCK_ASSERT(inp); 3150 3151 /* 3152 * If there is already a send tag, or the INP is being torn 3153 * down, allocating a new send tag is not allowed. Else send 3154 * tags may leak. 3155 */ 3156 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3157 return (EINVAL); 3158 3159 error = m_snd_tag_alloc(ifp, ¶ms, st); 3160 #ifdef INET 3161 if (error == 0) { 3162 counter_u64_add(rate_limit_set_ok, 1); 3163 counter_u64_add(rate_limit_active, 1); 3164 } else if (error != EOPNOTSUPP) 3165 counter_u64_add(rate_limit_alloc_fail, 1); 3166 #endif 3167 return (error); 3168 } 3169 3170 void 3171 in_pcbdetach_tag(struct m_snd_tag *mst) 3172 { 3173 3174 m_snd_tag_rele(mst); 3175 #ifdef INET 3176 counter_u64_add(rate_limit_active, -1); 3177 #endif 3178 } 3179 3180 /* 3181 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3182 * if any: 3183 */ 3184 void 3185 in_pcbdetach_txrtlmt(struct inpcb *inp) 3186 { 3187 struct m_snd_tag *mst; 3188 3189 INP_WLOCK_ASSERT(inp); 3190 3191 mst = inp->inp_snd_tag; 3192 inp->inp_snd_tag = NULL; 3193 3194 if (mst == NULL) 3195 return; 3196 3197 m_snd_tag_rele(mst); 3198 #ifdef INET 3199 counter_u64_add(rate_limit_active, -1); 3200 #endif 3201 } 3202 3203 int 3204 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3205 { 3206 int error; 3207 3208 /* 3209 * If the existing send tag is for the wrong interface due to 3210 * a route change, first drop the existing tag. Set the 3211 * CHANGED flag so that we will keep trying to allocate a new 3212 * tag if we fail to allocate one this time. 3213 */ 3214 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3215 in_pcbdetach_txrtlmt(inp); 3216 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3217 } 3218 3219 /* 3220 * NOTE: When attaching to a network interface a reference is 3221 * made to ensure the network interface doesn't go away until 3222 * all ratelimit connections are gone. The network interface 3223 * pointers compared below represent valid network interfaces, 3224 * except when comparing towards NULL. 3225 */ 3226 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3227 error = 0; 3228 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3229 if (inp->inp_snd_tag != NULL) 3230 in_pcbdetach_txrtlmt(inp); 3231 error = 0; 3232 } else if (inp->inp_snd_tag == NULL) { 3233 /* 3234 * In order to utilize packet pacing with RSS, we need 3235 * to wait until there is a valid RSS hash before we 3236 * can proceed: 3237 */ 3238 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3239 error = EAGAIN; 3240 } else { 3241 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3242 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3243 } 3244 } else { 3245 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3246 } 3247 if (error == 0 || error == EOPNOTSUPP) 3248 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3249 3250 return (error); 3251 } 3252 3253 /* 3254 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3255 * is set in the fast path and will attach/detach/modify the TX rate 3256 * limit send tag based on the socket's so_max_pacing_rate value. 3257 */ 3258 void 3259 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3260 { 3261 struct socket *socket; 3262 uint32_t max_pacing_rate; 3263 bool did_upgrade; 3264 3265 if (inp == NULL) 3266 return; 3267 3268 socket = inp->inp_socket; 3269 if (socket == NULL) 3270 return; 3271 3272 if (!INP_WLOCKED(inp)) { 3273 /* 3274 * NOTE: If the write locking fails, we need to bail 3275 * out and use the non-ratelimited ring for the 3276 * transmit until there is a new chance to get the 3277 * write lock. 3278 */ 3279 if (!INP_TRY_UPGRADE(inp)) 3280 return; 3281 did_upgrade = 1; 3282 } else { 3283 did_upgrade = 0; 3284 } 3285 3286 /* 3287 * NOTE: The so_max_pacing_rate value is read unlocked, 3288 * because atomic updates are not required since the variable 3289 * is checked at every mbuf we send. It is assumed that the 3290 * variable read itself will be atomic. 3291 */ 3292 max_pacing_rate = socket->so_max_pacing_rate; 3293 3294 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3295 3296 if (did_upgrade) 3297 INP_DOWNGRADE(inp); 3298 } 3299 3300 /* 3301 * Track route changes for TX rate limiting. 3302 */ 3303 void 3304 in_pcboutput_eagain(struct inpcb *inp) 3305 { 3306 bool did_upgrade; 3307 3308 if (inp == NULL) 3309 return; 3310 3311 if (inp->inp_snd_tag == NULL) 3312 return; 3313 3314 if (!INP_WLOCKED(inp)) { 3315 /* 3316 * NOTE: If the write locking fails, we need to bail 3317 * out and use the non-ratelimited ring for the 3318 * transmit until there is a new chance to get the 3319 * write lock. 3320 */ 3321 if (!INP_TRY_UPGRADE(inp)) 3322 return; 3323 did_upgrade = 1; 3324 } else { 3325 did_upgrade = 0; 3326 } 3327 3328 /* detach rate limiting */ 3329 in_pcbdetach_txrtlmt(inp); 3330 3331 /* make sure new mbuf send tag allocation is made */ 3332 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3333 3334 if (did_upgrade) 3335 INP_DOWNGRADE(inp); 3336 } 3337 3338 #ifdef INET 3339 static void 3340 rl_init(void *st) 3341 { 3342 rate_limit_new = counter_u64_alloc(M_WAITOK); 3343 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3344 rate_limit_active = counter_u64_alloc(M_WAITOK); 3345 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3346 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3347 } 3348 3349 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3350 #endif 3351 #endif /* RATELIMIT */ 3352