1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/hash.h> 53 #include <sys/systm.h> 54 #include <sys/libkern.h> 55 #include <sys/lock.h> 56 #include <sys/malloc.h> 57 #include <sys/mbuf.h> 58 #include <sys/eventhandler.h> 59 #include <sys/domain.h> 60 #include <sys/protosw.h> 61 #include <sys/smp.h> 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <sys/sockio.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/refcount.h> 68 #include <sys/jail.h> 69 #include <sys/kernel.h> 70 #include <sys/sysctl.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 #include <vm/vm.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #ifdef INET 92 #include <netinet/in_var.h> 93 #include <netinet/in_fib.h> 94 #endif 95 #include <netinet/ip_var.h> 96 #include <netinet/tcp_var.h> 97 #ifdef TCPHPTS 98 #include <netinet/tcp_hpts.h> 99 #endif 100 #include <netinet/udp.h> 101 #include <netinet/udp_var.h> 102 #ifdef INET6 103 #include <netinet/ip6.h> 104 #include <netinet6/in6_pcb.h> 105 #include <netinet6/in6_var.h> 106 #include <netinet6/ip6_var.h> 107 #endif /* INET6 */ 108 #include <net/route/nhop.h> 109 #endif 110 111 #include <netipsec/ipsec_support.h> 112 113 #include <security/mac/mac_framework.h> 114 115 #define INPCBLBGROUP_SIZMIN 8 116 #define INPCBLBGROUP_SIZMAX 256 117 #define INP_FREED 0x00000200 /* See in_pcb.h. 
*/ 118 119 /* 120 * These configure the range of local port addresses assigned to 121 * "unspecified" outgoing connections/packets/whatever. 122 */ 123 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 124 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 125 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 126 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 127 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 128 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 129 130 /* 131 * Reserved ports accessible only to root. There are significant 132 * security considerations that must be accounted for when changing these, 133 * but the security benefits can be great. Please be careful. 134 */ 135 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 136 VNET_DEFINE(int, ipport_reservedlow); 137 138 /* Enable random ephemeral port allocation by default. 
*/ 139 VNET_DEFINE(int, ipport_randomized) = 1; 140 141 #ifdef INET 142 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 143 struct in_addr faddr, u_int fport_arg, 144 struct in_addr laddr, u_int lport_arg, 145 int lookupflags, struct ifnet *ifp, 146 uint8_t numa_domain); 147 148 #define RANGECHK(var, min, max) \ 149 if ((var) < (min)) { (var) = (min); } \ 150 else if ((var) > (max)) { (var) = (max); } 151 152 static int 153 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 154 { 155 int error; 156 157 error = sysctl_handle_int(oidp, arg1, arg2, req); 158 if (error == 0) { 159 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 160 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 161 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 162 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 163 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 164 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 165 } 166 return (error); 167 } 168 169 #undef RANGECHK 170 171 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 172 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 173 "IP Ports"); 174 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, 
OID_AUTO, hifirst, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 196 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 197 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 198 ""); 199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 200 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 201 &VNET_NAME(ipport_reservedhigh), 0, ""); 202 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 203 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 205 CTLFLAG_VNET | CTLFLAG_RW, 206 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 207 208 #ifdef RATELIMIT 209 counter_u64_t rate_limit_new; 210 counter_u64_t rate_limit_chg; 211 counter_u64_t rate_limit_active; 212 counter_u64_t rate_limit_alloc_fail; 213 counter_u64_t rate_limit_set_ok; 214 215 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 216 "IP Rate Limiting"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 218 &rate_limit_active, "Active rate limited connections"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 220 &rate_limit_alloc_fail, "Rate limited connection failures"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 222 &rate_limit_set_ok, "Rate limited setting succeeded"); 223 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 224 &rate_limit_new, "Total Rate limit new attempts"); 225 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 226 &rate_limit_chg, "Total Rate limited change attempts"); 227 228 #endif /* RATELIMIT */ 229 230 #endif /* INET */ 231 232 VNET_DEFINE(uint32_t, in_pcbhashseed); 233 static void 234 in_pcbhashseed_init(void) 235 { 236 237 V_in_pcbhashseed = arc4random(); 238 } 239 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 240 in_pcbhashseed_init, 0); 241 242 static void in_pcbremhash(struct inpcb *); 243 244 /* 245 * in_pcb.c: manage the Protocol Control Blocks. 246 * 247 * NOTE: It is assumed that most of these functions will be called with 248 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 249 * functions often modify hash chains or addresses in pcbs. 250 */ 251 252 static struct inpcblbgroup * 253 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred, 254 u_char vflag, uint16_t port, const union in_dependaddr *addr, int size, 255 uint8_t numa_domain) 256 { 257 struct inpcblbgroup *grp; 258 size_t bytes; 259 260 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 261 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 262 if (grp == NULL) 263 return (NULL); 264 grp->il_cred = crhold(cred); 265 grp->il_vflag = vflag; 266 grp->il_lport = port; 267 grp->il_numa_domain = numa_domain; 268 grp->il_dependladdr = *addr; 269 grp->il_inpsiz = size; 270 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 271 return (grp); 272 } 273 274 static void 275 in_pcblbgroup_free_deferred(epoch_context_t ctx) 276 { 277 struct inpcblbgroup *grp; 278 279 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 280 crfree(grp->il_cred); 281 free(grp, M_PCB); 282 } 283 284 static void 285 in_pcblbgroup_free(struct inpcblbgroup *grp) 286 { 287 288 CK_LIST_REMOVE(grp, il_list); 289 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 290 } 291 292 static struct inpcblbgroup * 293 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 294 struct inpcblbgroup *old_grp, int size) 295 { 296 struct inpcblbgroup *grp; 297 int i; 298 299 grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag, 300 old_grp->il_lport, &old_grp->il_dependladdr, size, 301 old_grp->il_numa_domain); 302 if (grp == NULL) 303 return (NULL); 304 305 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 306 ("invalid new local 
group size %d and old local group count %d", 307 grp->il_inpsiz, old_grp->il_inpcnt)); 308 309 for (i = 0; i < old_grp->il_inpcnt; ++i) 310 grp->il_inp[i] = old_grp->il_inp[i]; 311 grp->il_inpcnt = old_grp->il_inpcnt; 312 in_pcblbgroup_free(old_grp); 313 return (grp); 314 } 315 316 /* 317 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 318 * and shrink group if possible. 319 */ 320 static void 321 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 322 int i) 323 { 324 struct inpcblbgroup *grp, *new_grp; 325 326 grp = *grpp; 327 for (; i + 1 < grp->il_inpcnt; ++i) 328 grp->il_inp[i] = grp->il_inp[i + 1]; 329 grp->il_inpcnt--; 330 331 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 332 grp->il_inpcnt <= grp->il_inpsiz / 4) { 333 /* Shrink this group. */ 334 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 335 if (new_grp != NULL) 336 *grpp = new_grp; 337 } 338 } 339 340 /* 341 * Add PCB to load balance group for SO_REUSEPORT_LB option. 342 */ 343 static int 344 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 345 { 346 const static struct timeval interval = { 60, 0 }; 347 static struct timeval lastprint; 348 struct inpcbinfo *pcbinfo; 349 struct inpcblbgrouphead *hdr; 350 struct inpcblbgroup *grp; 351 uint32_t idx; 352 353 pcbinfo = inp->inp_pcbinfo; 354 355 INP_WLOCK_ASSERT(inp); 356 INP_HASH_WLOCK_ASSERT(pcbinfo); 357 358 #ifdef INET6 359 /* 360 * Don't allow IPv4 mapped INET6 wild socket. 
361 */ 362 if ((inp->inp_vflag & INP_IPV4) && 363 inp->inp_laddr.s_addr == INADDR_ANY && 364 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 365 return (0); 366 } 367 #endif 368 369 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 370 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 371 CK_LIST_FOREACH(grp, hdr, il_list) { 372 if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison && 373 grp->il_vflag == inp->inp_vflag && 374 grp->il_lport == inp->inp_lport && 375 grp->il_numa_domain == numa_domain && 376 memcmp(&grp->il_dependladdr, 377 &inp->inp_inc.inc_ie.ie_dependladdr, 378 sizeof(grp->il_dependladdr)) == 0) { 379 break; 380 } 381 } 382 if (grp == NULL) { 383 /* Create new load balance group. */ 384 grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag, 385 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 386 INPCBLBGROUP_SIZMIN, numa_domain); 387 if (grp == NULL) 388 return (ENOBUFS); 389 } else if (grp->il_inpcnt == grp->il_inpsiz) { 390 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 391 if (ratecheck(&lastprint, &interval)) 392 printf("lb group port %d, limit reached\n", 393 ntohs(grp->il_lport)); 394 return (0); 395 } 396 397 /* Expand this local group. */ 398 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 399 if (grp == NULL) 400 return (ENOBUFS); 401 } 402 403 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 404 ("invalid local group size %d and count %d", grp->il_inpsiz, 405 grp->il_inpcnt)); 406 407 grp->il_inp[grp->il_inpcnt] = inp; 408 grp->il_inpcnt++; 409 return (0); 410 } 411 412 /* 413 * Remove PCB from load balance group. 
414 */ 415 static void 416 in_pcbremlbgrouphash(struct inpcb *inp) 417 { 418 struct inpcbinfo *pcbinfo; 419 struct inpcblbgrouphead *hdr; 420 struct inpcblbgroup *grp; 421 int i; 422 423 pcbinfo = inp->inp_pcbinfo; 424 425 INP_WLOCK_ASSERT(inp); 426 INP_HASH_WLOCK_ASSERT(pcbinfo); 427 428 hdr = &pcbinfo->ipi_lbgrouphashbase[ 429 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 430 CK_LIST_FOREACH(grp, hdr, il_list) { 431 for (i = 0; i < grp->il_inpcnt; ++i) { 432 if (grp->il_inp[i] != inp) 433 continue; 434 435 if (grp->il_inpcnt == 1) { 436 /* We are the last, free this local group. */ 437 in_pcblbgroup_free(grp); 438 } else { 439 /* Pull up inpcbs, shrink group if possible. */ 440 in_pcblbgroup_reorder(hdr, &grp, i); 441 } 442 return; 443 } 444 } 445 } 446 447 int 448 in_pcblbgroup_numa(struct inpcb *inp, int arg) 449 { 450 struct inpcbinfo *pcbinfo; 451 struct inpcblbgrouphead *hdr; 452 struct inpcblbgroup *grp; 453 int err, i; 454 uint8_t numa_domain; 455 456 switch (arg) { 457 case TCP_REUSPORT_LB_NUMA_NODOM: 458 numa_domain = M_NODOM; 459 break; 460 case TCP_REUSPORT_LB_NUMA_CURDOM: 461 numa_domain = PCPU_GET(domain); 462 break; 463 default: 464 if (arg < 0 || arg >= vm_ndomains) 465 return (EINVAL); 466 numa_domain = arg; 467 } 468 469 err = 0; 470 pcbinfo = inp->inp_pcbinfo; 471 INP_WLOCK_ASSERT(inp); 472 INP_HASH_WLOCK(pcbinfo); 473 hdr = &pcbinfo->ipi_lbgrouphashbase[ 474 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 475 CK_LIST_FOREACH(grp, hdr, il_list) { 476 for (i = 0; i < grp->il_inpcnt; ++i) { 477 if (grp->il_inp[i] != inp) 478 continue; 479 480 if (grp->il_numa_domain == numa_domain) { 481 goto abort_with_hash_wlock; 482 } 483 484 /* Remove it from the old group. */ 485 in_pcbremlbgrouphash(inp); 486 487 /* Add it to the new group based on numa domain. 
*/ 488 in_pcbinslbgrouphash(inp, numa_domain); 489 goto abort_with_hash_wlock; 490 } 491 } 492 err = ENOENT; 493 abort_with_hash_wlock: 494 INP_HASH_WUNLOCK(pcbinfo); 495 return (err); 496 } 497 498 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 499 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 500 501 /* 502 * Initialize an inpcbinfo - a per-VNET instance of connections db. 503 */ 504 void 505 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 506 u_int hash_nelements, u_int porthash_nelements) 507 { 508 509 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 510 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 511 NULL, MTX_DEF); 512 #ifdef VIMAGE 513 pcbinfo->ipi_vnet = curvnet; 514 #endif 515 CK_LIST_INIT(&pcbinfo->ipi_listhead); 516 pcbinfo->ipi_count = 0; 517 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 518 &pcbinfo->ipi_hashmask); 519 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 520 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 521 &pcbinfo->ipi_porthashmask); 522 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 523 &pcbinfo->ipi_lbgrouphashmask); 524 pcbinfo->ipi_zone = pcbstor->ips_zone; 525 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 526 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 527 } 528 529 /* 530 * Destroy an inpcbinfo. 
531 */ 532 void 533 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 534 { 535 536 KASSERT(pcbinfo->ipi_count == 0, 537 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 538 539 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 540 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 541 pcbinfo->ipi_porthashmask); 542 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 543 pcbinfo->ipi_lbgrouphashmask); 544 mtx_destroy(&pcbinfo->ipi_hash_lock); 545 mtx_destroy(&pcbinfo->ipi_lock); 546 } 547 548 /* 549 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 550 */ 551 static void inpcb_dtor(void *, int, void *); 552 static void inpcb_fini(void *, int); 553 void 554 in_pcbstorage_init(void *arg) 555 { 556 struct inpcbstorage *pcbstor = arg; 557 558 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 559 sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit, 560 inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); 561 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 562 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 563 uma_zone_set_smr(pcbstor->ips_portzone, 564 uma_zone_get_smr(pcbstor->ips_zone)); 565 } 566 567 /* 568 * Destroy a pcbstorage - used by unloadable protocols. 569 */ 570 void 571 in_pcbstorage_destroy(void *arg) 572 { 573 struct inpcbstorage *pcbstor = arg; 574 575 uma_zdestroy(pcbstor->ips_zone); 576 uma_zdestroy(pcbstor->ips_portzone); 577 } 578 579 /* 580 * Allocate a PCB and associate it with the socket. 581 * On success return with the PCB locked. 
582 */ 583 int 584 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 585 { 586 struct inpcb *inp; 587 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 588 int error; 589 #endif 590 591 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 592 if (inp == NULL) 593 return (ENOBUFS); 594 bzero(&inp->inp_start_zero, inp_zero_size); 595 #ifdef NUMA 596 inp->inp_numa_domain = M_NODOM; 597 #endif 598 inp->inp_pcbinfo = pcbinfo; 599 inp->inp_socket = so; 600 inp->inp_cred = crhold(so->so_cred); 601 inp->inp_inc.inc_fibnum = so->so_fibnum; 602 #ifdef MAC 603 error = mac_inpcb_init(inp, M_NOWAIT); 604 if (error != 0) 605 goto out; 606 mac_inpcb_create(so, inp); 607 #endif 608 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 609 error = ipsec_init_pcbpolicy(inp); 610 if (error != 0) { 611 #ifdef MAC 612 mac_inpcb_destroy(inp); 613 #endif 614 goto out; 615 } 616 #endif /*IPSEC*/ 617 #ifdef INET6 618 if (INP_SOCKAF(so) == AF_INET6) { 619 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 620 if (V_ip6_v6only) 621 inp->inp_flags |= IN6P_IPV6_V6ONLY; 622 #ifdef INET 623 else 624 inp->inp_vflag |= INP_IPV4; 625 #endif 626 if (V_ip6_auto_flowlabel) 627 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 628 inp->in6p_hops = -1; /* use kernel default */ 629 } 630 #endif 631 #if defined(INET) && defined(INET6) 632 else 633 #endif 634 #ifdef INET 635 inp->inp_vflag |= INP_IPV4; 636 #endif 637 /* 638 * Routes in inpcb's can cache L2 as well; they are guaranteed 639 * to be cleaned up. 640 */ 641 inp->inp_route.ro_flags = RT_LLE_CACHE; 642 #ifdef TCPHPTS 643 /* 644 * If using hpts lets drop a random number in so 645 * not all new connections fall on the same CPU. 646 */ 647 inp->inp_hpts_cpu = hpts_random_cpu(inp); 648 #endif 649 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 650 INP_WLOCK(inp); 651 INP_INFO_WLOCK(pcbinfo); 652 pcbinfo->ipi_count++; 653 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 654 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 655 INP_INFO_WUNLOCK(pcbinfo); 656 so->so_pcb = inp; 657 658 return (0); 659 660 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 661 out: 662 uma_zfree_smr(pcbinfo->ipi_zone, inp); 663 return (error); 664 #endif 665 } 666 667 #ifdef INET 668 int 669 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 670 { 671 int anonport, error; 672 673 KASSERT(nam == NULL || nam->sa_family == AF_INET, 674 ("%s: invalid address family for %p", __func__, nam)); 675 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 676 ("%s: invalid address length for %p", __func__, nam)); 677 INP_WLOCK_ASSERT(inp); 678 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 679 680 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 681 return (EINVAL); 682 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 683 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 684 &inp->inp_lport, cred); 685 if (error) 686 return (error); 687 if (in_pcbinshash(inp) != 0) { 688 inp->inp_laddr.s_addr = INADDR_ANY; 689 inp->inp_lport = 0; 690 return (EAGAIN); 691 } 692 if (anonport) 693 inp->inp_flags |= INP_ANONPORT; 694 return (0); 695 } 696 #endif 697 698 #if defined(INET) || defined(INET6) 699 /* 700 * Assign a local port like in_pcb_lport(), but also used with connect() 701 * and a foreign address and port. If fsa is non-NULL, choose a local port 702 * that is unused with those, otherwise one that is completely unused. 703 * lsa can be NULL for IPv6. 
704 */ 705 int 706 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 707 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 708 { 709 struct inpcbinfo *pcbinfo; 710 struct inpcb *tmpinp; 711 unsigned short *lastport; 712 int count, error; 713 u_short aux, first, last, lport; 714 #ifdef INET 715 struct in_addr laddr, faddr; 716 #endif 717 #ifdef INET6 718 struct in6_addr *laddr6, *faddr6; 719 #endif 720 721 pcbinfo = inp->inp_pcbinfo; 722 723 /* 724 * Because no actual state changes occur here, a global write lock on 725 * the pcbinfo isn't required. 726 */ 727 INP_LOCK_ASSERT(inp); 728 INP_HASH_LOCK_ASSERT(pcbinfo); 729 730 if (inp->inp_flags & INP_HIGHPORT) { 731 first = V_ipport_hifirstauto; /* sysctl */ 732 last = V_ipport_hilastauto; 733 lastport = &pcbinfo->ipi_lasthi; 734 } else if (inp->inp_flags & INP_LOWPORT) { 735 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 736 if (error) 737 return (error); 738 first = V_ipport_lowfirstauto; /* 1023 */ 739 last = V_ipport_lowlastauto; /* 600 */ 740 lastport = &pcbinfo->ipi_lastlow; 741 } else { 742 first = V_ipport_firstauto; /* sysctl */ 743 last = V_ipport_lastauto; 744 lastport = &pcbinfo->ipi_lastport; 745 } 746 747 /* 748 * Instead of having two loops further down counting up or down 749 * make sure that first is always <= last and go with only one 750 * code path implementing all logic. 
751 */ 752 if (first > last) { 753 aux = first; 754 first = last; 755 last = aux; 756 } 757 758 #ifdef INET 759 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 760 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 761 if (lsa != NULL) 762 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 763 if (fsa != NULL) 764 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 765 } 766 #endif 767 #ifdef INET6 768 laddr6 = NULL; 769 if ((inp->inp_vflag & INP_IPV6) != 0) { 770 if (lsa != NULL) 771 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 772 if (fsa != NULL) 773 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 774 } 775 #endif 776 777 tmpinp = NULL; 778 lport = *lportp; 779 780 if (V_ipport_randomized) 781 *lastport = first + (arc4random() % (last - first)); 782 783 count = last - first; 784 785 do { 786 if (count-- < 0) /* completely used? */ 787 return (EADDRNOTAVAIL); 788 ++*lastport; 789 if (*lastport < first || *lastport > last) 790 *lastport = first; 791 lport = htons(*lastport); 792 793 if (fsa != NULL) { 794 #ifdef INET 795 if (lsa->sa_family == AF_INET) { 796 tmpinp = in_pcblookup_hash_locked(pcbinfo, 797 faddr, fport, laddr, lport, lookupflags, 798 NULL, M_NODOM); 799 } 800 #endif 801 #ifdef INET6 802 if (lsa->sa_family == AF_INET6) { 803 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 804 faddr6, fport, laddr6, lport, lookupflags, 805 NULL, M_NODOM); 806 } 807 #endif 808 } else { 809 #ifdef INET6 810 if ((inp->inp_vflag & INP_IPV6) != 0) { 811 tmpinp = in6_pcblookup_local(pcbinfo, 812 &inp->in6p_laddr, lport, lookupflags, cred); 813 #ifdef INET 814 if (tmpinp == NULL && 815 (inp->inp_vflag & INP_IPV4)) 816 tmpinp = in_pcblookup_local(pcbinfo, 817 laddr, lport, lookupflags, cred); 818 #endif 819 } 820 #endif 821 #if defined(INET) && defined(INET6) 822 else 823 #endif 824 #ifdef INET 825 tmpinp = in_pcblookup_local(pcbinfo, laddr, 826 lport, lookupflags, cred); 827 #endif 828 } 829 } while (tmpinp != NULL); 830 831 *lportp = lport; 832 833 return 
(0); 834 } 835 836 /* 837 * Select a local port (number) to use. 838 */ 839 int 840 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 841 struct ucred *cred, int lookupflags) 842 { 843 struct sockaddr_in laddr; 844 845 if (laddrp) { 846 bzero(&laddr, sizeof(laddr)); 847 laddr.sin_family = AF_INET; 848 laddr.sin_addr = *laddrp; 849 } 850 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 851 NULL, lportp, NULL, 0, cred, lookupflags)); 852 } 853 854 /* 855 * Return cached socket options. 856 */ 857 int 858 inp_so_options(const struct inpcb *inp) 859 { 860 int so_options; 861 862 so_options = 0; 863 864 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 865 so_options |= SO_REUSEPORT_LB; 866 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 867 so_options |= SO_REUSEPORT; 868 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 869 so_options |= SO_REUSEADDR; 870 return (so_options); 871 } 872 #endif /* INET || INET6 */ 873 874 /* 875 * Check if a new BINDMULTI socket is allowed to be created. 876 * 877 * ni points to the new inp. 878 * oi points to the existing inp. 879 * 880 * This checks whether the existing inp also has BINDMULTI and 881 * whether the credentials match. 882 */ 883 int 884 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) 885 { 886 /* Check permissions match */ 887 if ((ni->inp_flags2 & INP_BINDMULTI) && 888 (ni->inp_cred->cr_uid != 889 oi->inp_cred->cr_uid)) 890 return (0); 891 892 /* Check the existing inp has BINDMULTI set */ 893 if ((ni->inp_flags2 & INP_BINDMULTI) && 894 ((oi->inp_flags2 & INP_BINDMULTI) == 0)) 895 return (0); 896 897 /* 898 * We're okay - either INP_BINDMULTI isn't set on ni, or 899 * it is and it matches the checks. 900 */ 901 return (1); 902 } 903 904 #ifdef INET 905 /* 906 * Set up a bind operation on a PCB, performing port allocation 907 * as required, but do not actually modify the PCB. 
Callers can 908 * either complete the bind by setting inp_laddr/inp_lport and 909 * calling in_pcbinshash(), or they can just use the resulting 910 * port and address to authorise the sending of a once-off packet. 911 * 912 * On error, the values of *laddrp and *lportp are not changed. 913 */ 914 int 915 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 916 u_short *lportp, struct ucred *cred) 917 { 918 struct socket *so = inp->inp_socket; 919 struct sockaddr_in *sin; 920 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 921 struct in_addr laddr; 922 u_short lport = 0; 923 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); 924 int error; 925 926 /* 927 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here 928 * so that we don't have to add to the (already messy) code below. 929 */ 930 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); 931 932 /* 933 * No state changes, so read locks are sufficient here. 934 */ 935 INP_LOCK_ASSERT(inp); 936 INP_HASH_LOCK_ASSERT(pcbinfo); 937 938 laddr.s_addr = *laddrp; 939 if (nam != NULL && laddr.s_addr != INADDR_ANY) 940 return (EINVAL); 941 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) 942 lookupflags = INPLOOKUP_WILDCARD; 943 if (nam == NULL) { 944 if ((error = prison_local_ip4(cred, &laddr)) != 0) 945 return (error); 946 } else { 947 sin = (struct sockaddr_in *)nam; 948 KASSERT(sin->sin_family == AF_INET, 949 ("%s: invalid family for address %p", __func__, sin)); 950 KASSERT(sin->sin_len == sizeof(*sin), 951 ("%s: invalid length for address %p", __func__, sin)); 952 953 error = prison_local_ip4(cred, &sin->sin_addr); 954 if (error) 955 return (error); 956 if (sin->sin_port != *lportp) { 957 /* Don't allow the port to change. */ 958 if (*lportp != 0) 959 return (EINVAL); 960 lport = sin->sin_port; 961 } 962 /* NB: lport is left as 0 if the port isn't being changed. 
*/ 963 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 964 /* 965 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 966 * allow complete duplication of binding if 967 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 968 * and a multicast address is bound on both 969 * new and duplicated sockets. 970 */ 971 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) 972 reuseport = SO_REUSEADDR|SO_REUSEPORT; 973 /* 974 * XXX: How to deal with SO_REUSEPORT_LB here? 975 * Treat same as SO_REUSEPORT for now. 976 */ 977 if ((so->so_options & 978 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) 979 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; 980 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 981 sin->sin_port = 0; /* yech... */ 982 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 983 /* 984 * Is the address a local IP address? 985 * If INP_BINDANY is set, then the socket may be bound 986 * to any endpoint address, local or not. 987 */ 988 if ((inp->inp_flags & INP_BINDANY) == 0 && 989 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 990 return (EADDRNOTAVAIL); 991 } 992 laddr = sin->sin_addr; 993 if (lport) { 994 struct inpcb *t; 995 996 /* GROSS */ 997 if (ntohs(lport) <= V_ipport_reservedhigh && 998 ntohs(lport) >= V_ipport_reservedlow && 999 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 1000 return (EACCES); 1001 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 1002 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 1003 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1004 lport, INPLOOKUP_WILDCARD, cred); 1005 /* 1006 * XXX 1007 * This entire block sorely needs a rewrite. 
1008 */ 1009 if (t && 1010 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1011 (so->so_type != SOCK_STREAM || 1012 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 1013 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 1014 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 1015 (t->inp_flags2 & INP_REUSEPORT) || 1016 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && 1017 (inp->inp_cred->cr_uid != 1018 t->inp_cred->cr_uid)) 1019 return (EADDRINUSE); 1020 1021 /* 1022 * If the socket is a BINDMULTI socket, then 1023 * the credentials need to match and the 1024 * original socket also has to have been bound 1025 * with BINDMULTI. 1026 */ 1027 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1028 return (EADDRINUSE); 1029 } 1030 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1031 lport, lookupflags, cred); 1032 if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1033 (reuseport & inp_so_options(t)) == 0 && 1034 (reuseport_lb & inp_so_options(t)) == 0) { 1035 #ifdef INET6 1036 if (ntohl(sin->sin_addr.s_addr) != 1037 INADDR_ANY || 1038 ntohl(t->inp_laddr.s_addr) != 1039 INADDR_ANY || 1040 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 1041 (t->inp_vflag & INP_IPV6PROTO) == 0) 1042 #endif 1043 return (EADDRINUSE); 1044 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1045 return (EADDRINUSE); 1046 } 1047 } 1048 } 1049 if (*lportp != 0) 1050 lport = *lportp; 1051 if (lport == 0) { 1052 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1053 if (error != 0) 1054 return (error); 1055 } 1056 *laddrp = laddr.s_addr; 1057 *lportp = lport; 1058 return (0); 1059 } 1060 1061 /* 1062 * Connect from a socket to a specified address. 1063 * Both address and port must be specified in argument sin. 1064 * If don't have a local address for this socket yet, 1065 * then pick one. 
1066 */ 1067 int 1068 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, 1069 bool rehash) 1070 { 1071 u_short lport, fport; 1072 in_addr_t laddr, faddr; 1073 int anonport, error; 1074 1075 INP_WLOCK_ASSERT(inp); 1076 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1077 1078 lport = inp->inp_lport; 1079 laddr = inp->inp_laddr.s_addr; 1080 anonport = (lport == 0); 1081 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 1082 NULL, cred); 1083 if (error) 1084 return (error); 1085 1086 /* Do the initial binding of the local address if required. */ 1087 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1088 KASSERT(rehash == true, 1089 ("Rehashing required for unbound inps")); 1090 inp->inp_lport = lport; 1091 inp->inp_laddr.s_addr = laddr; 1092 if (in_pcbinshash(inp) != 0) { 1093 inp->inp_laddr.s_addr = INADDR_ANY; 1094 inp->inp_lport = 0; 1095 return (EAGAIN); 1096 } 1097 } 1098 1099 /* Commit the remaining changes. */ 1100 inp->inp_lport = lport; 1101 inp->inp_laddr.s_addr = laddr; 1102 inp->inp_faddr.s_addr = faddr; 1103 inp->inp_fport = fport; 1104 if (rehash) { 1105 in_pcbrehash(inp); 1106 } else { 1107 in_pcbinshash(inp); 1108 } 1109 1110 if (anonport) 1111 inp->inp_flags |= INP_ANONPORT; 1112 return (0); 1113 } 1114 1115 /* 1116 * Do proper source address selection on an unbound socket in case 1117 * of connect. Take jails into account as well. 1118 */ 1119 int 1120 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1121 struct ucred *cred) 1122 { 1123 struct ifaddr *ifa; 1124 struct sockaddr *sa; 1125 struct sockaddr_in *sin, dst; 1126 struct nhop_object *nh; 1127 int error; 1128 1129 NET_EPOCH_ASSERT(); 1130 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1131 1132 /* 1133 * Bypass source address selection and use the primary jail IP 1134 * if requested. 
1135 */ 1136 if (!prison_saddrsel_ip4(cred, laddr)) 1137 return (0); 1138 1139 error = 0; 1140 1141 nh = NULL; 1142 bzero(&dst, sizeof(dst)); 1143 sin = &dst; 1144 sin->sin_family = AF_INET; 1145 sin->sin_len = sizeof(struct sockaddr_in); 1146 sin->sin_addr.s_addr = faddr->s_addr; 1147 1148 /* 1149 * If route is known our src addr is taken from the i/f, 1150 * else punt. 1151 * 1152 * Find out route to destination. 1153 */ 1154 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1155 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1156 0, NHR_NONE, 0); 1157 1158 /* 1159 * If we found a route, use the address corresponding to 1160 * the outgoing interface. 1161 * 1162 * Otherwise assume faddr is reachable on a directly connected 1163 * network and try to find a corresponding interface to take 1164 * the source address from. 1165 */ 1166 if (nh == NULL || nh->nh_ifp == NULL) { 1167 struct in_ifaddr *ia; 1168 struct ifnet *ifp; 1169 1170 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1171 inp->inp_socket->so_fibnum)); 1172 if (ia == NULL) { 1173 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1174 inp->inp_socket->so_fibnum)); 1175 } 1176 if (ia == NULL) { 1177 error = ENETUNREACH; 1178 goto done; 1179 } 1180 1181 if (!prison_flag(cred, PR_IP4)) { 1182 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1183 goto done; 1184 } 1185 1186 ifp = ia->ia_ifp; 1187 ia = NULL; 1188 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1189 sa = ifa->ifa_addr; 1190 if (sa->sa_family != AF_INET) 1191 continue; 1192 sin = (struct sockaddr_in *)sa; 1193 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1194 ia = (struct in_ifaddr *)ifa; 1195 break; 1196 } 1197 } 1198 if (ia != NULL) { 1199 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1200 goto done; 1201 } 1202 1203 /* 3. As a last resort return the 'default' jail address. 
*/ 1204 error = prison_get_ip4(cred, laddr); 1205 goto done; 1206 } 1207 1208 /* 1209 * If the outgoing interface on the route found is not 1210 * a loopback interface, use the address from that interface. 1211 * In case of jails do those three steps: 1212 * 1. check if the interface address belongs to the jail. If so use it. 1213 * 2. check if we have any address on the outgoing interface 1214 * belonging to this jail. If so use it. 1215 * 3. as a last resort return the 'default' jail address. 1216 */ 1217 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1218 struct in_ifaddr *ia; 1219 struct ifnet *ifp; 1220 1221 /* If not jailed, use the default returned. */ 1222 if (!prison_flag(cred, PR_IP4)) { 1223 ia = (struct in_ifaddr *)nh->nh_ifa; 1224 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1225 goto done; 1226 } 1227 1228 /* Jailed. */ 1229 /* 1. Check if the iface address belongs to the jail. */ 1230 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1231 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1232 ia = (struct in_ifaddr *)nh->nh_ifa; 1233 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1234 goto done; 1235 } 1236 1237 /* 1238 * 2. Check if we have any address on the outgoing interface 1239 * belonging to this jail. 1240 */ 1241 ia = NULL; 1242 ifp = nh->nh_ifp; 1243 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1244 sa = ifa->ifa_addr; 1245 if (sa->sa_family != AF_INET) 1246 continue; 1247 sin = (struct sockaddr_in *)sa; 1248 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1249 ia = (struct in_ifaddr *)ifa; 1250 break; 1251 } 1252 } 1253 if (ia != NULL) { 1254 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1255 goto done; 1256 } 1257 1258 /* 3. As a last resort return the 'default' jail address. */ 1259 error = prison_get_ip4(cred, laddr); 1260 goto done; 1261 } 1262 1263 /* 1264 * The outgoing interface is marked with 'loopback net', so a route 1265 * to ourselves is here. 
1266 * Try to find the interface of the destination address and then 1267 * take the address from there. That interface is not necessarily 1268 * a loopback interface. 1269 * In case of jails, check that it is an address of the jail 1270 * and if we cannot find, fall back to the 'default' jail address. 1271 */ 1272 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1273 struct in_ifaddr *ia; 1274 1275 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1276 inp->inp_socket->so_fibnum)); 1277 if (ia == NULL) 1278 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1279 inp->inp_socket->so_fibnum)); 1280 if (ia == NULL) 1281 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1282 1283 if (!prison_flag(cred, PR_IP4)) { 1284 if (ia == NULL) { 1285 error = ENETUNREACH; 1286 goto done; 1287 } 1288 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1289 goto done; 1290 } 1291 1292 /* Jailed. */ 1293 if (ia != NULL) { 1294 struct ifnet *ifp; 1295 1296 ifp = ia->ia_ifp; 1297 ia = NULL; 1298 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1299 sa = ifa->ifa_addr; 1300 if (sa->sa_family != AF_INET) 1301 continue; 1302 sin = (struct sockaddr_in *)sa; 1303 if (prison_check_ip4(cred, 1304 &sin->sin_addr) == 0) { 1305 ia = (struct in_ifaddr *)ifa; 1306 break; 1307 } 1308 } 1309 if (ia != NULL) { 1310 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1311 goto done; 1312 } 1313 } 1314 1315 /* 3. As a last resort return the 'default' jail address. */ 1316 error = prison_get_ip4(cred, laddr); 1317 goto done; 1318 } 1319 1320 done: 1321 return (error); 1322 } 1323 1324 /* 1325 * Set up for a connect from a socket to the specified address. 1326 * On entry, *laddrp and *lportp should contain the current local 1327 * address and port for the PCB; these are updated to the values 1328 * that should be placed in inp_laddr and inp_lport to complete 1329 * the connect. 1330 * 1331 * On success, *faddrp and *fportp will be set to the remote address 1332 * and port. These are not updated in the error case. 
1333 * 1334 * If the operation fails because the connection already exists, 1335 * *oinpp will be set to the PCB of that connection so that the 1336 * caller can decide to override it. In all other cases, *oinpp 1337 * is set to NULL. 1338 */ 1339 int 1340 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, 1341 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, 1342 struct inpcb **oinpp, struct ucred *cred) 1343 { 1344 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1345 struct in_ifaddr *ia; 1346 struct inpcb *oinp; 1347 struct in_addr laddr, faddr; 1348 u_short lport, fport; 1349 int error; 1350 1351 KASSERT(sin->sin_family == AF_INET, 1352 ("%s: invalid address family for %p", __func__, sin)); 1353 KASSERT(sin->sin_len == sizeof(*sin), 1354 ("%s: invalid address length for %p", __func__, sin)); 1355 1356 /* 1357 * Because a global state change doesn't actually occur here, a read 1358 * lock is sufficient. 1359 */ 1360 NET_EPOCH_ASSERT(); 1361 INP_LOCK_ASSERT(inp); 1362 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1363 1364 if (oinpp != NULL) 1365 *oinpp = NULL; 1366 if (sin->sin_port == 0) 1367 return (EADDRNOTAVAIL); 1368 laddr.s_addr = *laddrp; 1369 lport = *lportp; 1370 faddr = sin->sin_addr; 1371 fport = sin->sin_port; 1372 #ifdef ROUTE_MPATH 1373 if (CALC_FLOWID_OUTBOUND) { 1374 uint32_t hash_val, hash_type; 1375 1376 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, 1377 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1378 1379 inp->inp_flowid = hash_val; 1380 inp->inp_flowtype = hash_type; 1381 } 1382 #endif 1383 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { 1384 /* 1385 * If the destination address is INADDR_ANY, 1386 * use the primary local address. 1387 * If the supplied address is INADDR_BROADCAST, 1388 * and the primary interface supports broadcast, 1389 * choose the broadcast address for that interface. 
1390 */ 1391 if (faddr.s_addr == INADDR_ANY) { 1392 faddr = 1393 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1394 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1395 return (error); 1396 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { 1397 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & 1398 IFF_BROADCAST) 1399 faddr = satosin(&CK_STAILQ_FIRST( 1400 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1401 } 1402 } 1403 if (laddr.s_addr == INADDR_ANY) { 1404 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1405 /* 1406 * If the destination address is multicast and an outgoing 1407 * interface has been set as a multicast option, prefer the 1408 * address of that interface as our source address. 1409 */ 1410 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 1411 inp->inp_moptions != NULL) { 1412 struct ip_moptions *imo; 1413 struct ifnet *ifp; 1414 1415 imo = inp->inp_moptions; 1416 if (imo->imo_multicast_ifp != NULL) { 1417 ifp = imo->imo_multicast_ifp; 1418 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1419 if (ia->ia_ifp == ifp && 1420 prison_check_ip4(cred, 1421 &ia->ia_addr.sin_addr) == 0) 1422 break; 1423 } 1424 if (ia == NULL) 1425 error = EADDRNOTAVAIL; 1426 else { 1427 laddr = ia->ia_addr.sin_addr; 1428 error = 0; 1429 } 1430 } 1431 } 1432 if (error) 1433 return (error); 1434 } 1435 1436 if (lport != 0) { 1437 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1438 fport, laddr, lport, 0, NULL, M_NODOM); 1439 if (oinp != NULL) { 1440 if (oinpp != NULL) 1441 *oinpp = oinp; 1442 return (EADDRINUSE); 1443 } 1444 } else { 1445 struct sockaddr_in lsin, fsin; 1446 1447 bzero(&lsin, sizeof(lsin)); 1448 bzero(&fsin, sizeof(fsin)); 1449 lsin.sin_family = AF_INET; 1450 lsin.sin_addr = laddr; 1451 fsin.sin_family = AF_INET; 1452 fsin.sin_addr = faddr; 1453 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, 1454 &lport, (struct sockaddr *)& fsin, fport, cred, 1455 INPLOOKUP_WILDCARD); 1456 if (error) 1457 return (error); 1458 } 1459 *laddrp = 
laddr.s_addr; 1460 *lportp = lport; 1461 *faddrp = faddr.s_addr; 1462 *fportp = fport; 1463 return (0); 1464 } 1465 1466 void 1467 in_pcbdisconnect(struct inpcb *inp) 1468 { 1469 1470 INP_WLOCK_ASSERT(inp); 1471 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1472 1473 inp->inp_faddr.s_addr = INADDR_ANY; 1474 inp->inp_fport = 0; 1475 in_pcbrehash(inp); 1476 } 1477 #endif /* INET */ 1478 1479 /* 1480 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. 1481 * For most protocols, this will be invoked immediately prior to calling 1482 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the 1483 * socket, in which case in_pcbfree() is deferred. 1484 */ 1485 void 1486 in_pcbdetach(struct inpcb *inp) 1487 { 1488 1489 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1490 1491 #ifdef RATELIMIT 1492 if (inp->inp_snd_tag != NULL) 1493 in_pcbdetach_txrtlmt(inp); 1494 #endif 1495 inp->inp_socket->so_pcb = NULL; 1496 inp->inp_socket = NULL; 1497 } 1498 1499 /* 1500 * inpcb hash lookups are protected by SMR section. 1501 * 1502 * Once desired pcb has been found, switching from SMR section to a pcb 1503 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1504 * here because SMR is a critical section. 1505 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1506 */ 1507 static inline void 1508 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1509 { 1510 1511 lock == INPLOOKUP_RLOCKPCB ? 1512 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1513 } 1514 1515 static inline void 1516 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1517 { 1518 1519 lock == INPLOOKUP_RLOCKPCB ? 1520 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1521 } 1522 1523 static inline int 1524 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1525 { 1526 1527 return (lock == INPLOOKUP_RLOCKPCB ? 
1528 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1529 } 1530 1531 static inline bool 1532 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1533 { 1534 1535 return (lock == INPLOOKUP_RLOCKPCB ? 1536 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); 1537 } 1538 1539 bool 1540 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1541 { 1542 1543 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1544 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1545 1546 if (__predict_true(inp_trylock(inp, lock))) { 1547 if (__predict_false(inp->inp_flags & INP_FREED)) { 1548 smr_exit(inp->inp_pcbinfo->ipi_smr); 1549 inp_unlock(inp, lock); 1550 return (false); 1551 } 1552 smr_exit(inp->inp_pcbinfo->ipi_smr); 1553 return (true); 1554 } 1555 1556 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1557 smr_exit(inp->inp_pcbinfo->ipi_smr); 1558 inp_lock(inp, lock); 1559 if (__predict_false(in_pcbrele(inp, lock))) 1560 return (false); 1561 /* 1562 * inp acquired through refcount & lock for sure didn't went 1563 * through uma_zfree(). However, it may have already went 1564 * through in_pcbfree() and has another reference, that 1565 * prevented its release by our in_pcbrele(). 1566 */ 1567 if (__predict_false(inp->inp_flags & INP_FREED)) { 1568 inp_unlock(inp, lock); 1569 return (false); 1570 } 1571 return (true); 1572 } else { 1573 smr_exit(inp->inp_pcbinfo->ipi_smr); 1574 return (false); 1575 } 1576 } 1577 1578 /* 1579 * inp_next() - inpcb hash/list traversal iterator 1580 * 1581 * Requires initialized struct inpcb_iterator for context. 1582 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1583 * 1584 * - Iterator can have either write-lock or read-lock semantics, that can not 1585 * be changed later. 1586 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through 1587 * a single hash slot. Note: only rip_input() does the latter. 
* - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
/* First element of the all-pcbs list or of hash slot 'hash'. */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
/* Successor of 'inp' on whichever list is being traversed. */
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash))
/* Assert that 'inp' is locked in the mode the iterator was created with. */
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			/*
			 * inp_smr_lock() leaves the SMR section whether it
			 * succeeds or not.  On success we are done; on
			 * failure re-enter SMR and start over from the head.
			 */
			if (__predict_true(inp_smr_lock(inp, lock)))
				break;
			else {
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				/*
				 * NOTE(review): the loop increment runs after
				 * this assignment, so the new head itself is
				 * stepped over without being examined --
				 * confirm that this is intended.
				 */
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/*
		 * With no pcb found we are still inside SMR and must leave
		 * it; otherwise remember the locked pcb as the current one.
		 */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* The current pcb is still locked by us and anchors the traversal. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' stores NULL into the iterator. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		/* True means ours was the last reference and inp is gone. */
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous pcb and make the new one (or NULL) current. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	/* The caller must already hold a reference of its own. */
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 *
 * Returns false if other references remain; the pcb stays read-locked.
 * Returns true if this was the last reference; the pcb has then been
 * unlocked and passed to uma_zfree_smr() and must not be used again.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Write-locked counterpart of in_pcbrele_rlocked(): same return convention,
 * but the pcb is expected to be write-locked on entry.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.
* If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/* Mark the pcb freed, then take it off the all-pcbs list. */
	inp->inp_flags |= INP_FREED;
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
	/*
	 * The multicast option pointers are saved in locals here because
	 * they are released only after the reference drop below, which may
	 * free the pcb itself.
	 */
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	imo = inp->inp_moptions;
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
	} else
		im6o = NULL;
#endif

	/*
	 * in_pcbrele_wlocked() unlocks the pcb itself when it drops the last
	 * reference; if other references remain we must unlock it here.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	/* Destruction is finalized in inpcb_dtor(). */
}

/*
 * Final teardown step for an inpcb (see the note at the end of
 * in_pcbfree()): release the credential reference held in inp_cred.
 * NOTE(review): presumably installed as the zone destructor -- the
 * registration is not visible here, confirm.
 */
static void
inpcb_dtor(void *mem, int size, void *arg)
{
	struct inpcb *inp = mem;

	crfree(inp->inp_cred);
#ifdef INVARIANTS
	/* Clear the stale pointer in INVARIANTS kernels. */
	inp->inp_cred = NULL;
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbnotifyall() and in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
#ifdef INVARIANTS
	/* A pcb with both a socket and protocol state has more than one ref. */
	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
		MPASS(inp->inp_refcount > 1);
#endif

	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}

#ifdef INET
/*
 * Common routines to return the socket addresses associated with inpcbs.
1893 */ 1894 struct sockaddr * 1895 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1896 { 1897 struct sockaddr_in *sin; 1898 1899 sin = malloc(sizeof *sin, M_SONAME, 1900 M_WAITOK | M_ZERO); 1901 sin->sin_family = AF_INET; 1902 sin->sin_len = sizeof(*sin); 1903 sin->sin_addr = *addr_p; 1904 sin->sin_port = port; 1905 1906 return (struct sockaddr *)sin; 1907 } 1908 1909 int 1910 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1911 { 1912 struct inpcb *inp; 1913 struct in_addr addr; 1914 in_port_t port; 1915 1916 inp = sotoinpcb(so); 1917 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1918 1919 INP_RLOCK(inp); 1920 port = inp->inp_lport; 1921 addr = inp->inp_laddr; 1922 INP_RUNLOCK(inp); 1923 1924 *nam = in_sockaddr(port, &addr); 1925 return 0; 1926 } 1927 1928 int 1929 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 1930 { 1931 struct inpcb *inp; 1932 struct in_addr addr; 1933 in_port_t port; 1934 1935 inp = sotoinpcb(so); 1936 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1937 1938 INP_RLOCK(inp); 1939 port = inp->inp_fport; 1940 addr = inp->inp_faddr; 1941 INP_RUNLOCK(inp); 1942 1943 *nam = in_sockaddr(port, &addr); 1944 return 0; 1945 } 1946 1947 void 1948 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1949 struct inpcb *(*notify)(struct inpcb *, int)) 1950 { 1951 struct inpcb *inp, *inp_temp; 1952 1953 INP_INFO_WLOCK(pcbinfo); 1954 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 1955 INP_WLOCK(inp); 1956 #ifdef INET6 1957 if ((inp->inp_vflag & INP_IPV4) == 0) { 1958 INP_WUNLOCK(inp); 1959 continue; 1960 } 1961 #endif 1962 if (inp->inp_faddr.s_addr != faddr.s_addr || 1963 inp->inp_socket == NULL) { 1964 INP_WUNLOCK(inp); 1965 continue; 1966 } 1967 if ((*notify)(inp, errno)) 1968 INP_WUNLOCK(inp); 1969 } 1970 INP_INFO_WUNLOCK(pcbinfo); 1971 } 1972 1973 static bool 1974 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1975 { 1976 1977 if ((inp->inp_vflag & 
INP_IPV4) && inp->inp_moptions != NULL) 1978 return (true); 1979 else 1980 return (false); 1981 } 1982 1983 void 1984 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1985 { 1986 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1987 inp_v4_multi_match, NULL); 1988 struct inpcb *inp; 1989 struct in_multi *inm; 1990 struct in_mfilter *imf; 1991 struct ip_moptions *imo; 1992 1993 IN_MULTI_LOCK_ASSERT(); 1994 1995 while ((inp = inp_next(&inpi)) != NULL) { 1996 INP_WLOCK_ASSERT(inp); 1997 1998 imo = inp->inp_moptions; 1999 /* 2000 * Unselect the outgoing interface if it is being 2001 * detached. 2002 */ 2003 if (imo->imo_multicast_ifp == ifp) 2004 imo->imo_multicast_ifp = NULL; 2005 2006 /* 2007 * Drop multicast group membership if we joined 2008 * through the interface being detached. 2009 * 2010 * XXX This can all be deferred to an epoch_call 2011 */ 2012 restart: 2013 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2014 if ((inm = imf->imf_inm) == NULL) 2015 continue; 2016 if (inm->inm_ifp != ifp) 2017 continue; 2018 ip_mfilter_remove(&imo->imo_head, imf); 2019 in_leavegroup_locked(inm, NULL); 2020 ip_mfilter_free(imf); 2021 goto restart; 2022 } 2023 } 2024 } 2025 2026 /* 2027 * Lookup a PCB based on the local address and port. Caller must hold the 2028 * hash lock. No inpcb locks or references are acquired. 
2029 */ 2030 #define INP_LOOKUP_MAPPED_PCB_COST 3 2031 struct inpcb * 2032 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2033 u_short lport, int lookupflags, struct ucred *cred) 2034 { 2035 struct inpcb *inp; 2036 #ifdef INET6 2037 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2038 #else 2039 int matchwild = 3; 2040 #endif 2041 int wildcard; 2042 2043 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2044 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2045 INP_HASH_LOCK_ASSERT(pcbinfo); 2046 2047 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2048 struct inpcbhead *head; 2049 /* 2050 * Look for an unconnected (wildcard foreign addr) PCB that 2051 * matches the local address and port we're looking for. 2052 */ 2053 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2054 pcbinfo->ipi_hashmask)]; 2055 CK_LIST_FOREACH(inp, head, inp_hash) { 2056 #ifdef INET6 2057 /* XXX inp locking */ 2058 if ((inp->inp_vflag & INP_IPV4) == 0) 2059 continue; 2060 #endif 2061 if (inp->inp_faddr.s_addr == INADDR_ANY && 2062 inp->inp_laddr.s_addr == laddr.s_addr && 2063 inp->inp_lport == lport) { 2064 /* 2065 * Found? 2066 */ 2067 if (prison_equal_ip4(cred->cr_prison, 2068 inp->inp_cred->cr_prison)) 2069 return (inp); 2070 } 2071 } 2072 /* 2073 * Not found. 2074 */ 2075 return (NULL); 2076 } else { 2077 struct inpcbporthead *porthash; 2078 struct inpcbport *phd; 2079 struct inpcb *match = NULL; 2080 /* 2081 * Best fit PCB lookup. 2082 * 2083 * First see if this local port is in use by looking on the 2084 * port hash list. 2085 */ 2086 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2087 pcbinfo->ipi_porthashmask)]; 2088 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2089 if (phd->phd_port == lport) 2090 break; 2091 } 2092 if (phd != NULL) { 2093 /* 2094 * Port is in use by one or more PCBs. Look for best 2095 * fit. 
2096 */ 2097 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2098 wildcard = 0; 2099 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2100 cred->cr_prison)) 2101 continue; 2102 #ifdef INET6 2103 /* XXX inp locking */ 2104 if ((inp->inp_vflag & INP_IPV4) == 0) 2105 continue; 2106 /* 2107 * We never select the PCB that has 2108 * INP_IPV6 flag and is bound to :: if 2109 * we have another PCB which is bound 2110 * to 0.0.0.0. If a PCB has the 2111 * INP_IPV6 flag, then we set its cost 2112 * higher than IPv4 only PCBs. 2113 * 2114 * Note that the case only happens 2115 * when a socket is bound to ::, under 2116 * the condition that the use of the 2117 * mapped address is allowed. 2118 */ 2119 if ((inp->inp_vflag & INP_IPV6) != 0) 2120 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2121 #endif 2122 if (inp->inp_faddr.s_addr != INADDR_ANY) 2123 wildcard++; 2124 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2125 if (laddr.s_addr == INADDR_ANY) 2126 wildcard++; 2127 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2128 continue; 2129 } else { 2130 if (laddr.s_addr != INADDR_ANY) 2131 wildcard++; 2132 } 2133 if (wildcard < matchwild) { 2134 match = inp; 2135 matchwild = wildcard; 2136 if (matchwild == 0) 2137 break; 2138 } 2139 } 2140 } 2141 return (match); 2142 } 2143 } 2144 #undef INP_LOOKUP_MAPPED_PCB_COST 2145 2146 static bool 2147 in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain) 2148 { 2149 return (domain == M_NODOM || domain == grp->il_numa_domain); 2150 } 2151 2152 static struct inpcb * 2153 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2154 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 2155 uint16_t fport, int lookupflags, int domain) 2156 { 2157 const struct inpcblbgrouphead *hdr; 2158 struct inpcblbgroup *grp; 2159 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2160 2161 INP_HASH_LOCK_ASSERT(pcbinfo); 2162 2163 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2164 INP_PCBPORTHASH(lport, 
pcbinfo->ipi_lbgrouphashmask)]; 2165 2166 /* 2167 * Search for an LB group match based on the following criteria: 2168 * - prefer jailed groups to non-jailed groups 2169 * - prefer exact source address matches to wildcard matches 2170 * - prefer groups bound to the specified NUMA domain 2171 */ 2172 jail_exact = jail_wild = local_exact = local_wild = NULL; 2173 CK_LIST_FOREACH(grp, hdr, il_list) { 2174 bool injail; 2175 2176 #ifdef INET6 2177 if (!(grp->il_vflag & INP_IPV4)) 2178 continue; 2179 #endif 2180 if (grp->il_lport != lport) 2181 continue; 2182 2183 injail = prison_flag(grp->il_cred, PR_IP4) != 0; 2184 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2185 laddr) != 0) 2186 continue; 2187 2188 if (grp->il_laddr.s_addr == laddr->s_addr) { 2189 if (injail) { 2190 jail_exact = grp; 2191 if (in_pcblookup_lb_numa_match(grp, domain)) 2192 /* This is a perfect match. */ 2193 goto out; 2194 } else if (local_exact == NULL || 2195 in_pcblookup_lb_numa_match(grp, domain)) { 2196 local_exact = grp; 2197 } 2198 } else if (grp->il_laddr.s_addr == INADDR_ANY && 2199 (lookupflags & INPLOOKUP_WILDCARD) != 0) { 2200 if (injail) { 2201 if (jail_wild == NULL || 2202 in_pcblookup_lb_numa_match(grp, domain)) 2203 jail_wild = grp; 2204 } else if (local_wild == NULL || 2205 in_pcblookup_lb_numa_match(grp, domain)) { 2206 local_wild = grp; 2207 } 2208 } 2209 } 2210 2211 if (jail_exact != NULL) 2212 grp = jail_exact; 2213 else if (jail_wild != NULL) 2214 grp = jail_wild; 2215 else if (local_exact != NULL) 2216 grp = local_exact; 2217 else 2218 grp = local_wild; 2219 if (grp == NULL) 2220 return (NULL); 2221 out: 2222 return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % 2223 grp->il_inpcnt]); 2224 } 2225 2226 /* 2227 * Lookup PCB in hash list, using pcbinfo tables. 
 * This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 *
 * An exact 4-tuple match is tried first; among exact matches a PCB whose
 * credential has PR_IP4 set is returned immediately, otherwise the first
 * exact match is used.  If nothing matches exactly and INPLOOKUP_WILDCARD
 * is set, load-balance groups are consulted and then the wildcard hash
 * chain is scanned.  Returns NULL when no PCB matches.  The ifp argument
 * is not referenced in this function.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp, uint8_t numa_domain)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			/* Remember only the first non-jailed exact match. */
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * First see if an LB group matches the request before scanning
		 * all sockets on this port.
		 */
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags, numa_domain);
		if (inp != NULL)
			return (inp);

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4_locked(
				    inp->inp_cred->cr_prison, &laddr) != 0)
					continue;
			} else {
				/*
				 * Once a non-jailed exact match is held, only
				 * jailed PCBs can still improve the result.
				 */
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation enters
 * the pcbinfo SMR section for the lookup, and will return the inpcb locked
 * (i.e., requires INPLOOKUP_LOCKPCB).
2362 */ 2363 static struct inpcb * 2364 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2365 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2366 struct ifnet *ifp, uint8_t numa_domain) 2367 { 2368 struct inpcb *inp; 2369 2370 smr_enter(pcbinfo->ipi_smr); 2371 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2372 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2373 if (inp != NULL) { 2374 if (__predict_false(inp_smr_lock(inp, 2375 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2376 inp = NULL; 2377 } else 2378 smr_exit(pcbinfo->ipi_smr); 2379 2380 return (inp); 2381 } 2382 2383 /* 2384 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2385 * from which a pre-calculated hash value may be extracted. 2386 */ 2387 struct inpcb * 2388 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2389 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2390 { 2391 2392 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2393 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2394 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2395 ("%s: LOCKPCB not set", __func__)); 2396 2397 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2398 lookupflags, ifp, M_NODOM)); 2399 } 2400 2401 struct inpcb * 2402 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2403 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2404 struct ifnet *ifp, struct mbuf *m) 2405 { 2406 2407 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2408 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2409 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2410 ("%s: LOCKPCB not set", __func__)); 2411 2412 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2413 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2414 } 2415 #endif /* INET */ 2416 2417 /* 2418 * Insert PCB onto various hash lists. 
2419 */ 2420 int 2421 in_pcbinshash(struct inpcb *inp) 2422 { 2423 struct inpcbhead *pcbhash; 2424 struct inpcbporthead *pcbporthash; 2425 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2426 struct inpcbport *phd; 2427 2428 INP_WLOCK_ASSERT(inp); 2429 INP_HASH_WLOCK_ASSERT(pcbinfo); 2430 2431 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2432 ("in_pcbinshash: INP_INHASHLIST")); 2433 2434 #ifdef INET6 2435 if (inp->inp_vflag & INP_IPV6) 2436 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2437 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2438 else 2439 #endif 2440 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2441 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2442 2443 pcbporthash = &pcbinfo->ipi_porthashbase[ 2444 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2445 2446 /* 2447 * Add entry to load balance group. 2448 * Only do this if SO_REUSEPORT_LB is set. 2449 */ 2450 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) { 2451 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2452 if (error != 0) 2453 return (error); 2454 } 2455 2456 /* 2457 * Go through port list and look for a head for this lport. 2458 */ 2459 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2460 if (phd->phd_port == inp->inp_lport) 2461 break; 2462 } 2463 2464 /* 2465 * If none exists, malloc one and tack it on. 
2466 */ 2467 if (phd == NULL) { 2468 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2469 if (phd == NULL) { 2470 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2471 in_pcbremlbgrouphash(inp); 2472 return (ENOMEM); 2473 } 2474 phd->phd_port = inp->inp_lport; 2475 CK_LIST_INIT(&phd->phd_pcblist); 2476 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2477 } 2478 inp->inp_phd = phd; 2479 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2480 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2481 inp->inp_flags |= INP_INHASHLIST; 2482 2483 return (0); 2484 } 2485 2486 static void 2487 in_pcbremhash(struct inpcb *inp) 2488 { 2489 struct inpcbport *phd = inp->inp_phd; 2490 2491 INP_WLOCK_ASSERT(inp); 2492 MPASS(inp->inp_flags & INP_INHASHLIST); 2493 2494 INP_HASH_WLOCK(inp->inp_pcbinfo); 2495 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 2496 in_pcbremlbgrouphash(inp); 2497 CK_LIST_REMOVE(inp, inp_hash); 2498 CK_LIST_REMOVE(inp, inp_portlist); 2499 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2500 CK_LIST_REMOVE(phd, phd_hash); 2501 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2502 } 2503 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2504 inp->inp_flags &= ~INP_INHASHLIST; 2505 } 2506 2507 /* 2508 * Move PCB to the proper hash bucket when { faddr, fport } have been 2509 * changed. NOTE: This does not handle the case of the lport changing (the 2510 * hashed port list would have to be updated as well), so the lport must 2511 * not change after in_pcbinshash() has been called. 2512 * 2513 * XXXGL: a race between this function and SMR-protected hash iterator 2514 * will lead to iterator traversing a possibly wrong hash list. However, 2515 * this race should have been here since change from rwlock to epoch. 
2516 */ 2517 void 2518 in_pcbrehash(struct inpcb *inp) 2519 { 2520 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2521 struct inpcbhead *head; 2522 2523 INP_WLOCK_ASSERT(inp); 2524 INP_HASH_WLOCK_ASSERT(pcbinfo); 2525 2526 KASSERT(inp->inp_flags & INP_INHASHLIST, 2527 ("in_pcbrehash: !INP_INHASHLIST")); 2528 2529 #ifdef INET6 2530 if (inp->inp_vflag & INP_IPV6) 2531 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2532 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2533 else 2534 #endif 2535 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2536 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2537 2538 CK_LIST_REMOVE(inp, inp_hash); 2539 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2540 } 2541 2542 /* 2543 * Check for alternatives when higher level complains 2544 * about service problems. For now, invalidate cached 2545 * routing information. If the route was created dynamically 2546 * (by a redirect), time to try a default gateway again. 2547 */ 2548 void 2549 in_losing(struct inpcb *inp) 2550 { 2551 2552 RO_INVALIDATE_CACHE(&inp->inp_route); 2553 return; 2554 } 2555 2556 /* 2557 * A set label operation has occurred at the socket layer, propagate the 2558 * label change into the in_pcb for the socket. 
2559 */ 2560 void 2561 in_pcbsosetlabel(struct socket *so) 2562 { 2563 #ifdef MAC 2564 struct inpcb *inp; 2565 2566 inp = sotoinpcb(so); 2567 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2568 2569 INP_WLOCK(inp); 2570 SOCK_LOCK(so); 2571 mac_inpcb_sosetlabel(so, inp); 2572 SOCK_UNLOCK(so); 2573 INP_WUNLOCK(inp); 2574 #endif 2575 } 2576 2577 void 2578 inp_wlock(struct inpcb *inp) 2579 { 2580 2581 INP_WLOCK(inp); 2582 } 2583 2584 void 2585 inp_wunlock(struct inpcb *inp) 2586 { 2587 2588 INP_WUNLOCK(inp); 2589 } 2590 2591 void 2592 inp_rlock(struct inpcb *inp) 2593 { 2594 2595 INP_RLOCK(inp); 2596 } 2597 2598 void 2599 inp_runlock(struct inpcb *inp) 2600 { 2601 2602 INP_RUNLOCK(inp); 2603 } 2604 2605 #ifdef INVARIANT_SUPPORT 2606 void 2607 inp_lock_assert(struct inpcb *inp) 2608 { 2609 2610 INP_WLOCK_ASSERT(inp); 2611 } 2612 2613 void 2614 inp_unlock_assert(struct inpcb *inp) 2615 { 2616 2617 INP_UNLOCK_ASSERT(inp); 2618 } 2619 #endif 2620 2621 void 2622 inp_apply_all(struct inpcbinfo *pcbinfo, 2623 void (*func)(struct inpcb *, void *), void *arg) 2624 { 2625 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2626 INPLOOKUP_WLOCKPCB); 2627 struct inpcb *inp; 2628 2629 while ((inp = inp_next(&inpi)) != NULL) 2630 func(inp, arg); 2631 } 2632 2633 struct socket * 2634 inp_inpcbtosocket(struct inpcb *inp) 2635 { 2636 2637 INP_WLOCK_ASSERT(inp); 2638 return (inp->inp_socket); 2639 } 2640 2641 struct tcpcb * 2642 inp_inpcbtotcpcb(struct inpcb *inp) 2643 { 2644 2645 INP_WLOCK_ASSERT(inp); 2646 return ((struct tcpcb *)inp->inp_ppcb); 2647 } 2648 2649 int 2650 inp_ip_tos_get(const struct inpcb *inp) 2651 { 2652 2653 return (inp->inp_ip_tos); 2654 } 2655 2656 void 2657 inp_ip_tos_set(struct inpcb *inp, int val) 2658 { 2659 2660 inp->inp_ip_tos = val; 2661 } 2662 2663 void 2664 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2665 uint32_t *faddr, uint16_t *fp) 2666 { 2667 2668 INP_LOCK_ASSERT(inp); 2669 *laddr = inp->inp_laddr.s_addr; 2670 
*faddr = inp->inp_faddr.s_addr; 2671 *lp = inp->inp_lport; 2672 *fp = inp->inp_fport; 2673 } 2674 2675 struct inpcb * 2676 so_sotoinpcb(struct socket *so) 2677 { 2678 2679 return (sotoinpcb(so)); 2680 } 2681 2682 /* 2683 * Create an external-format (``xinpcb'') structure using the information in 2684 * the kernel-format in_pcb structure pointed to by inp. This is done to 2685 * reduce the spew of irrelevant information over this interface, to isolate 2686 * user code from changes in the kernel structure, and potentially to provide 2687 * information-hiding if we decide that some of this information should be 2688 * hidden from users. 2689 */ 2690 void 2691 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2692 { 2693 2694 bzero(xi, sizeof(*xi)); 2695 xi->xi_len = sizeof(struct xinpcb); 2696 if (inp->inp_socket) 2697 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2698 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2699 xi->inp_gencnt = inp->inp_gencnt; 2700 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2701 xi->inp_flow = inp->inp_flow; 2702 xi->inp_flowid = inp->inp_flowid; 2703 xi->inp_flowtype = inp->inp_flowtype; 2704 xi->inp_flags = inp->inp_flags; 2705 xi->inp_flags2 = inp->inp_flags2; 2706 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2707 xi->in6p_cksum = inp->in6p_cksum; 2708 xi->in6p_hops = inp->in6p_hops; 2709 xi->inp_ip_tos = inp->inp_ip_tos; 2710 xi->inp_vflag = inp->inp_vflag; 2711 xi->inp_ip_ttl = inp->inp_ip_ttl; 2712 xi->inp_ip_p = inp->inp_ip_p; 2713 xi->inp_ip_minttl = inp->inp_ip_minttl; 2714 } 2715 2716 int 2717 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2718 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2719 { 2720 struct sockopt sopt; 2721 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2722 INPLOOKUP_WLOCKPCB); 2723 struct inpcb *inp; 2724 struct sockopt_parameters *params; 2725 struct socket *so; 2726 int error; 2727 char buf[1024]; 2728 2729 if (req->oldptr != NULL || 
req->oldlen != 0) 2730 return (EINVAL); 2731 if (req->newptr == NULL) 2732 return (EPERM); 2733 if (req->newlen > sizeof(buf)) 2734 return (ENOMEM); 2735 error = SYSCTL_IN(req, buf, req->newlen); 2736 if (error != 0) 2737 return (error); 2738 if (req->newlen < sizeof(struct sockopt_parameters)) 2739 return (EINVAL); 2740 params = (struct sockopt_parameters *)buf; 2741 sopt.sopt_level = params->sop_level; 2742 sopt.sopt_name = params->sop_optname; 2743 sopt.sopt_dir = SOPT_SET; 2744 sopt.sopt_val = params->sop_optval; 2745 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2746 sopt.sopt_td = NULL; 2747 #ifdef INET6 2748 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2749 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2750 params->sop_inc.inc6_laddr.s6_addr16[1] = 2751 htons(params->sop_inc.inc6_zoneid & 0xffff); 2752 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2753 params->sop_inc.inc6_faddr.s6_addr16[1] = 2754 htons(params->sop_inc.inc6_zoneid & 0xffff); 2755 } 2756 #endif 2757 if (params->sop_inc.inc_lport != htons(0)) { 2758 if (params->sop_inc.inc_fport == htons(0)) 2759 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, 2760 pcbinfo->ipi_hashmask); 2761 else 2762 #ifdef INET6 2763 if (params->sop_inc.inc_flags & INC_ISIPV6) 2764 inpi.hash = INP6_PCBHASH( 2765 ¶ms->sop_inc.inc6_faddr, 2766 params->sop_inc.inc_lport, 2767 params->sop_inc.inc_fport, 2768 pcbinfo->ipi_hashmask); 2769 else 2770 #endif 2771 inpi.hash = INP_PCBHASH( 2772 ¶ms->sop_inc.inc_faddr, 2773 params->sop_inc.inc_lport, 2774 params->sop_inc.inc_fport, 2775 pcbinfo->ipi_hashmask); 2776 } 2777 while ((inp = inp_next(&inpi)) != NULL) 2778 if (inp->inp_gencnt == params->sop_id) { 2779 if (inp->inp_flags & INP_DROPPED) { 2780 INP_WUNLOCK(inp); 2781 return (ECONNRESET); 2782 } 2783 so = inp->inp_socket; 2784 KASSERT(so != NULL, ("inp_socket == NULL")); 2785 soref(so); 2786 error = (*ctloutput_set)(inp, &sopt); 2787 sorele(so); 2788 break; 2789 } 2790 if (inp == 
NULL) 2791 error = ESRCH; 2792 return (error); 2793 } 2794 2795 #ifdef DDB 2796 static void 2797 db_print_indent(int indent) 2798 { 2799 int i; 2800 2801 for (i = 0; i < indent; i++) 2802 db_printf(" "); 2803 } 2804 2805 static void 2806 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2807 { 2808 char faddr_str[48], laddr_str[48]; 2809 2810 db_print_indent(indent); 2811 db_printf("%s at %p\n", name, inc); 2812 2813 indent += 2; 2814 2815 #ifdef INET6 2816 if (inc->inc_flags & INC_ISIPV6) { 2817 /* IPv6. */ 2818 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2819 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2820 } else 2821 #endif 2822 { 2823 /* IPv4. */ 2824 inet_ntoa_r(inc->inc_laddr, laddr_str); 2825 inet_ntoa_r(inc->inc_faddr, faddr_str); 2826 } 2827 db_print_indent(indent); 2828 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2829 ntohs(inc->inc_lport)); 2830 db_print_indent(indent); 2831 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2832 ntohs(inc->inc_fport)); 2833 } 2834 2835 static void 2836 db_print_inpflags(int inp_flags) 2837 { 2838 int comma; 2839 2840 comma = 0; 2841 if (inp_flags & INP_RECVOPTS) { 2842 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2843 comma = 1; 2844 } 2845 if (inp_flags & INP_RECVRETOPTS) { 2846 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2847 comma = 1; 2848 } 2849 if (inp_flags & INP_RECVDSTADDR) { 2850 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2851 comma = 1; 2852 } 2853 if (inp_flags & INP_ORIGDSTADDR) { 2854 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2855 comma = 1; 2856 } 2857 if (inp_flags & INP_HDRINCL) { 2858 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2859 comma = 1; 2860 } 2861 if (inp_flags & INP_HIGHPORT) { 2862 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2863 comma = 1; 2864 } 2865 if (inp_flags & INP_LOWPORT) { 2866 db_printf("%sINP_LOWPORT", comma ? 
", " : ""); 2867 comma = 1; 2868 } 2869 if (inp_flags & INP_ANONPORT) { 2870 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2871 comma = 1; 2872 } 2873 if (inp_flags & INP_RECVIF) { 2874 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2875 comma = 1; 2876 } 2877 if (inp_flags & INP_MTUDISC) { 2878 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2879 comma = 1; 2880 } 2881 if (inp_flags & INP_RECVTTL) { 2882 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2883 comma = 1; 2884 } 2885 if (inp_flags & INP_DONTFRAG) { 2886 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2887 comma = 1; 2888 } 2889 if (inp_flags & INP_RECVTOS) { 2890 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2891 comma = 1; 2892 } 2893 if (inp_flags & IN6P_IPV6_V6ONLY) { 2894 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2895 comma = 1; 2896 } 2897 if (inp_flags & IN6P_PKTINFO) { 2898 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2899 comma = 1; 2900 } 2901 if (inp_flags & IN6P_HOPLIMIT) { 2902 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2903 comma = 1; 2904 } 2905 if (inp_flags & IN6P_HOPOPTS) { 2906 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2907 comma = 1; 2908 } 2909 if (inp_flags & IN6P_DSTOPTS) { 2910 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2911 comma = 1; 2912 } 2913 if (inp_flags & IN6P_RTHDR) { 2914 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2915 comma = 1; 2916 } 2917 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2918 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2919 comma = 1; 2920 } 2921 if (inp_flags & IN6P_TCLASS) { 2922 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2923 comma = 1; 2924 } 2925 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2926 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2927 comma = 1; 2928 } 2929 if (inp_flags & INP_ONESBCAST) { 2930 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2931 comma = 1; 2932 } 2933 if (inp_flags & INP_DROPPED) { 2934 db_printf("%sINP_DROPPED", comma ? 
", " : ""); 2935 comma = 1; 2936 } 2937 if (inp_flags & INP_SOCKREF) { 2938 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2939 comma = 1; 2940 } 2941 if (inp_flags & IN6P_RFC2292) { 2942 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 2943 comma = 1; 2944 } 2945 if (inp_flags & IN6P_MTU) { 2946 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2947 comma = 1; 2948 } 2949 } 2950 2951 static void 2952 db_print_inpvflag(u_char inp_vflag) 2953 { 2954 int comma; 2955 2956 comma = 0; 2957 if (inp_vflag & INP_IPV4) { 2958 db_printf("%sINP_IPV4", comma ? ", " : ""); 2959 comma = 1; 2960 } 2961 if (inp_vflag & INP_IPV6) { 2962 db_printf("%sINP_IPV6", comma ? ", " : ""); 2963 comma = 1; 2964 } 2965 if (inp_vflag & INP_IPV6PROTO) { 2966 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 2967 comma = 1; 2968 } 2969 } 2970 2971 static void 2972 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2973 { 2974 2975 db_print_indent(indent); 2976 db_printf("%s at %p\n", name, inp); 2977 2978 indent += 2; 2979 2980 db_print_indent(indent); 2981 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2982 2983 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2984 2985 db_print_indent(indent); 2986 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2987 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 2988 2989 db_print_indent(indent); 2990 db_printf("inp_label: %p inp_flags: 0x%x (", 2991 inp->inp_label, inp->inp_flags); 2992 db_print_inpflags(inp->inp_flags); 2993 db_printf(")\n"); 2994 2995 db_print_indent(indent); 2996 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 2997 inp->inp_vflag); 2998 db_print_inpvflag(inp->inp_vflag); 2999 db_printf(")\n"); 3000 3001 db_print_indent(indent); 3002 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3003 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3004 3005 db_print_indent(indent); 3006 #ifdef INET6 3007 if (inp->inp_vflag & INP_IPV6) { 3008 db_printf("in6p_options: %p in6p_outputopts: %p " 3009 
"in6p_moptions: %p\n", inp->in6p_options, 3010 inp->in6p_outputopts, inp->in6p_moptions); 3011 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3012 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3013 inp->in6p_hops); 3014 } else 3015 #endif 3016 { 3017 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3018 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3019 inp->inp_options, inp->inp_moptions); 3020 } 3021 3022 db_print_indent(indent); 3023 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3024 (uintmax_t)inp->inp_gencnt); 3025 } 3026 3027 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3028 { 3029 struct inpcb *inp; 3030 3031 if (!have_addr) { 3032 db_printf("usage: show inpcb <addr>\n"); 3033 return; 3034 } 3035 inp = (struct inpcb *)addr; 3036 3037 db_print_inpcb(inp, "inpcb", 0); 3038 } 3039 #endif /* DDB */ 3040 3041 #ifdef RATELIMIT 3042 /* 3043 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3044 * if any. 3045 */ 3046 int 3047 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3048 { 3049 union if_snd_tag_modify_params params = { 3050 .rate_limit.max_rate = max_pacing_rate, 3051 .rate_limit.flags = M_NOWAIT, 3052 }; 3053 struct m_snd_tag *mst; 3054 int error; 3055 3056 mst = inp->inp_snd_tag; 3057 if (mst == NULL) 3058 return (EINVAL); 3059 3060 if (mst->sw->snd_tag_modify == NULL) { 3061 error = EOPNOTSUPP; 3062 } else { 3063 error = mst->sw->snd_tag_modify(mst, ¶ms); 3064 } 3065 return (error); 3066 } 3067 3068 /* 3069 * Query existing TX rate limit based on the existing 3070 * "inp->inp_snd_tag", if any. 
3071 */ 3072 int 3073 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3074 { 3075 union if_snd_tag_query_params params = { }; 3076 struct m_snd_tag *mst; 3077 int error; 3078 3079 mst = inp->inp_snd_tag; 3080 if (mst == NULL) 3081 return (EINVAL); 3082 3083 if (mst->sw->snd_tag_query == NULL) { 3084 error = EOPNOTSUPP; 3085 } else { 3086 error = mst->sw->snd_tag_query(mst, ¶ms); 3087 if (error == 0 && p_max_pacing_rate != NULL) 3088 *p_max_pacing_rate = params.rate_limit.max_rate; 3089 } 3090 return (error); 3091 } 3092 3093 /* 3094 * Query existing TX queue level based on the existing 3095 * "inp->inp_snd_tag", if any. 3096 */ 3097 int 3098 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3099 { 3100 union if_snd_tag_query_params params = { }; 3101 struct m_snd_tag *mst; 3102 int error; 3103 3104 mst = inp->inp_snd_tag; 3105 if (mst == NULL) 3106 return (EINVAL); 3107 3108 if (mst->sw->snd_tag_query == NULL) 3109 return (EOPNOTSUPP); 3110 3111 error = mst->sw->snd_tag_query(mst, ¶ms); 3112 if (error == 0 && p_txqueue_level != NULL) 3113 *p_txqueue_level = params.rate_limit.queue_level; 3114 return (error); 3115 } 3116 3117 /* 3118 * Allocate a new TX rate limit send tag from the network interface 3119 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3120 */ 3121 int 3122 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3123 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3124 3125 { 3126 union if_snd_tag_alloc_params params = { 3127 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 
3128 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3129 .rate_limit.hdr.flowid = flowid, 3130 .rate_limit.hdr.flowtype = flowtype, 3131 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3132 .rate_limit.max_rate = max_pacing_rate, 3133 .rate_limit.flags = M_NOWAIT, 3134 }; 3135 int error; 3136 3137 INP_WLOCK_ASSERT(inp); 3138 3139 /* 3140 * If there is already a send tag, or the INP is being torn 3141 * down, allocating a new send tag is not allowed. Else send 3142 * tags may leak. 3143 */ 3144 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3145 return (EINVAL); 3146 3147 error = m_snd_tag_alloc(ifp, ¶ms, st); 3148 #ifdef INET 3149 if (error == 0) { 3150 counter_u64_add(rate_limit_set_ok, 1); 3151 counter_u64_add(rate_limit_active, 1); 3152 } else if (error != EOPNOTSUPP) 3153 counter_u64_add(rate_limit_alloc_fail, 1); 3154 #endif 3155 return (error); 3156 } 3157 3158 void 3159 in_pcbdetach_tag(struct m_snd_tag *mst) 3160 { 3161 3162 m_snd_tag_rele(mst); 3163 #ifdef INET 3164 counter_u64_add(rate_limit_active, -1); 3165 #endif 3166 } 3167 3168 /* 3169 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3170 * if any: 3171 */ 3172 void 3173 in_pcbdetach_txrtlmt(struct inpcb *inp) 3174 { 3175 struct m_snd_tag *mst; 3176 3177 INP_WLOCK_ASSERT(inp); 3178 3179 mst = inp->inp_snd_tag; 3180 inp->inp_snd_tag = NULL; 3181 3182 if (mst == NULL) 3183 return; 3184 3185 m_snd_tag_rele(mst); 3186 #ifdef INET 3187 counter_u64_add(rate_limit_active, -1); 3188 #endif 3189 } 3190 3191 int 3192 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3193 { 3194 int error; 3195 3196 /* 3197 * If the existing send tag is for the wrong interface due to 3198 * a route change, first drop the existing tag. Set the 3199 * CHANGED flag so that we will keep trying to allocate a new 3200 * tag if we fail to allocate one this time. 
3201 */ 3202 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3203 in_pcbdetach_txrtlmt(inp); 3204 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3205 } 3206 3207 /* 3208 * NOTE: When attaching to a network interface a reference is 3209 * made to ensure the network interface doesn't go away until 3210 * all ratelimit connections are gone. The network interface 3211 * pointers compared below represent valid network interfaces, 3212 * except when comparing towards NULL. 3213 */ 3214 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3215 error = 0; 3216 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3217 if (inp->inp_snd_tag != NULL) 3218 in_pcbdetach_txrtlmt(inp); 3219 error = 0; 3220 } else if (inp->inp_snd_tag == NULL) { 3221 /* 3222 * In order to utilize packet pacing with RSS, we need 3223 * to wait until there is a valid RSS hash before we 3224 * can proceed: 3225 */ 3226 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3227 error = EAGAIN; 3228 } else { 3229 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3230 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3231 } 3232 } else { 3233 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3234 } 3235 if (error == 0 || error == EOPNOTSUPP) 3236 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3237 3238 return (error); 3239 } 3240 3241 /* 3242 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3243 * is set in the fast path and will attach/detach/modify the TX rate 3244 * limit send tag based on the socket's so_max_pacing_rate value. 
3245 */ 3246 void 3247 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3248 { 3249 struct socket *socket; 3250 uint32_t max_pacing_rate; 3251 bool did_upgrade; 3252 3253 if (inp == NULL) 3254 return; 3255 3256 socket = inp->inp_socket; 3257 if (socket == NULL) 3258 return; 3259 3260 if (!INP_WLOCKED(inp)) { 3261 /* 3262 * NOTE: If the write locking fails, we need to bail 3263 * out and use the non-ratelimited ring for the 3264 * transmit until there is a new chance to get the 3265 * write lock. 3266 */ 3267 if (!INP_TRY_UPGRADE(inp)) 3268 return; 3269 did_upgrade = 1; 3270 } else { 3271 did_upgrade = 0; 3272 } 3273 3274 /* 3275 * NOTE: The so_max_pacing_rate value is read unlocked, 3276 * because atomic updates are not required since the variable 3277 * is checked at every mbuf we send. It is assumed that the 3278 * variable read itself will be atomic. 3279 */ 3280 max_pacing_rate = socket->so_max_pacing_rate; 3281 3282 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3283 3284 if (did_upgrade) 3285 INP_DOWNGRADE(inp); 3286 } 3287 3288 /* 3289 * Track route changes for TX rate limiting. 3290 */ 3291 void 3292 in_pcboutput_eagain(struct inpcb *inp) 3293 { 3294 bool did_upgrade; 3295 3296 if (inp == NULL) 3297 return; 3298 3299 if (inp->inp_snd_tag == NULL) 3300 return; 3301 3302 if (!INP_WLOCKED(inp)) { 3303 /* 3304 * NOTE: If the write locking fails, we need to bail 3305 * out and use the non-ratelimited ring for the 3306 * transmit until there is a new chance to get the 3307 * write lock. 
3308 */ 3309 if (!INP_TRY_UPGRADE(inp)) 3310 return; 3311 did_upgrade = 1; 3312 } else { 3313 did_upgrade = 0; 3314 } 3315 3316 /* detach rate limiting */ 3317 in_pcbdetach_txrtlmt(inp); 3318 3319 /* make sure new mbuf send tag allocation is made */ 3320 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3321 3322 if (did_upgrade) 3323 INP_DOWNGRADE(inp); 3324 } 3325 3326 #ifdef INET 3327 static void 3328 rl_init(void *st) 3329 { 3330 rate_limit_new = counter_u64_alloc(M_WAITOK); 3331 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3332 rate_limit_active = counter_u64_alloc(M_WAITOK); 3333 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3334 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3335 } 3336 3337 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3338 #endif 3339 #endif /* RATELIMIT */ 3340