1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/hash.h> 53 #include <sys/systm.h> 54 #include <sys/libkern.h> 55 #include <sys/lock.h> 56 #include <sys/malloc.h> 57 #include <sys/mbuf.h> 58 #include <sys/eventhandler.h> 59 #include <sys/domain.h> 60 #include <sys/protosw.h> 61 #include <sys/smp.h> 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <sys/sockio.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/refcount.h> 68 #include <sys/jail.h> 69 #include <sys/kernel.h> 70 #include <sys/sysctl.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 #include <vm/vm.h> 78 79 #include <net/if.h> 80 #include <net/if_var.h> 81 #include <net/if_types.h> 82 #include <net/if_llatbl.h> 83 #include <net/route.h> 84 #include <net/rss_config.h> 85 #include <net/vnet.h> 86 87 #if defined(INET) || defined(INET6) 88 #include <netinet/in.h> 89 #include <netinet/in_pcb.h> 90 #include <netinet/in_pcb_var.h> 91 #ifdef INET 92 #include <netinet/in_var.h> 93 #include <netinet/in_fib.h> 94 #endif 95 #include <netinet/ip_var.h> 96 #include <netinet/tcp_var.h> 97 #ifdef TCPHPTS 98 #include <netinet/tcp_hpts.h> 99 #endif 100 #include <netinet/udp.h> 101 #include <netinet/udp_var.h> 102 #ifdef INET6 103 #include <netinet/ip6.h> 104 #include <netinet6/in6_pcb.h> 105 #include <netinet6/in6_var.h> 106 #include <netinet6/ip6_var.h> 107 #endif /* INET6 */ 108 #include <net/route/nhop.h> 109 #endif 110 111 #include <netipsec/ipsec_support.h> 112 113 #include <security/mac/mac_framework.h> 114 115 #define INPCBLBGROUP_SIZMIN 8 116 #define INPCBLBGROUP_SIZMAX 256 117 #define INP_FREED 0x00000200 /* See in_pcb.h. 
*/ 118 119 /* 120 * These configure the range of local port addresses assigned to 121 * "unspecified" outgoing connections/packets/whatever. 122 */ 123 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 124 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 125 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 126 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 127 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 128 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 129 130 /* 131 * Reserved ports accessible only to root. There are significant 132 * security considerations that must be accounted for when changing these, 133 * but the security benefits can be great. Please be careful. 134 */ 135 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 136 VNET_DEFINE(int, ipport_reservedlow); 137 138 /* Enable random ephemeral port allocation by default. 
*/ 139 VNET_DEFINE(int, ipport_randomized) = 1; 140 141 #ifdef INET 142 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 143 struct in_addr faddr, u_int fport_arg, 144 struct in_addr laddr, u_int lport_arg, 145 int lookupflags, struct ifnet *ifp, 146 uint8_t numa_domain); 147 148 #define RANGECHK(var, min, max) \ 149 if ((var) < (min)) { (var) = (min); } \ 150 else if ((var) > (max)) { (var) = (max); } 151 152 static int 153 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 154 { 155 int error; 156 157 error = sysctl_handle_int(oidp, arg1, arg2, req); 158 if (error == 0) { 159 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 160 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 161 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 162 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 163 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 164 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 165 } 166 return (error); 167 } 168 169 #undef RANGECHK 170 171 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 172 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 173 "IP Ports"); 174 175 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 176 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 177 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 178 ""); 179 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 180 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 181 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 182 ""); 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 SYSCTL_PROC(_net_inet_ip_portrange, 
OID_AUTO, hifirst, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 196 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 197 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 198 ""); 199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 200 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 201 &VNET_NAME(ipport_reservedhigh), 0, ""); 202 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 203 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 204 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 205 CTLFLAG_VNET | CTLFLAG_RW, 206 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 207 208 #ifdef RATELIMIT 209 counter_u64_t rate_limit_new; 210 counter_u64_t rate_limit_chg; 211 counter_u64_t rate_limit_active; 212 counter_u64_t rate_limit_alloc_fail; 213 counter_u64_t rate_limit_set_ok; 214 215 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 216 "IP Rate Limiting"); 217 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 218 &rate_limit_active, "Active rate limited connections"); 219 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 220 &rate_limit_alloc_fail, "Rate limited connection failures"); 221 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 222 &rate_limit_set_ok, "Rate limited setting succeeded"); 223 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 224 &rate_limit_new, "Total Rate limit new attempts"); 225 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 226 &rate_limit_chg, "Total Rate limited change attempts"); 227 228 #endif /* RATELIMIT */ 229 230 #endif /* INET */ 231 232 VNET_DEFINE(uint32_t, in_pcbhashseed); 233 static void 234 in_pcbhashseed_init(void) 235 { 236 237 V_in_pcbhashseed = arc4random(); 238 } 239 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, 0);

static void in_pcbremhash(struct inpcb *);

/*
 * in_pcb.c: manage the Protocol Control Blocks.
 *
 * NOTE: It is assumed that most of these functions will be called with
 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
 * functions often modify hash chains or addresses in pcbs.
 */

/*
 * Allocate a load balancing group with 'size' inpcb slots for the given
 * vflag/port/local-address tuple and link it onto the hash chain 'hdr'.
 * Returns NULL on allocation failure (M_NOWAIT).
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
    uint16_t port, const union in_dependaddr *addr, int size,
    uint8_t numa_domain)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	/* Trailing il_inp[] array is sized by 'size'. */
	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (!grp)
		return (NULL);
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	return (grp);
}

/* Deferred free of a group, run after a network epoch has elapsed. */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	free(grp, M_PCB);
}

/*
 * Unlink a group from its hash chain and schedule its memory for
 * epoch-deferred release, so concurrent lockless readers stay safe.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}

/*
 * Replace 'old_grp' with a new group of 'size' slots, copying the current
 * membership over.  Returns the new group, or NULL (old group untouched)
 * if allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	in_pcblbgroup_free(old_grp);
	return (grp);
}

/*
 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
 * and shrink group if possible.
 */
static void
in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
    int i)
{
	struct inpcblbgroup *grp, *new_grp;

	grp = *grpp;
	for (; i + 1 < grp->il_inpcnt; ++i)
		grp->il_inp[i] = grp->il_inp[i + 1];
	grp->il_inpcnt--;

	/* Halve the array once it is no more than a quarter full. */
	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
		/* Shrink this group. */
		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
		if (new_grp != NULL)
			*grpp = new_grp;
	}
}

/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	/* Rate-limit the "limit reached" console message to once a minute. */
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/*
	 * Don't allow jailed socket to join local group.
	 */
	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
		return (0);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	/* Find an existing group matching vflag, port, NUMA and laddr. */
	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0)
			break;
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain);
		if (grp == NULL)
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			/* NB: not an error; the PCB simply isn't added. */
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));

	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	return (0);
}

/*
 * Remove PCB from load balance group.
416 */ 417 static void 418 in_pcbremlbgrouphash(struct inpcb *inp) 419 { 420 struct inpcbinfo *pcbinfo; 421 struct inpcblbgrouphead *hdr; 422 struct inpcblbgroup *grp; 423 int i; 424 425 pcbinfo = inp->inp_pcbinfo; 426 427 INP_WLOCK_ASSERT(inp); 428 INP_HASH_WLOCK_ASSERT(pcbinfo); 429 430 hdr = &pcbinfo->ipi_lbgrouphashbase[ 431 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 432 CK_LIST_FOREACH(grp, hdr, il_list) { 433 for (i = 0; i < grp->il_inpcnt; ++i) { 434 if (grp->il_inp[i] != inp) 435 continue; 436 437 if (grp->il_inpcnt == 1) { 438 /* We are the last, free this local group. */ 439 in_pcblbgroup_free(grp); 440 } else { 441 /* Pull up inpcbs, shrink group if possible. */ 442 in_pcblbgroup_reorder(hdr, &grp, i); 443 } 444 return; 445 } 446 } 447 } 448 449 int 450 in_pcblbgroup_numa(struct inpcb *inp, int arg) 451 { 452 struct inpcbinfo *pcbinfo; 453 struct inpcblbgrouphead *hdr; 454 struct inpcblbgroup *grp; 455 int err, i; 456 uint8_t numa_domain; 457 458 switch (arg) { 459 case TCP_REUSPORT_LB_NUMA_NODOM: 460 numa_domain = M_NODOM; 461 break; 462 case TCP_REUSPORT_LB_NUMA_CURDOM: 463 numa_domain = PCPU_GET(domain); 464 break; 465 default: 466 if (arg < 0 || arg >= vm_ndomains) 467 return (EINVAL); 468 numa_domain = arg; 469 } 470 471 err = 0; 472 pcbinfo = inp->inp_pcbinfo; 473 INP_WLOCK_ASSERT(inp); 474 INP_HASH_WLOCK(pcbinfo); 475 hdr = &pcbinfo->ipi_lbgrouphashbase[ 476 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 477 CK_LIST_FOREACH(grp, hdr, il_list) { 478 for (i = 0; i < grp->il_inpcnt; ++i) { 479 if (grp->il_inp[i] != inp) 480 continue; 481 482 if (grp->il_numa_domain == numa_domain) { 483 goto abort_with_hash_wlock; 484 } 485 486 /* Remove it from the old group. */ 487 in_pcbremlbgrouphash(inp); 488 489 /* Add it to the new group based on numa domain. 
*/ 490 in_pcbinslbgrouphash(inp, numa_domain); 491 goto abort_with_hash_wlock; 492 } 493 } 494 err = ENOENT; 495 abort_with_hash_wlock: 496 INP_HASH_WUNLOCK(pcbinfo); 497 return (err); 498 } 499 500 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 501 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 502 503 /* 504 * Initialize an inpcbinfo - a per-VNET instance of connections db. 505 */ 506 void 507 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 508 u_int hash_nelements, u_int porthash_nelements) 509 { 510 511 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 512 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 513 NULL, MTX_DEF); 514 #ifdef VIMAGE 515 pcbinfo->ipi_vnet = curvnet; 516 #endif 517 CK_LIST_INIT(&pcbinfo->ipi_listhead); 518 pcbinfo->ipi_count = 0; 519 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 520 &pcbinfo->ipi_hashmask); 521 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 522 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 523 &pcbinfo->ipi_porthashmask); 524 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 525 &pcbinfo->ipi_lbgrouphashmask); 526 pcbinfo->ipi_zone = pcbstor->ips_zone; 527 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 528 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 529 } 530 531 /* 532 * Destroy an inpcbinfo. 
533 */ 534 void 535 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 536 { 537 538 KASSERT(pcbinfo->ipi_count == 0, 539 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 540 541 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 542 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 543 pcbinfo->ipi_porthashmask); 544 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 545 pcbinfo->ipi_lbgrouphashmask); 546 mtx_destroy(&pcbinfo->ipi_hash_lock); 547 mtx_destroy(&pcbinfo->ipi_lock); 548 } 549 550 /* 551 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 552 */ 553 static void inpcb_dtor(void *, int, void *); 554 static void inpcb_fini(void *, int); 555 void 556 in_pcbstorage_init(void *arg) 557 { 558 struct inpcbstorage *pcbstor = arg; 559 560 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 561 sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit, 562 inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); 563 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 564 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 565 uma_zone_set_smr(pcbstor->ips_portzone, 566 uma_zone_get_smr(pcbstor->ips_zone)); 567 } 568 569 /* 570 * Destroy a pcbstorage - used by unloadable protocols. 571 */ 572 void 573 in_pcbstorage_destroy(void *arg) 574 { 575 struct inpcbstorage *pcbstor = arg; 576 577 uma_zdestroy(pcbstor->ips_zone); 578 uma_zdestroy(pcbstor->ips_portzone); 579 } 580 581 /* 582 * Allocate a PCB and associate it with the socket. 583 * On success return with the PCB locked. 
584 */ 585 int 586 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 587 { 588 struct inpcb *inp; 589 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 590 int error; 591 #endif 592 593 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 594 if (inp == NULL) 595 return (ENOBUFS); 596 bzero(&inp->inp_start_zero, inp_zero_size); 597 #ifdef NUMA 598 inp->inp_numa_domain = M_NODOM; 599 #endif 600 inp->inp_pcbinfo = pcbinfo; 601 inp->inp_socket = so; 602 inp->inp_cred = crhold(so->so_cred); 603 inp->inp_inc.inc_fibnum = so->so_fibnum; 604 #ifdef MAC 605 error = mac_inpcb_init(inp, M_NOWAIT); 606 if (error != 0) 607 goto out; 608 mac_inpcb_create(so, inp); 609 #endif 610 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 611 error = ipsec_init_pcbpolicy(inp); 612 if (error != 0) { 613 #ifdef MAC 614 mac_inpcb_destroy(inp); 615 #endif 616 goto out; 617 } 618 #endif /*IPSEC*/ 619 #ifdef INET6 620 if (INP_SOCKAF(so) == AF_INET6) { 621 inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6; 622 if (V_ip6_v6only) 623 inp->inp_flags |= IN6P_IPV6_V6ONLY; 624 #ifdef INET 625 else 626 inp->inp_vflag |= INP_IPV4; 627 #endif 628 if (V_ip6_auto_flowlabel) 629 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 630 inp->in6p_hops = -1; /* use kernel default */ 631 } 632 #endif 633 #if defined(INET) && defined(INET6) 634 else 635 #endif 636 #ifdef INET 637 inp->inp_vflag |= INP_IPV4; 638 #endif 639 /* 640 * Routes in inpcb's can cache L2 as well; they are guaranteed 641 * to be cleaned up. 642 */ 643 inp->inp_route.ro_flags = RT_LLE_CACHE; 644 #ifdef TCPHPTS 645 /* 646 * If using hpts lets drop a random number in so 647 * not all new connections fall on the same CPU. 648 */ 649 inp->inp_hpts_cpu = hpts_random_cpu(inp); 650 #endif 651 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. 
*/ 652 INP_WLOCK(inp); 653 INP_INFO_WLOCK(pcbinfo); 654 pcbinfo->ipi_count++; 655 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 656 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 657 INP_INFO_WUNLOCK(pcbinfo); 658 so->so_pcb = inp; 659 660 return (0); 661 662 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 663 out: 664 uma_zfree_smr(pcbinfo->ipi_zone, inp); 665 return (error); 666 #endif 667 } 668 669 #ifdef INET 670 int 671 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 672 { 673 int anonport, error; 674 675 KASSERT(nam == NULL || nam->sa_family == AF_INET, 676 ("%s: invalid address family for %p", __func__, nam)); 677 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 678 ("%s: invalid address length for %p", __func__, nam)); 679 INP_WLOCK_ASSERT(inp); 680 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 681 682 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 683 return (EINVAL); 684 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 685 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 686 &inp->inp_lport, cred); 687 if (error) 688 return (error); 689 if (in_pcbinshash(inp) != 0) { 690 inp->inp_laddr.s_addr = INADDR_ANY; 691 inp->inp_lport = 0; 692 return (EAGAIN); 693 } 694 if (anonport) 695 inp->inp_flags |= INP_ANONPORT; 696 return (0); 697 } 698 #endif 699 700 #if defined(INET) || defined(INET6) 701 /* 702 * Assign a local port like in_pcb_lport(), but also used with connect() 703 * and a foreign address and port. If fsa is non-NULL, choose a local port 704 * that is unused with those, otherwise one that is completely unused. 705 * lsa can be NULL for IPv6. 
 */
int
in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
    struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/* Select the port range and last-used cursor for this socket class. */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		/* Reserved ports require privilege. */
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY;	/* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;
	lport = *lportp;

	/* Optionally start the scan at a random offset within the range. */
	if (V_ipport_randomized)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	/* Walk the range once, looking for a port with no conflicting pcb. */
	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
			/* 4-tuple check against the connection hash. */
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    NULL, M_NODOM);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    NULL, M_NODOM);
			}
#endif
		} else {
			/* Local-only check against the port hash. */
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}

/*
 * Select a local port (number) to use.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int lookupflags)
{
	struct sockaddr_in laddr;

	/* Wrap the bare in_addr into a sockaddr for in_pcb_lport_dest(). */
	if (laddrp) {
		bzero(&laddr, sizeof(laddr));
		laddr.sin_family = AF_INET;
		laddr.sin_addr = *laddrp;
	}
	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
	    NULL, lportp, NULL, 0, cred, lookupflags));
}

/*
 * Return cached socket options.
 */
int
inp_so_options(const struct inpcb *inp)
{
	int so_options;

	so_options = 0;

	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
		so_options |= SO_REUSEPORT_LB;
	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
		so_options |= SO_REUSEPORT;
	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
		so_options |= SO_REUSEADDR;
	return (so_options);
}
#endif /* INET || INET6 */

/*
 * Check if a new BINDMULTI socket is allowed to be created.
 *
 * ni points to the new inp.
 * oi points to the existing inp.
 *
 * This checks whether the existing inp also has BINDMULTI and
 * whether the credentials match.
 */
int
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{
	/* Check permissions match */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    (ni->inp_cred->cr_uid !=
	    oi->inp_cred->cr_uid))
		return (0);

	/* Check the existing inp has BINDMULTI set */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
		return (0);

	/*
	 * We're okay - either INP_BINDMULTI isn't set on ni, or
	 * it is and it matches the checks.
	 */
	return (1);
}

#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB.
Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	/* A caller-supplied sockaddr and a preset local address conflict. */
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
				/*
				 * XXX
				 * This entire block sorely needs a rewrite.
				 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) ||
				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0 &&
			    (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
				/*
				 * Allow an IPv4 and an IPv6-protocol socket
				 * to share a wildcard binding; otherwise the
				 * address is in use.
				 */
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
					return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
    bool rehash)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	/*
	 * Remember whether the local port was auto-assigned so that the pcb
	 * can be marked INP_ANONPORT once the connect has succeeded.
	 */
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
	    NULL, cred);
	if (error)
		return (error);

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		KASSERT(rehash == true,
		    ("Rehashing required for unbound inps"));
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/* Hash insertion failed: undo the provisional bind. */
			inp->inp_laddr.s_addr = INADDR_ANY;
			inp->inp_lport = 0;
			return (EAGAIN);
		}
	}

	/* Commit the remaining changes. */
	inp->inp_lport = lport;
	inp->inp_laddr.s_addr = laddr;
	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;
	if (rehash) {
		in_pcbrehash(inp);
	} else {
		in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}

/*
 * Do proper source address selection on an unbound socket in case
 * of connect.  Take jails into account as well.
 *
 * On success *laddr is filled in and 0 is returned; otherwise an
 * errno value (e.g. ENETUNREACH) is returned and *laddr is untouched.
 * Must be called within a network epoch section.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, use the address found above directly. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: look for any address on the same interface that
		 * belongs to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there.  That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	return (error);
}

/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port.
These are not updated in the error case.
 *
 * If the operation fails because the connection already exists,
 * *oinpp will be set to the PCB of that connection so that the
 * caller can decide to override it.  In all other cases, *oinpp
 * is set to NULL.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/*
		 * Precompute the software flow id so that multipath route
		 * selection for this connection is stable.
		 */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if (cred != NULL &&
			    (error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if ((ia->ia_ifp == ifp) &&
					    (cred == NULL ||
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0))
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/*
		 * A local port is already bound: refuse to create a
		 * fully-duplicate connection, reporting the clashing pcb
		 * through *oinpp so the caller may decide to override it.
		 */
		oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, NULL, M_NODOM);
		if (oinp != NULL) {
			if (oinpp != NULL)
				*oinpp = oinp;
			return (EADDRINUSE);
		}
	} else {
		/* No local port yet: pick an ephemeral one. */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Undo the foreign-address binding of a connected pcb, returning it to
 * the unconnected state.  The pcb must be rehashed because its hash
 * chain depends on the foreign address, hence the hash write lock.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
	in_pcbrehash(inp);
}
#endif /* INET */

/*
 * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
 * For most protocols, this will be invoked immediately prior to calling
 * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
 * socket, in which case in_pcbfree() is deferred.
 */
void
in_pcbdetach(struct inpcb *inp)
{

	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;
}

/*
 * inpcb hash lookups are protected by SMR section.
 *
 * Once desired pcb has been found, switching from SMR section to a pcb
 * lock is performed with inp_smr_lock().  We can not use INP_(W|R)LOCK
 * here because SMR is a critical section.
 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 */

/* Acquire the pcb lock in the mode requested by the lookup flag. */
static inline void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

/* Release the pcb lock acquired with inp_lock()/inp_trylock(). */
static inline void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

/* Non-blocking acquire of the pcb lock; safe within an SMR section. */
static inline int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

/* Drop a reference, matching the lock mode used by the caller. */
static inline bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Transition from the SMR section into holding the pcb lock.  On success
 * returns true with the pcb locked and the SMR section exited; on failure
 * (the pcb was freed concurrently) returns false with the SMR section
 * exited and no lock held.
 */
bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		/*
		 * The reference keeps the pcb stable, so it is safe to
		 * leave the SMR section and block on the lock.
		 */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list.  This is violated by in_pcbrehash().
 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash))
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(inp_smr_lock(inp, lock)))
				break;
			else {
				/*
				 * Lock failed and SMR section was exited by
				 * inp_smr_lock(); re-enter and restart from
				 * the list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Release the previous pcb and hand the new one to the caller. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 *
 * Returns true if the pcb was freed; in that case the lock has been
 * dropped.  Returns false with the read lock still held otherwise.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Write-locked variant of in_pcbrele_rlocked(); same contract, but the
 * caller holds (and on false keeps) the write lock.
 */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.
If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/* Mark the pcb dead so SMR lookups in flight will reject it. */
	inp->inp_flags |= INP_FREED;
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	/* Stash the moptions pointers: they are freed after unlocking. */
	imo = inp->inp_moptions;
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
	} else
		im6o = NULL;
#endif

	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		/* Deferred free: another reference is outstanding. */
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	/* Destruction is finalized in inpcb_dtor(). */
}

/*
 * UMA zone destructor: drop the credential reference held by the pcb.
 */
static void
inpcb_dtor(void *mem, int size, void *arg)
{
	struct inpcb *inp = mem;

	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock.  But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbnotifyall() and in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
#ifdef INVARIANTS
	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
		MPASS(inp->inp_refcount > 1);
#endif

	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}

#ifdef INET
/*
 * Common routines to return the socket addresses associated with inpcbs.
 */
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	/* M_WAITOK: this allocation cannot fail, only sleep. */
	sin = malloc(sizeof *sin, M_SONAME,
	    M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	/* Caller is responsible for freeing the returned sockaddr. */
	return (struct sockaddr *)sin;
}

/*
 * Return the bound local address/port of the socket in a freshly
 * allocated sockaddr (via *nam).
 */
int
in_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct inpcb *inp;
	struct in_addr addr;
	in_port_t port;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));

	INP_RLOCK(inp);
	port = inp->inp_lport;
	addr = inp->inp_laddr;
	INP_RUNLOCK(inp);

	*nam = in_sockaddr(port, &addr);
	return 0;
}

/*
 * Return the connected foreign address/port of the socket in a freshly
 * allocated sockaddr (via *nam).
 */
int
in_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct inpcb *inp;
	struct in_addr addr;
	in_port_t port;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));

	INP_RLOCK(inp);
	port = inp->inp_fport;
	addr = inp->inp_faddr;
	INP_RUNLOCK(inp);

	*nam = in_sockaddr(port, &addr);
	return 0;
}

/*
 * Invoke the notify callback with 'errno' on every IPv4 pcb connected to
 * faddr.  If the callback returns non-NULL the pcb is unlocked here;
 * NOTE(review): a NULL return appears to mean the callback disposed of
 * the pcb/lock itself -- confirm against the notify implementations.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
    struct inpcb *(*notify)(struct inpcb *, int))
{
	struct inpcb *inp, *inp_temp;

	INP_INFO_WLOCK(pcbinfo);
	CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) {
		INP_WLOCK(inp);
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			INP_WUNLOCK(inp);
			continue;
		}
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			INP_WUNLOCK(inp);
			continue;
		}
		if ((*notify)(inp, errno))
			INP_WUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(pcbinfo);
}

/*
 * Iterator match function: select IPv4 pcbs that carry multicast options.
 * Runs in SMR context, so it only reads immutable/stable fields.
 */
static bool
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
{

	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
		return (true);
	else
		return (false);
}

void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			/* Removal invalidates the iterator; rescan. */
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			goto restart;
		}
	}
}

/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found?
				 */
				if (cred == NULL ||
				    prison_equal_ip4(cred->cr_prison,
					inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs.  Look for best
			 * fit.  Lower 'wildcard' score means a more specific
			 * match; a perfect (score 0) match ends the scan.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (cred != NULL &&
				    !prison_equal_ip4(inp->inp_cred->cr_prison,
					cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * Select a pcb from the load-balance group bound to lport, hashing the
 * foreign address/port to spread connections across group members.
 * Preference: exact local address (same NUMA domain if known), then exact
 * local address in any domain, then a wildcard-bound member.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
    uint16_t fport, int lookupflags, int numa_domain)
{
	struct inpcb *local_wild, *numa_wild;
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Order of socket selection:
	 * 1. non-wild.
	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
	 *
	 * NOTE:
	 * - Load balanced group does not contain jailed sockets
	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
	 */
	local_wild = NULL;
	numa_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* Pick a group member deterministically from the 4-tuple. */
		idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
		    grp->il_inpcnt;
		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (numa_domain == M_NODOM ||
			    grp->il_numa_domain == numa_domain) {
				return (grp->il_inp[idx]);
			} else {
				numa_wild = grp->il_inp[idx];
			}
		}
		if (grp->il_laddr.s_addr == INADDR_ANY &&
		    (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
		    (local_wild == NULL || numa_domain == M_NODOM ||
			grp->il_numa_domain == numa_domain)) {
			local_wild = grp->il_inp[idx];
		}
	}
	if (numa_wild != NULL)
		return (numa_wild);

	return (local_wild);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp, uint8_t numa_domain)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look in lb group (for wildcard match).
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags, numa_domain);
		if (inp != NULL)
			return (inp);
	}

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 * 1. jailed, non-wild.
		 * 2. jailed, wild.
		 * 3. non-jailed, non-wild.
		 * 4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4_locked(
				    inp->inp_cred->cr_prison, &laddr) != 0)
					continue;
			} else {
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
 * hash list lock, and will return the inpcb locked (i.e., requires
 * INPLOOKUP_LOCKPCB).
 */
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp, uint8_t numa_domain)
{
	struct inpcb *inp;

	/*
	 * The lookup runs inside an SMR read section.  On a hit the SMR
	 * section is converted into a PCB lock by inp_smr_lock(); on a miss
	 * (or a lost lock race) the section is exited here.
	 */
	smr_enter(pcbinfo->ipi_smr);
	inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain);
	if (inp != NULL) {
		/* inp_smr_lock() exits the SMR section on its own. */
		if (__predict_false(inp_smr_lock(inp,
		    (lookupflags & INPLOOKUP_LOCKMASK)) == false))
			inp = NULL;
	} else
		smr_exit(pcbinfo->ipi_smr);

	return (inp);
}

/*
 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf
 * from which a pre-calculated hash value may be extracted.
 */
struct inpcb *
in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport,
    struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp)
{

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	/* No mbuf: NUMA affinity is unknown, use M_NODOM. */
	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, ifp, M_NODOM));
}

struct inpcb *
in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
    struct ifnet *ifp, struct mbuf *m)
{

	KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0,
	    ("%s: LOCKPCB not set", __func__));

	/* Use the NUMA domain recorded in the received mbuf, if any. */
	return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
	    lookupflags, ifp, m->m_pkthdr.numa_domain));
}
#endif /* INET */

/*
 * Insert PCB onto various hash lists.
2402 */ 2403 int 2404 in_pcbinshash(struct inpcb *inp) 2405 { 2406 struct inpcbhead *pcbhash; 2407 struct inpcbporthead *pcbporthash; 2408 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2409 struct inpcbport *phd; 2410 int so_options; 2411 2412 INP_WLOCK_ASSERT(inp); 2413 INP_HASH_WLOCK_ASSERT(pcbinfo); 2414 2415 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2416 ("in_pcbinshash: INP_INHASHLIST")); 2417 2418 #ifdef INET6 2419 if (inp->inp_vflag & INP_IPV6) 2420 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2421 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2422 else 2423 #endif 2424 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2425 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2426 2427 pcbporthash = &pcbinfo->ipi_porthashbase[ 2428 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2429 2430 /* 2431 * Add entry to load balance group. 2432 * Only do this if SO_REUSEPORT_LB is set. 2433 */ 2434 so_options = inp_so_options(inp); 2435 if (so_options & SO_REUSEPORT_LB) { 2436 int ret = in_pcbinslbgrouphash(inp, M_NODOM); 2437 if (ret) { 2438 /* pcb lb group malloc fail (ret=ENOBUFS). */ 2439 return (ret); 2440 } 2441 } 2442 2443 /* 2444 * Go through port list and look for a head for this lport. 2445 */ 2446 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2447 if (phd->phd_port == inp->inp_lport) 2448 break; 2449 } 2450 /* 2451 * If none exists, malloc one and tack it on. 
2452 */ 2453 if (phd == NULL) { 2454 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2455 if (phd == NULL) { 2456 return (ENOBUFS); /* XXX */ 2457 } 2458 phd->phd_port = inp->inp_lport; 2459 CK_LIST_INIT(&phd->phd_pcblist); 2460 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2461 } 2462 inp->inp_phd = phd; 2463 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2464 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2465 inp->inp_flags |= INP_INHASHLIST; 2466 2467 return (0); 2468 } 2469 2470 static void 2471 in_pcbremhash(struct inpcb *inp) 2472 { 2473 struct inpcbport *phd = inp->inp_phd; 2474 2475 INP_WLOCK_ASSERT(inp); 2476 MPASS(inp->inp_flags & INP_INHASHLIST); 2477 2478 INP_HASH_WLOCK(inp->inp_pcbinfo); 2479 /* XXX: Only do if SO_REUSEPORT_LB set? */ 2480 in_pcbremlbgrouphash(inp); 2481 CK_LIST_REMOVE(inp, inp_hash); 2482 CK_LIST_REMOVE(inp, inp_portlist); 2483 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2484 CK_LIST_REMOVE(phd, phd_hash); 2485 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2486 } 2487 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2488 inp->inp_flags &= ~INP_INHASHLIST; 2489 } 2490 2491 /* 2492 * Move PCB to the proper hash bucket when { faddr, fport } have been 2493 * changed. NOTE: This does not handle the case of the lport changing (the 2494 * hashed port list would have to be updated as well), so the lport must 2495 * not change after in_pcbinshash() has been called. 2496 * 2497 * XXXGL: a race between this function and SMR-protected hash iterator 2498 * will lead to iterator traversing a possibly wrong hash list. However, 2499 * this race should have been here since change from rwlock to epoch. 
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("in_pcbrehash: !INP_INHASHLIST"));

	/* Recompute the bucket from the (possibly new) faddr/fport. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr,
		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];
	else
#endif
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr,
		    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];

	CK_LIST_REMOVE(inp, inp_hash);
	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
}

/*
 * Check for alternatives when higher level complains
 * about service problems.  For now, invalidate cached
 * routing information.  If the route was created dynamically
 * (by a redirect), time to try a default gateway again.
 */
void
in_losing(struct inpcb *inp)
{

	RO_INVALIDATE_CACHE(&inp->inp_route);
	return;
}

/*
 * A set label operation has occurred at the socket layer, propagate the
 * label change into the in_pcb for the socket.
2543 */ 2544 void 2545 in_pcbsosetlabel(struct socket *so) 2546 { 2547 #ifdef MAC 2548 struct inpcb *inp; 2549 2550 inp = sotoinpcb(so); 2551 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2552 2553 INP_WLOCK(inp); 2554 SOCK_LOCK(so); 2555 mac_inpcb_sosetlabel(so, inp); 2556 SOCK_UNLOCK(so); 2557 INP_WUNLOCK(inp); 2558 #endif 2559 } 2560 2561 void 2562 inp_wlock(struct inpcb *inp) 2563 { 2564 2565 INP_WLOCK(inp); 2566 } 2567 2568 void 2569 inp_wunlock(struct inpcb *inp) 2570 { 2571 2572 INP_WUNLOCK(inp); 2573 } 2574 2575 void 2576 inp_rlock(struct inpcb *inp) 2577 { 2578 2579 INP_RLOCK(inp); 2580 } 2581 2582 void 2583 inp_runlock(struct inpcb *inp) 2584 { 2585 2586 INP_RUNLOCK(inp); 2587 } 2588 2589 #ifdef INVARIANT_SUPPORT 2590 void 2591 inp_lock_assert(struct inpcb *inp) 2592 { 2593 2594 INP_WLOCK_ASSERT(inp); 2595 } 2596 2597 void 2598 inp_unlock_assert(struct inpcb *inp) 2599 { 2600 2601 INP_UNLOCK_ASSERT(inp); 2602 } 2603 #endif 2604 2605 void 2606 inp_apply_all(struct inpcbinfo *pcbinfo, 2607 void (*func)(struct inpcb *, void *), void *arg) 2608 { 2609 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2610 INPLOOKUP_WLOCKPCB); 2611 struct inpcb *inp; 2612 2613 while ((inp = inp_next(&inpi)) != NULL) 2614 func(inp, arg); 2615 } 2616 2617 struct socket * 2618 inp_inpcbtosocket(struct inpcb *inp) 2619 { 2620 2621 INP_WLOCK_ASSERT(inp); 2622 return (inp->inp_socket); 2623 } 2624 2625 struct tcpcb * 2626 inp_inpcbtotcpcb(struct inpcb *inp) 2627 { 2628 2629 INP_WLOCK_ASSERT(inp); 2630 return ((struct tcpcb *)inp->inp_ppcb); 2631 } 2632 2633 int 2634 inp_ip_tos_get(const struct inpcb *inp) 2635 { 2636 2637 return (inp->inp_ip_tos); 2638 } 2639 2640 void 2641 inp_ip_tos_set(struct inpcb *inp, int val) 2642 { 2643 2644 inp->inp_ip_tos = val; 2645 } 2646 2647 void 2648 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2649 uint32_t *faddr, uint16_t *fp) 2650 { 2651 2652 INP_LOCK_ASSERT(inp); 2653 *laddr = inp->inp_laddr.s_addr; 2654 
*faddr = inp->inp_faddr.s_addr; 2655 *lp = inp->inp_lport; 2656 *fp = inp->inp_fport; 2657 } 2658 2659 struct inpcb * 2660 so_sotoinpcb(struct socket *so) 2661 { 2662 2663 return (sotoinpcb(so)); 2664 } 2665 2666 /* 2667 * Create an external-format (``xinpcb'') structure using the information in 2668 * the kernel-format in_pcb structure pointed to by inp. This is done to 2669 * reduce the spew of irrelevant information over this interface, to isolate 2670 * user code from changes in the kernel structure, and potentially to provide 2671 * information-hiding if we decide that some of this information should be 2672 * hidden from users. 2673 */ 2674 void 2675 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2676 { 2677 2678 bzero(xi, sizeof(*xi)); 2679 xi->xi_len = sizeof(struct xinpcb); 2680 if (inp->inp_socket) 2681 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2682 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2683 xi->inp_gencnt = inp->inp_gencnt; 2684 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2685 xi->inp_flow = inp->inp_flow; 2686 xi->inp_flowid = inp->inp_flowid; 2687 xi->inp_flowtype = inp->inp_flowtype; 2688 xi->inp_flags = inp->inp_flags; 2689 xi->inp_flags2 = inp->inp_flags2; 2690 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2691 xi->in6p_cksum = inp->in6p_cksum; 2692 xi->in6p_hops = inp->in6p_hops; 2693 xi->inp_ip_tos = inp->inp_ip_tos; 2694 xi->inp_vflag = inp->inp_vflag; 2695 xi->inp_ip_ttl = inp->inp_ip_ttl; 2696 xi->inp_ip_p = inp->inp_ip_p; 2697 xi->inp_ip_minttl = inp->inp_ip_minttl; 2698 } 2699 2700 int 2701 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2702 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2703 { 2704 struct sockopt sopt; 2705 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2706 INPLOOKUP_WLOCKPCB); 2707 struct inpcb *inp; 2708 struct sockopt_parameters *params; 2709 struct socket *so; 2710 int error; 2711 char buf[1024]; 2712 2713 if (req->oldptr != NULL || 
req->oldlen != 0) 2714 return (EINVAL); 2715 if (req->newptr == NULL) 2716 return (EPERM); 2717 if (req->newlen > sizeof(buf)) 2718 return (ENOMEM); 2719 error = SYSCTL_IN(req, buf, req->newlen); 2720 if (error != 0) 2721 return (error); 2722 if (req->newlen < sizeof(struct sockopt_parameters)) 2723 return (EINVAL); 2724 params = (struct sockopt_parameters *)buf; 2725 sopt.sopt_level = params->sop_level; 2726 sopt.sopt_name = params->sop_optname; 2727 sopt.sopt_dir = SOPT_SET; 2728 sopt.sopt_val = params->sop_optval; 2729 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2730 sopt.sopt_td = NULL; 2731 #ifdef INET6 2732 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2733 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2734 params->sop_inc.inc6_laddr.s6_addr16[1] = 2735 htons(params->sop_inc.inc6_zoneid & 0xffff); 2736 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2737 params->sop_inc.inc6_faddr.s6_addr16[1] = 2738 htons(params->sop_inc.inc6_zoneid & 0xffff); 2739 } 2740 #endif 2741 if (params->sop_inc.inc_lport != htons(0)) { 2742 if (params->sop_inc.inc_fport == htons(0)) 2743 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, 2744 pcbinfo->ipi_hashmask); 2745 else 2746 #ifdef INET6 2747 if (params->sop_inc.inc_flags & INC_ISIPV6) 2748 inpi.hash = INP6_PCBHASH( 2749 ¶ms->sop_inc.inc6_faddr, 2750 params->sop_inc.inc_lport, 2751 params->sop_inc.inc_fport, 2752 pcbinfo->ipi_hashmask); 2753 else 2754 #endif 2755 inpi.hash = INP_PCBHASH( 2756 ¶ms->sop_inc.inc_faddr, 2757 params->sop_inc.inc_lport, 2758 params->sop_inc.inc_fport, 2759 pcbinfo->ipi_hashmask); 2760 } 2761 while ((inp = inp_next(&inpi)) != NULL) 2762 if (inp->inp_gencnt == params->sop_id) { 2763 if (inp->inp_flags & INP_DROPPED) { 2764 INP_WUNLOCK(inp); 2765 return (ECONNRESET); 2766 } 2767 so = inp->inp_socket; 2768 KASSERT(so != NULL, ("inp_socket == NULL")); 2769 soref(so); 2770 error = (*ctloutput_set)(inp, &sopt); 2771 sorele(so); 2772 break; 2773 } 2774 if (inp == 
NULL) 2775 error = ESRCH; 2776 return (error); 2777 } 2778 2779 #ifdef DDB 2780 static void 2781 db_print_indent(int indent) 2782 { 2783 int i; 2784 2785 for (i = 0; i < indent; i++) 2786 db_printf(" "); 2787 } 2788 2789 static void 2790 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2791 { 2792 char faddr_str[48], laddr_str[48]; 2793 2794 db_print_indent(indent); 2795 db_printf("%s at %p\n", name, inc); 2796 2797 indent += 2; 2798 2799 #ifdef INET6 2800 if (inc->inc_flags & INC_ISIPV6) { 2801 /* IPv6. */ 2802 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2803 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2804 } else 2805 #endif 2806 { 2807 /* IPv4. */ 2808 inet_ntoa_r(inc->inc_laddr, laddr_str); 2809 inet_ntoa_r(inc->inc_faddr, faddr_str); 2810 } 2811 db_print_indent(indent); 2812 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2813 ntohs(inc->inc_lport)); 2814 db_print_indent(indent); 2815 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2816 ntohs(inc->inc_fport)); 2817 } 2818 2819 static void 2820 db_print_inpflags(int inp_flags) 2821 { 2822 int comma; 2823 2824 comma = 0; 2825 if (inp_flags & INP_RECVOPTS) { 2826 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2827 comma = 1; 2828 } 2829 if (inp_flags & INP_RECVRETOPTS) { 2830 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2831 comma = 1; 2832 } 2833 if (inp_flags & INP_RECVDSTADDR) { 2834 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2835 comma = 1; 2836 } 2837 if (inp_flags & INP_ORIGDSTADDR) { 2838 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2839 comma = 1; 2840 } 2841 if (inp_flags & INP_HDRINCL) { 2842 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2843 comma = 1; 2844 } 2845 if (inp_flags & INP_HIGHPORT) { 2846 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2847 comma = 1; 2848 } 2849 if (inp_flags & INP_LOWPORT) { 2850 db_printf("%sINP_LOWPORT", comma ? 
", " : ""); 2851 comma = 1; 2852 } 2853 if (inp_flags & INP_ANONPORT) { 2854 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2855 comma = 1; 2856 } 2857 if (inp_flags & INP_RECVIF) { 2858 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2859 comma = 1; 2860 } 2861 if (inp_flags & INP_MTUDISC) { 2862 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2863 comma = 1; 2864 } 2865 if (inp_flags & INP_RECVTTL) { 2866 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2867 comma = 1; 2868 } 2869 if (inp_flags & INP_DONTFRAG) { 2870 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2871 comma = 1; 2872 } 2873 if (inp_flags & INP_RECVTOS) { 2874 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2875 comma = 1; 2876 } 2877 if (inp_flags & IN6P_IPV6_V6ONLY) { 2878 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2879 comma = 1; 2880 } 2881 if (inp_flags & IN6P_PKTINFO) { 2882 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2883 comma = 1; 2884 } 2885 if (inp_flags & IN6P_HOPLIMIT) { 2886 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2887 comma = 1; 2888 } 2889 if (inp_flags & IN6P_HOPOPTS) { 2890 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2891 comma = 1; 2892 } 2893 if (inp_flags & IN6P_DSTOPTS) { 2894 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2895 comma = 1; 2896 } 2897 if (inp_flags & IN6P_RTHDR) { 2898 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2899 comma = 1; 2900 } 2901 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2902 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2903 comma = 1; 2904 } 2905 if (inp_flags & IN6P_TCLASS) { 2906 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2907 comma = 1; 2908 } 2909 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2910 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2911 comma = 1; 2912 } 2913 if (inp_flags & INP_ONESBCAST) { 2914 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2915 comma = 1; 2916 } 2917 if (inp_flags & INP_DROPPED) { 2918 db_printf("%sINP_DROPPED", comma ? 
", " : ""); 2919 comma = 1; 2920 } 2921 if (inp_flags & INP_SOCKREF) { 2922 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2923 comma = 1; 2924 } 2925 if (inp_flags & IN6P_RFC2292) { 2926 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 2927 comma = 1; 2928 } 2929 if (inp_flags & IN6P_MTU) { 2930 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2931 comma = 1; 2932 } 2933 } 2934 2935 static void 2936 db_print_inpvflag(u_char inp_vflag) 2937 { 2938 int comma; 2939 2940 comma = 0; 2941 if (inp_vflag & INP_IPV4) { 2942 db_printf("%sINP_IPV4", comma ? ", " : ""); 2943 comma = 1; 2944 } 2945 if (inp_vflag & INP_IPV6) { 2946 db_printf("%sINP_IPV6", comma ? ", " : ""); 2947 comma = 1; 2948 } 2949 if (inp_vflag & INP_IPV6PROTO) { 2950 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 2951 comma = 1; 2952 } 2953 } 2954 2955 static void 2956 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2957 { 2958 2959 db_print_indent(indent); 2960 db_printf("%s at %p\n", name, inp); 2961 2962 indent += 2; 2963 2964 db_print_indent(indent); 2965 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2966 2967 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2968 2969 db_print_indent(indent); 2970 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2971 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 2972 2973 db_print_indent(indent); 2974 db_printf("inp_label: %p inp_flags: 0x%x (", 2975 inp->inp_label, inp->inp_flags); 2976 db_print_inpflags(inp->inp_flags); 2977 db_printf(")\n"); 2978 2979 db_print_indent(indent); 2980 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 2981 inp->inp_vflag); 2982 db_print_inpvflag(inp->inp_vflag); 2983 db_printf(")\n"); 2984 2985 db_print_indent(indent); 2986 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 2987 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 2988 2989 db_print_indent(indent); 2990 #ifdef INET6 2991 if (inp->inp_vflag & INP_IPV6) { 2992 db_printf("in6p_options: %p in6p_outputopts: %p " 2993 
"in6p_moptions: %p\n", inp->in6p_options, 2994 inp->in6p_outputopts, inp->in6p_moptions); 2995 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 2996 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 2997 inp->in6p_hops); 2998 } else 2999 #endif 3000 { 3001 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3002 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3003 inp->inp_options, inp->inp_moptions); 3004 } 3005 3006 db_print_indent(indent); 3007 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3008 (uintmax_t)inp->inp_gencnt); 3009 } 3010 3011 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3012 { 3013 struct inpcb *inp; 3014 3015 if (!have_addr) { 3016 db_printf("usage: show inpcb <addr>\n"); 3017 return; 3018 } 3019 inp = (struct inpcb *)addr; 3020 3021 db_print_inpcb(inp, "inpcb", 0); 3022 } 3023 #endif /* DDB */ 3024 3025 #ifdef RATELIMIT 3026 /* 3027 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3028 * if any. 3029 */ 3030 int 3031 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3032 { 3033 union if_snd_tag_modify_params params = { 3034 .rate_limit.max_rate = max_pacing_rate, 3035 .rate_limit.flags = M_NOWAIT, 3036 }; 3037 struct m_snd_tag *mst; 3038 int error; 3039 3040 mst = inp->inp_snd_tag; 3041 if (mst == NULL) 3042 return (EINVAL); 3043 3044 if (mst->sw->snd_tag_modify == NULL) { 3045 error = EOPNOTSUPP; 3046 } else { 3047 error = mst->sw->snd_tag_modify(mst, ¶ms); 3048 } 3049 return (error); 3050 } 3051 3052 /* 3053 * Query existing TX rate limit based on the existing 3054 * "inp->inp_snd_tag", if any. 
 */
int
in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL) {
		error = EOPNOTSUPP;
	} else {
		error = mst->sw->snd_tag_query(mst, &params);
		if (error == 0 && p_max_pacing_rate != NULL)
			*p_max_pacing_rate = params.rate_limit.max_rate;
	}
	return (error);
}

/*
 * Query existing TX queue level based on the existing
 * "inp->inp_snd_tag", if any.
 */
int
in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level)
{
	union if_snd_tag_query_params params = { };
	struct m_snd_tag *mst;
	int error;

	mst = inp->inp_snd_tag;
	if (mst == NULL)
		return (EINVAL);

	if (mst->sw->snd_tag_query == NULL)
		return (EOPNOTSUPP);

	error = mst->sw->snd_tag_query(mst, &params);
	if (error == 0 && p_txqueue_level != NULL)
		*p_txqueue_level = params.rate_limit.queue_level;
	return (error);
}

/*
 * Allocate a new TX rate limit send tag from the network interface
 * given by the "ifp" argument and save it in "inp->inp_snd_tag":
 */
int
in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp,
    uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st)
{
	union if_snd_tag_alloc_params params = {
		/* -1U requests an unlimited tag rather than a rate limit. */
		.rate_limit.hdr.type = (max_pacing_rate == -1U) ?
		    IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT,
		.rate_limit.hdr.flowid = flowid,
		.rate_limit.hdr.flowtype = flowtype,
		.rate_limit.hdr.numa_domain = inp->inp_numa_domain,
		.rate_limit.max_rate = max_pacing_rate,
		.rate_limit.flags = M_NOWAIT,
	};
	int error;

	INP_WLOCK_ASSERT(inp);

	/*
	 * If there is already a send tag, or the INP is being torn
	 * down, allocating a new send tag is not allowed.  Else send
	 * tags may leak.
	 */
	if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0)
		return (EINVAL);

	error = m_snd_tag_alloc(ifp, &params, st);
#ifdef INET
	if (error == 0) {
		counter_u64_add(rate_limit_set_ok, 1);
		counter_u64_add(rate_limit_active, 1);
	} else if (error != EOPNOTSUPP)
		counter_u64_add(rate_limit_alloc_fail, 1);
#endif
	return (error);
}

/* Release a send tag and update the active-tag statistics. */
void
in_pcbdetach_tag(struct m_snd_tag *mst)
{

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}

/*
 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag",
 * if any:
 */
void
in_pcbdetach_txrtlmt(struct inpcb *inp)
{
	struct m_snd_tag *mst;

	INP_WLOCK_ASSERT(inp);

	mst = inp->inp_snd_tag;
	inp->inp_snd_tag = NULL;

	if (mst == NULL)
		return;

	m_snd_tag_rele(mst);
#ifdef INET
	counter_u64_add(rate_limit_active, -1);
#endif
}

/*
 * Attach, detach or modify the TX rate limit send tag so that it matches
 * "max_pacing_rate" and the interface the traffic currently leaves on.
 * Caller holds the inp write lock.
 */
int
in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate)
{
	int error;

	/*
	 * If the existing send tag is for the wrong interface due to
	 * a route change, first drop the existing tag.  Set the
	 * CHANGED flag so that we will keep trying to allocate a new
	 * tag if we fail to allocate one this time.
	 */
	if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) {
		in_pcbdetach_txrtlmt(inp);
		inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED;
	}

	/*
	 * NOTE: When attaching to a network interface a reference is
	 * made to ensure the network interface doesn't go away until
	 * all ratelimit connections are gone.  The network interface
	 * pointers compared below represent valid network interfaces,
	 * except when comparing towards NULL.
	 */
	if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) {
		error = 0;
	} else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) {
		if (inp->inp_snd_tag != NULL)
			in_pcbdetach_txrtlmt(inp);
		error = 0;
	} else if (inp->inp_snd_tag == NULL) {
		/*
		 * In order to utilize packet pacing with RSS, we need
		 * to wait until there is a valid RSS hash before we
		 * can proceed:
		 */
		if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) {
			error = EAGAIN;
		} else {
			error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb),
			    mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag);
		}
	} else {
		error = in_pcbmodify_txrtlmt(inp, max_pacing_rate);
	}
	if (error == 0 || error == EOPNOTSUPP)
		inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED;

	return (error);
}

/*
 * This function should be called when the INP_RATE_LIMIT_CHANGED flag
 * is set in the fast path and will attach/detach/modify the TX rate
 * limit send tag based on the socket's so_max_pacing_rate value.
3229 */ 3230 void 3231 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3232 { 3233 struct socket *socket; 3234 uint32_t max_pacing_rate; 3235 bool did_upgrade; 3236 3237 if (inp == NULL) 3238 return; 3239 3240 socket = inp->inp_socket; 3241 if (socket == NULL) 3242 return; 3243 3244 if (!INP_WLOCKED(inp)) { 3245 /* 3246 * NOTE: If the write locking fails, we need to bail 3247 * out and use the non-ratelimited ring for the 3248 * transmit until there is a new chance to get the 3249 * write lock. 3250 */ 3251 if (!INP_TRY_UPGRADE(inp)) 3252 return; 3253 did_upgrade = 1; 3254 } else { 3255 did_upgrade = 0; 3256 } 3257 3258 /* 3259 * NOTE: The so_max_pacing_rate value is read unlocked, 3260 * because atomic updates are not required since the variable 3261 * is checked at every mbuf we send. It is assumed that the 3262 * variable read itself will be atomic. 3263 */ 3264 max_pacing_rate = socket->so_max_pacing_rate; 3265 3266 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3267 3268 if (did_upgrade) 3269 INP_DOWNGRADE(inp); 3270 } 3271 3272 /* 3273 * Track route changes for TX rate limiting. 3274 */ 3275 void 3276 in_pcboutput_eagain(struct inpcb *inp) 3277 { 3278 bool did_upgrade; 3279 3280 if (inp == NULL) 3281 return; 3282 3283 if (inp->inp_snd_tag == NULL) 3284 return; 3285 3286 if (!INP_WLOCKED(inp)) { 3287 /* 3288 * NOTE: If the write locking fails, we need to bail 3289 * out and use the non-ratelimited ring for the 3290 * transmit until there is a new chance to get the 3291 * write lock. 
3292 */ 3293 if (!INP_TRY_UPGRADE(inp)) 3294 return; 3295 did_upgrade = 1; 3296 } else { 3297 did_upgrade = 0; 3298 } 3299 3300 /* detach rate limiting */ 3301 in_pcbdetach_txrtlmt(inp); 3302 3303 /* make sure new mbuf send tag allocation is made */ 3304 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3305 3306 if (did_upgrade) 3307 INP_DOWNGRADE(inp); 3308 } 3309 3310 #ifdef INET 3311 static void 3312 rl_init(void *st) 3313 { 3314 rate_limit_new = counter_u64_alloc(M_WAITOK); 3315 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3316 rate_limit_active = counter_u64_alloc(M_WAITOK); 3317 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3318 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3319 } 3320 3321 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3322 #endif 3323 #endif /* RATELIMIT */ 3324