1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/lock.h> 54 #include <sys/malloc.h> 55 #include <sys/mbuf.h> 56 #include <sys/callout.h> 57 #include <sys/eventhandler.h> 58 #include <sys/domain.h> 59 #include <sys/protosw.h> 60 #include <sys/smp.h> 61 #include <sys/socket.h> 62 #include <sys/socketvar.h> 63 #include <sys/sockio.h> 64 #include <sys/priv.h> 65 #include <sys/proc.h> 66 #include <sys/refcount.h> 67 #include <sys/jail.h> 68 #include <sys/kernel.h> 69 #include <sys/sysctl.h> 70 71 #ifdef DDB 72 #include <ddb/ddb.h> 73 #endif 74 75 #include <vm/uma.h> 76 #include <vm/vm.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_types.h> 81 #include <net/if_llatbl.h> 82 #include <net/route.h> 83 #include <net/rss_config.h> 84 #include <net/vnet.h> 85 86 #if defined(INET) || defined(INET6) 87 #include <netinet/in.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/in_pcb_var.h> 90 #ifdef INET 91 #include <netinet/in_var.h> 92 #include <netinet/in_fib.h> 93 #endif 94 #include <netinet/ip_var.h> 95 #include <netinet/tcp_var.h> 96 #ifdef TCPHPTS 97 #include <netinet/tcp_hpts.h> 98 #endif 99 #include <netinet/udp.h> 100 #include <netinet/udp_var.h> 101 #ifdef INET6 102 #include <netinet/ip6.h> 103 #include <netinet6/in6_pcb.h> 104 #include <netinet6/in6_var.h> 105 #include <netinet6/ip6_var.h> 106 #endif /* INET6 */ 107 #include <net/route/nhop.h> 108 #endif 109 110 #include <netipsec/ipsec_support.h> 111 112 #include <security/mac/mac_framework.h> 113 114 #define INPCBLBGROUP_SIZMIN 8 115 #define INPCBLBGROUP_SIZMAX 256 116 #define INP_FREED 0x00000200 /* See in_pcb.h. 
*/ 117 118 static struct callout ipport_tick_callout; 119 120 /* 121 * These configure the range of local port addresses assigned to 122 * "unspecified" outgoing connections/packets/whatever. 123 */ 124 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 125 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 126 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 127 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 128 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 129 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 130 131 /* 132 * Reserved ports accessible only to root. There are significant 133 * security considerations that must be accounted for when changing these, 134 * but the security benefits can be great. Please be careful. 135 */ 136 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 137 VNET_DEFINE(int, ipport_reservedlow); 138 139 /* Variables dealing with random ephemeral port allocation. 
*/ 140 VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ 141 VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ 142 VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ 143 VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ 144 VNET_DEFINE(int, ipport_tcpallocs); 145 VNET_DEFINE_STATIC(int, ipport_tcplastcount); 146 147 #define V_ipport_tcplastcount VNET(ipport_tcplastcount) 148 149 #ifdef INET 150 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 151 struct in_addr faddr, u_int fport_arg, 152 struct in_addr laddr, u_int lport_arg, 153 int lookupflags, struct ifnet *ifp, 154 uint8_t numa_domain); 155 156 #define RANGECHK(var, min, max) \ 157 if ((var) < (min)) { (var) = (min); } \ 158 else if ((var) > (max)) { (var) = (max); } 159 160 static int 161 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 162 { 163 int error; 164 165 error = sysctl_handle_int(oidp, arg1, arg2, req); 166 if (error == 0) { 167 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 168 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 169 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 170 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 171 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 172 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 173 } 174 return (error); 175 } 176 177 #undef RANGECHK 178 179 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 180 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 181 "IP Ports"); 182 183 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 184 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 185 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 186 ""); 187 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 188 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 189 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 190 ""); 191 
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 192 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 193 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 194 ""); 195 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 196 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 197 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 198 ""); 199 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 200 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 201 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 202 ""); 203 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 204 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 205 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 206 ""); 207 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 208 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 209 &VNET_NAME(ipport_reservedhigh), 0, ""); 210 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 211 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 212 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 213 CTLFLAG_VNET | CTLFLAG_RW, 214 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 215 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, 216 CTLFLAG_VNET | CTLFLAG_RW, 217 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " 218 "allocations before switching to a sequential one"); 219 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, 220 CTLFLAG_VNET | CTLFLAG_RW, 221 &VNET_NAME(ipport_randomtime), 0, 222 "Minimum time to keep sequential port " 223 "allocation before switching to a random one"); 224 225 #ifdef RATELIMIT 226 counter_u64_t rate_limit_new; 227 counter_u64_t rate_limit_chg; 228 counter_u64_t rate_limit_active; 229 counter_u64_t rate_limit_alloc_fail; 230 counter_u64_t rate_limit_set_ok; 231 232 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 233 "IP 
Rate Limiting"); 234 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 235 &rate_limit_active, "Active rate limited connections"); 236 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 237 &rate_limit_alloc_fail, "Rate limited connection failures"); 238 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 239 &rate_limit_set_ok, "Rate limited setting succeeded"); 240 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 241 &rate_limit_new, "Total Rate limit new attempts"); 242 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 243 &rate_limit_chg, "Total Rate limited change attempts"); 244 245 #endif /* RATELIMIT */ 246 247 #endif /* INET */ 248 249 /* 250 * in_pcb.c: manage the Protocol Control Blocks. 251 * 252 * NOTE: It is assumed that most of these functions will be called with 253 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 254 * functions often modify hash chains or addresses in pcbs. 
255 */ 256 257 static struct inpcblbgroup * 258 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, 259 uint16_t port, const union in_dependaddr *addr, int size, 260 uint8_t numa_domain) 261 { 262 struct inpcblbgroup *grp; 263 size_t bytes; 264 265 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 266 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 267 if (!grp) 268 return (NULL); 269 grp->il_vflag = vflag; 270 grp->il_lport = port; 271 grp->il_numa_domain = numa_domain; 272 grp->il_dependladdr = *addr; 273 grp->il_inpsiz = size; 274 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 275 return (grp); 276 } 277 278 static void 279 in_pcblbgroup_free_deferred(epoch_context_t ctx) 280 { 281 struct inpcblbgroup *grp; 282 283 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 284 free(grp, M_PCB); 285 } 286 287 static void 288 in_pcblbgroup_free(struct inpcblbgroup *grp) 289 { 290 291 CK_LIST_REMOVE(grp, il_list); 292 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 293 } 294 295 static struct inpcblbgroup * 296 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 297 struct inpcblbgroup *old_grp, int size) 298 { 299 struct inpcblbgroup *grp; 300 int i; 301 302 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, 303 old_grp->il_lport, &old_grp->il_dependladdr, size, 304 old_grp->il_numa_domain); 305 if (grp == NULL) 306 return (NULL); 307 308 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 309 ("invalid new local group size %d and old local group count %d", 310 grp->il_inpsiz, old_grp->il_inpcnt)); 311 312 for (i = 0; i < old_grp->il_inpcnt; ++i) 313 grp->il_inp[i] = old_grp->il_inp[i]; 314 grp->il_inpcnt = old_grp->il_inpcnt; 315 in_pcblbgroup_free(old_grp); 316 return (grp); 317 } 318 319 /* 320 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 321 * and shrink group if possible. 
322 */ 323 static void 324 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 325 int i) 326 { 327 struct inpcblbgroup *grp, *new_grp; 328 329 grp = *grpp; 330 for (; i + 1 < grp->il_inpcnt; ++i) 331 grp->il_inp[i] = grp->il_inp[i + 1]; 332 grp->il_inpcnt--; 333 334 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 335 grp->il_inpcnt <= grp->il_inpsiz / 4) { 336 /* Shrink this group. */ 337 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 338 if (new_grp != NULL) 339 *grpp = new_grp; 340 } 341 } 342 343 /* 344 * Add PCB to load balance group for SO_REUSEPORT_LB option. 345 */ 346 static int 347 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 348 { 349 const static struct timeval interval = { 60, 0 }; 350 static struct timeval lastprint; 351 struct inpcbinfo *pcbinfo; 352 struct inpcblbgrouphead *hdr; 353 struct inpcblbgroup *grp; 354 uint32_t idx; 355 356 pcbinfo = inp->inp_pcbinfo; 357 358 INP_WLOCK_ASSERT(inp); 359 INP_HASH_WLOCK_ASSERT(pcbinfo); 360 361 /* 362 * Don't allow jailed socket to join local group. 363 */ 364 if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) 365 return (0); 366 367 #ifdef INET6 368 /* 369 * Don't allow IPv4 mapped INET6 wild socket. 370 */ 371 if ((inp->inp_vflag & INP_IPV4) && 372 inp->inp_laddr.s_addr == INADDR_ANY && 373 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 374 return (0); 375 } 376 #endif 377 378 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 379 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 380 CK_LIST_FOREACH(grp, hdr, il_list) { 381 if (grp->il_vflag == inp->inp_vflag && 382 grp->il_lport == inp->inp_lport && 383 grp->il_numa_domain == numa_domain && 384 memcmp(&grp->il_dependladdr, 385 &inp->inp_inc.inc_ie.ie_dependladdr, 386 sizeof(grp->il_dependladdr)) == 0) 387 break; 388 } 389 if (grp == NULL) { 390 /* Create new load balance group. 
*/ 391 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, 392 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 393 INPCBLBGROUP_SIZMIN, numa_domain); 394 if (grp == NULL) 395 return (ENOBUFS); 396 } else if (grp->il_inpcnt == grp->il_inpsiz) { 397 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 398 if (ratecheck(&lastprint, &interval)) 399 printf("lb group port %d, limit reached\n", 400 ntohs(grp->il_lport)); 401 return (0); 402 } 403 404 /* Expand this local group. */ 405 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 406 if (grp == NULL) 407 return (ENOBUFS); 408 } 409 410 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 411 ("invalid local group size %d and count %d", grp->il_inpsiz, 412 grp->il_inpcnt)); 413 414 grp->il_inp[grp->il_inpcnt] = inp; 415 grp->il_inpcnt++; 416 return (0); 417 } 418 419 /* 420 * Remove PCB from load balance group. 421 */ 422 static void 423 in_pcbremlbgrouphash(struct inpcb *inp) 424 { 425 struct inpcbinfo *pcbinfo; 426 struct inpcblbgrouphead *hdr; 427 struct inpcblbgroup *grp; 428 int i; 429 430 pcbinfo = inp->inp_pcbinfo; 431 432 INP_WLOCK_ASSERT(inp); 433 INP_HASH_WLOCK_ASSERT(pcbinfo); 434 435 hdr = &pcbinfo->ipi_lbgrouphashbase[ 436 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 437 CK_LIST_FOREACH(grp, hdr, il_list) { 438 for (i = 0; i < grp->il_inpcnt; ++i) { 439 if (grp->il_inp[i] != inp) 440 continue; 441 442 if (grp->il_inpcnt == 1) { 443 /* We are the last, free this local group. */ 444 in_pcblbgroup_free(grp); 445 } else { 446 /* Pull up inpcbs, shrink group if possible. 
*/ 447 in_pcblbgroup_reorder(hdr, &grp, i); 448 } 449 return; 450 } 451 } 452 } 453 454 int 455 in_pcblbgroup_numa(struct inpcb *inp, int arg) 456 { 457 struct inpcbinfo *pcbinfo; 458 struct inpcblbgrouphead *hdr; 459 struct inpcblbgroup *grp; 460 int err, i; 461 uint8_t numa_domain; 462 463 switch (arg) { 464 case TCP_REUSPORT_LB_NUMA_NODOM: 465 numa_domain = M_NODOM; 466 break; 467 case TCP_REUSPORT_LB_NUMA_CURDOM: 468 numa_domain = PCPU_GET(domain); 469 break; 470 default: 471 if (arg < 0 || arg >= vm_ndomains) 472 return (EINVAL); 473 numa_domain = arg; 474 } 475 476 err = 0; 477 pcbinfo = inp->inp_pcbinfo; 478 INP_WLOCK_ASSERT(inp); 479 INP_HASH_WLOCK(pcbinfo); 480 hdr = &pcbinfo->ipi_lbgrouphashbase[ 481 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 482 CK_LIST_FOREACH(grp, hdr, il_list) { 483 for (i = 0; i < grp->il_inpcnt; ++i) { 484 if (grp->il_inp[i] != inp) 485 continue; 486 487 if (grp->il_numa_domain == numa_domain) { 488 goto abort_with_hash_wlock; 489 } 490 491 /* Remove it from the old group. */ 492 in_pcbremlbgrouphash(inp); 493 494 /* Add it to the new group based on numa domain. */ 495 in_pcbinslbgrouphash(inp, numa_domain); 496 goto abort_with_hash_wlock; 497 } 498 } 499 err = ENOENT; 500 abort_with_hash_wlock: 501 INP_HASH_WUNLOCK(pcbinfo); 502 return (err); 503 } 504 505 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 506 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 507 508 /* 509 * Initialize an inpcbinfo -- we should be able to reduce the number of 510 * arguments in time. 
511 */ 512 static void inpcb_dtor(void *, int, void *); 513 static void inpcb_fini(void *, int); 514 void 515 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, 516 u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, 517 uma_init inpcbzone_init) 518 { 519 520 mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); 521 mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); 522 #ifdef VIMAGE 523 pcbinfo->ipi_vnet = curvnet; 524 #endif 525 CK_LIST_INIT(&pcbinfo->ipi_listhead); 526 pcbinfo->ipi_count = 0; 527 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 528 &pcbinfo->ipi_hashmask); 529 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 530 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 531 &pcbinfo->ipi_porthashmask); 532 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 533 &pcbinfo->ipi_lbgrouphashmask); 534 pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), 535 NULL, inpcb_dtor, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 536 UMA_ZONE_SMR); 537 uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); 538 uma_zone_set_warning(pcbinfo->ipi_zone, 539 "kern.ipc.maxsockets limit reached"); 540 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 541 pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, 542 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 543 uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); 544 } 545 546 /* 547 * Destroy an inpcbinfo. 
548 */ 549 void 550 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 551 { 552 553 KASSERT(pcbinfo->ipi_count == 0, 554 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 555 556 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 557 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 558 pcbinfo->ipi_porthashmask); 559 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 560 pcbinfo->ipi_lbgrouphashmask); 561 uma_zdestroy(pcbinfo->ipi_zone); 562 mtx_destroy(&pcbinfo->ipi_hash_lock); 563 mtx_destroy(&pcbinfo->ipi_lock); 564 } 565 566 /* 567 * Allocate a PCB and associate it with the socket. 568 * On success return with the PCB locked. 569 */ 570 int 571 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 572 { 573 struct inpcb *inp; 574 int error; 575 576 error = 0; 577 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 578 if (inp == NULL) 579 return (ENOBUFS); 580 bzero(&inp->inp_start_zero, inp_zero_size); 581 #ifdef NUMA 582 inp->inp_numa_domain = M_NODOM; 583 #endif 584 inp->inp_pcbinfo = pcbinfo; 585 inp->inp_socket = so; 586 inp->inp_cred = crhold(so->so_cred); 587 inp->inp_inc.inc_fibnum = so->so_fibnum; 588 #ifdef MAC 589 error = mac_inpcb_init(inp, M_NOWAIT); 590 if (error != 0) 591 goto out; 592 mac_inpcb_create(so, inp); 593 #endif 594 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 595 error = ipsec_init_pcbpolicy(inp); 596 if (error != 0) { 597 #ifdef MAC 598 mac_inpcb_destroy(inp); 599 #endif 600 goto out; 601 } 602 #endif /*IPSEC*/ 603 #ifdef INET6 604 if (INP_SOCKAF(so) == AF_INET6) { 605 inp->inp_vflag |= INP_IPV6PROTO; 606 if (V_ip6_v6only) 607 inp->inp_flags |= IN6P_IPV6_V6ONLY; 608 } 609 if (V_ip6_auto_flowlabel) 610 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 611 #endif 612 /* 613 * Routes in inpcb's can cache L2 as well; they are guaranteed 614 * to be cleaned up. 
615 */ 616 inp->inp_route.ro_flags = RT_LLE_CACHE; 617 #ifdef TCPHPTS 618 /* 619 * If using hpts lets drop a random number in so 620 * not all new connections fall on the same CPU. 621 */ 622 inp->inp_hpts_cpu = inp->inp_dropq_cpu = hpts_random_cpu(inp); 623 #endif 624 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ 625 INP_WLOCK(inp); 626 INP_INFO_WLOCK(pcbinfo); 627 pcbinfo->ipi_count++; 628 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 629 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 630 INP_INFO_WUNLOCK(pcbinfo); 631 so->so_pcb = inp; 632 633 return (0); 634 635 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 636 out: 637 uma_zfree_smr(pcbinfo->ipi_zone, inp); 638 return (error); 639 #endif 640 } 641 642 #ifdef INET 643 int 644 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 645 { 646 int anonport, error; 647 648 KASSERT(nam == NULL || nam->sa_family == AF_INET, 649 ("%s: invalid address family for %p", __func__, nam)); 650 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 651 ("%s: invalid address length for %p", __func__, nam)); 652 INP_WLOCK_ASSERT(inp); 653 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 654 655 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 656 return (EINVAL); 657 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 658 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 659 &inp->inp_lport, cred); 660 if (error) 661 return (error); 662 if (in_pcbinshash(inp) != 0) { 663 inp->inp_laddr.s_addr = INADDR_ANY; 664 inp->inp_lport = 0; 665 return (EAGAIN); 666 } 667 if (anonport) 668 inp->inp_flags |= INP_ANONPORT; 669 return (0); 670 } 671 #endif 672 673 #if defined(INET) || defined(INET6) 674 /* 675 * Assign a local port like in_pcb_lport(), but also used with connect() 676 * and a foreign address and port. If fsa is non-NULL, choose a local port 677 * that is unused with those, otherwise one that is completely unused. 
678 * lsa can be NULL for IPv6. 679 */ 680 int 681 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 682 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 683 { 684 struct inpcbinfo *pcbinfo; 685 struct inpcb *tmpinp; 686 unsigned short *lastport; 687 int count, dorandom, error; 688 u_short aux, first, last, lport; 689 #ifdef INET 690 struct in_addr laddr, faddr; 691 #endif 692 #ifdef INET6 693 struct in6_addr *laddr6, *faddr6; 694 #endif 695 696 pcbinfo = inp->inp_pcbinfo; 697 698 /* 699 * Because no actual state changes occur here, a global write lock on 700 * the pcbinfo isn't required. 701 */ 702 INP_LOCK_ASSERT(inp); 703 INP_HASH_LOCK_ASSERT(pcbinfo); 704 705 if (inp->inp_flags & INP_HIGHPORT) { 706 first = V_ipport_hifirstauto; /* sysctl */ 707 last = V_ipport_hilastauto; 708 lastport = &pcbinfo->ipi_lasthi; 709 } else if (inp->inp_flags & INP_LOWPORT) { 710 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 711 if (error) 712 return (error); 713 first = V_ipport_lowfirstauto; /* 1023 */ 714 last = V_ipport_lowlastauto; /* 600 */ 715 lastport = &pcbinfo->ipi_lastlow; 716 } else { 717 first = V_ipport_firstauto; /* sysctl */ 718 last = V_ipport_lastauto; 719 lastport = &pcbinfo->ipi_lastport; 720 } 721 /* 722 * For UDP(-Lite), use random port allocation as long as the user 723 * allows it. For TCP (and as of yet unknown) connections, 724 * use random port allocation only if the user allows it AND 725 * ipport_tick() allows it. 726 */ 727 if (V_ipport_randomized && 728 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || 729 pcbinfo == &V_ulitecbinfo)) 730 dorandom = 1; 731 else 732 dorandom = 0; 733 /* 734 * It makes no sense to do random port allocation if 735 * we have the only port available. 736 */ 737 if (first == last) 738 dorandom = 0; 739 /* Make sure to not include UDP(-Lite) packets in the count. 
*/ 740 if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo) 741 V_ipport_tcpallocs++; 742 /* 743 * Instead of having two loops further down counting up or down 744 * make sure that first is always <= last and go with only one 745 * code path implementing all logic. 746 */ 747 if (first > last) { 748 aux = first; 749 first = last; 750 last = aux; 751 } 752 753 #ifdef INET 754 laddr.s_addr = INADDR_ANY; 755 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 756 if (lsa != NULL) 757 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 758 if (fsa != NULL) 759 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 760 } 761 #endif 762 #ifdef INET6 763 laddr6 = NULL; 764 if ((inp->inp_vflag & INP_IPV6) != 0) { 765 if (lsa != NULL) 766 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 767 if (fsa != NULL) 768 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 769 } 770 #endif 771 772 tmpinp = NULL; 773 lport = *lportp; 774 775 if (dorandom) 776 *lastport = first + (arc4random() % (last - first)); 777 778 count = last - first; 779 780 do { 781 if (count-- < 0) /* completely used? 
*/ 782 return (EADDRNOTAVAIL); 783 ++*lastport; 784 if (*lastport < first || *lastport > last) 785 *lastport = first; 786 lport = htons(*lastport); 787 788 if (fsa != NULL) { 789 #ifdef INET 790 if (lsa->sa_family == AF_INET) { 791 tmpinp = in_pcblookup_hash_locked(pcbinfo, 792 faddr, fport, laddr, lport, lookupflags, 793 NULL, M_NODOM); 794 } 795 #endif 796 #ifdef INET6 797 if (lsa->sa_family == AF_INET6) { 798 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 799 faddr6, fport, laddr6, lport, lookupflags, 800 NULL, M_NODOM); 801 } 802 #endif 803 } else { 804 #ifdef INET6 805 if ((inp->inp_vflag & INP_IPV6) != 0) 806 tmpinp = in6_pcblookup_local(pcbinfo, 807 &inp->in6p_laddr, lport, lookupflags, cred); 808 #endif 809 #if defined(INET) && defined(INET6) 810 else 811 #endif 812 #ifdef INET 813 tmpinp = in_pcblookup_local(pcbinfo, laddr, 814 lport, lookupflags, cred); 815 #endif 816 } 817 } while (tmpinp != NULL); 818 819 *lportp = lport; 820 821 return (0); 822 } 823 824 /* 825 * Select a local port (number) to use. 826 */ 827 int 828 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 829 struct ucred *cred, int lookupflags) 830 { 831 struct sockaddr_in laddr; 832 833 if (laddrp) { 834 bzero(&laddr, sizeof(laddr)); 835 laddr.sin_family = AF_INET; 836 laddr.sin_addr = *laddrp; 837 } 838 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 839 NULL, lportp, NULL, 0, cred, lookupflags)); 840 } 841 842 /* 843 * Return cached socket options. 844 */ 845 int 846 inp_so_options(const struct inpcb *inp) 847 { 848 int so_options; 849 850 so_options = 0; 851 852 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 853 so_options |= SO_REUSEPORT_LB; 854 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 855 so_options |= SO_REUSEPORT; 856 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 857 so_options |= SO_REUSEADDR; 858 return (so_options); 859 } 860 #endif /* INET || INET6 */ 861 862 /* 863 * Check if a new BINDMULTI socket is allowed to be created. 
864 * 865 * ni points to the new inp. 866 * oi points to the exisitng inp. 867 * 868 * This checks whether the existing inp also has BINDMULTI and 869 * whether the credentials match. 870 */ 871 int 872 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) 873 { 874 /* Check permissions match */ 875 if ((ni->inp_flags2 & INP_BINDMULTI) && 876 (ni->inp_cred->cr_uid != 877 oi->inp_cred->cr_uid)) 878 return (0); 879 880 /* Check the existing inp has BINDMULTI set */ 881 if ((ni->inp_flags2 & INP_BINDMULTI) && 882 ((oi->inp_flags2 & INP_BINDMULTI) == 0)) 883 return (0); 884 885 /* 886 * We're okay - either INP_BINDMULTI isn't set on ni, or 887 * it is and it matches the checks. 888 */ 889 return (1); 890 } 891 892 #ifdef INET 893 /* 894 * Set up a bind operation on a PCB, performing port allocation 895 * as required, but do not actually modify the PCB. Callers can 896 * either complete the bind by setting inp_laddr/inp_lport and 897 * calling in_pcbinshash(), or they can just use the resulting 898 * port and address to authorise the sending of a once-off packet. 899 * 900 * On error, the values of *laddrp and *lportp are not changed. 901 */ 902 int 903 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 904 u_short *lportp, struct ucred *cred) 905 { 906 struct socket *so = inp->inp_socket; 907 struct sockaddr_in *sin; 908 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 909 struct in_addr laddr; 910 u_short lport = 0; 911 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); 912 int error; 913 914 /* 915 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here 916 * so that we don't have to add to the (already messy) code below. 917 */ 918 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); 919 920 /* 921 * No state changes, so read locks are sufficient here. 
922 */ 923 INP_LOCK_ASSERT(inp); 924 INP_HASH_LOCK_ASSERT(pcbinfo); 925 926 laddr.s_addr = *laddrp; 927 if (nam != NULL && laddr.s_addr != INADDR_ANY) 928 return (EINVAL); 929 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) 930 lookupflags = INPLOOKUP_WILDCARD; 931 if (nam == NULL) { 932 if ((error = prison_local_ip4(cred, &laddr)) != 0) 933 return (error); 934 } else { 935 sin = (struct sockaddr_in *)nam; 936 KASSERT(sin->sin_family == AF_INET, 937 ("%s: invalid family for address %p", __func__, sin)); 938 KASSERT(sin->sin_len == sizeof(*sin), 939 ("%s: invalid length for address %p", __func__, sin)); 940 941 error = prison_local_ip4(cred, &sin->sin_addr); 942 if (error) 943 return (error); 944 if (sin->sin_port != *lportp) { 945 /* Don't allow the port to change. */ 946 if (*lportp != 0) 947 return (EINVAL); 948 lport = sin->sin_port; 949 } 950 /* NB: lport is left as 0 if the port isn't being changed. */ 951 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 952 /* 953 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 954 * allow complete duplication of binding if 955 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 956 * and a multicast address is bound on both 957 * new and duplicated sockets. 958 */ 959 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) 960 reuseport = SO_REUSEADDR|SO_REUSEPORT; 961 /* 962 * XXX: How to deal with SO_REUSEPORT_LB here? 963 * Treat same as SO_REUSEPORT for now. 964 */ 965 if ((so->so_options & 966 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) 967 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; 968 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 969 sin->sin_port = 0; /* yech... */ 970 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 971 /* 972 * Is the address a local IP address? 973 * If INP_BINDANY is set, then the socket may be bound 974 * to any endpoint address, local or not. 
975 */ 976 if ((inp->inp_flags & INP_BINDANY) == 0 && 977 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 978 return (EADDRNOTAVAIL); 979 } 980 laddr = sin->sin_addr; 981 if (lport) { 982 struct inpcb *t; 983 struct tcptw *tw; 984 985 /* GROSS */ 986 if (ntohs(lport) <= V_ipport_reservedhigh && 987 ntohs(lport) >= V_ipport_reservedlow && 988 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 989 return (EACCES); 990 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 991 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 992 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 993 lport, INPLOOKUP_WILDCARD, cred); 994 /* 995 * XXX 996 * This entire block sorely needs a rewrite. 997 */ 998 if (t && 999 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1000 ((t->inp_flags & INP_TIMEWAIT) == 0) && 1001 (so->so_type != SOCK_STREAM || 1002 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 1003 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 1004 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 1005 (t->inp_flags2 & INP_REUSEPORT) || 1006 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && 1007 (inp->inp_cred->cr_uid != 1008 t->inp_cred->cr_uid)) 1009 return (EADDRINUSE); 1010 1011 /* 1012 * If the socket is a BINDMULTI socket, then 1013 * the credentials need to match and the 1014 * original socket also has to have been bound 1015 * with BINDMULTI. 1016 */ 1017 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1018 return (EADDRINUSE); 1019 } 1020 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1021 lport, lookupflags, cred); 1022 if (t && (t->inp_flags & INP_TIMEWAIT)) { 1023 /* 1024 * XXXRW: If an incpb has had its timewait 1025 * state recycled, we treat the address as 1026 * being in use (for now). This is better 1027 * than a panic, but not desirable. 
1028 */ 1029 tw = intotw(t); 1030 if (tw == NULL || 1031 ((reuseport & tw->tw_so_options) == 0 && 1032 (reuseport_lb & 1033 tw->tw_so_options) == 0)) { 1034 return (EADDRINUSE); 1035 } 1036 } else if (t && 1037 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1038 (reuseport & inp_so_options(t)) == 0 && 1039 (reuseport_lb & inp_so_options(t)) == 0) { 1040 #ifdef INET6 1041 if (ntohl(sin->sin_addr.s_addr) != 1042 INADDR_ANY || 1043 ntohl(t->inp_laddr.s_addr) != 1044 INADDR_ANY || 1045 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 1046 (t->inp_vflag & INP_IPV6PROTO) == 0) 1047 #endif 1048 return (EADDRINUSE); 1049 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1050 return (EADDRINUSE); 1051 } 1052 } 1053 } 1054 if (*lportp != 0) 1055 lport = *lportp; 1056 if (lport == 0) { 1057 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1058 if (error != 0) 1059 return (error); 1060 } 1061 *laddrp = laddr.s_addr; 1062 *lportp = lport; 1063 return (0); 1064 } 1065 1066 /* 1067 * Connect from a socket to a specified address. 1068 * Both address and port must be specified in argument sin. 1069 * If don't have a local address for this socket yet, 1070 * then pick one. 1071 */ 1072 int 1073 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, 1074 bool rehash) 1075 { 1076 u_short lport, fport; 1077 in_addr_t laddr, faddr; 1078 int anonport, error; 1079 1080 INP_WLOCK_ASSERT(inp); 1081 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1082 1083 lport = inp->inp_lport; 1084 laddr = inp->inp_laddr.s_addr; 1085 anonport = (lport == 0); 1086 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 1087 NULL, cred); 1088 if (error) 1089 return (error); 1090 1091 /* Do the initial binding of the local address if required. 
*/ 1092 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1093 KASSERT(rehash == true, 1094 ("Rehashing required for unbound inps")); 1095 inp->inp_lport = lport; 1096 inp->inp_laddr.s_addr = laddr; 1097 if (in_pcbinshash(inp) != 0) { 1098 inp->inp_laddr.s_addr = INADDR_ANY; 1099 inp->inp_lport = 0; 1100 return (EAGAIN); 1101 } 1102 } 1103 1104 /* Commit the remaining changes. */ 1105 inp->inp_lport = lport; 1106 inp->inp_laddr.s_addr = laddr; 1107 inp->inp_faddr.s_addr = faddr; 1108 inp->inp_fport = fport; 1109 if (rehash) { 1110 in_pcbrehash(inp); 1111 } else { 1112 in_pcbinshash(inp); 1113 } 1114 1115 if (anonport) 1116 inp->inp_flags |= INP_ANONPORT; 1117 return (0); 1118 } 1119 1120 /* 1121 * Do proper source address selection on an unbound socket in case 1122 * of connect. Take jails into account as well. 1123 */ 1124 int 1125 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1126 struct ucred *cred) 1127 { 1128 struct ifaddr *ifa; 1129 struct sockaddr *sa; 1130 struct sockaddr_in *sin, dst; 1131 struct nhop_object *nh; 1132 int error; 1133 1134 NET_EPOCH_ASSERT(); 1135 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1136 /* 1137 * Bypass source address selection and use the primary jail IP 1138 * if requested. 1139 */ 1140 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) 1141 return (0); 1142 1143 error = 0; 1144 1145 nh = NULL; 1146 bzero(&dst, sizeof(dst)); 1147 sin = &dst; 1148 sin->sin_family = AF_INET; 1149 sin->sin_len = sizeof(struct sockaddr_in); 1150 sin->sin_addr.s_addr = faddr->s_addr; 1151 1152 /* 1153 * If route is known our src addr is taken from the i/f, 1154 * else punt. 1155 * 1156 * Find out route to destination. 1157 */ 1158 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1159 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1160 0, NHR_NONE, 0); 1161 1162 /* 1163 * If we found a route, use the address corresponding to 1164 * the outgoing interface. 
1165 * 1166 * Otherwise assume faddr is reachable on a directly connected 1167 * network and try to find a corresponding interface to take 1168 * the source address from. 1169 */ 1170 if (nh == NULL || nh->nh_ifp == NULL) { 1171 struct in_ifaddr *ia; 1172 struct ifnet *ifp; 1173 1174 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1175 inp->inp_socket->so_fibnum)); 1176 if (ia == NULL) { 1177 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1178 inp->inp_socket->so_fibnum)); 1179 } 1180 if (ia == NULL) { 1181 error = ENETUNREACH; 1182 goto done; 1183 } 1184 1185 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1186 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1187 goto done; 1188 } 1189 1190 ifp = ia->ia_ifp; 1191 ia = NULL; 1192 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1193 sa = ifa->ifa_addr; 1194 if (sa->sa_family != AF_INET) 1195 continue; 1196 sin = (struct sockaddr_in *)sa; 1197 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1198 ia = (struct in_ifaddr *)ifa; 1199 break; 1200 } 1201 } 1202 if (ia != NULL) { 1203 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1204 goto done; 1205 } 1206 1207 /* 3. As a last resort return the 'default' jail address. */ 1208 error = prison_get_ip4(cred, laddr); 1209 goto done; 1210 } 1211 1212 /* 1213 * If the outgoing interface on the route found is not 1214 * a loopback interface, use the address from that interface. 1215 * In case of jails do those three steps: 1216 * 1. check if the interface address belongs to the jail. If so use it. 1217 * 2. check if we have any address on the outgoing interface 1218 * belonging to this jail. If so use it. 1219 * 3. as a last resort return the 'default' jail address. 1220 */ 1221 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1222 struct in_ifaddr *ia; 1223 struct ifnet *ifp; 1224 1225 /* If not jailed, use the default returned. 
*/ 1226 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1227 ia = (struct in_ifaddr *)nh->nh_ifa; 1228 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1229 goto done; 1230 } 1231 1232 /* Jailed. */ 1233 /* 1. Check if the iface address belongs to the jail. */ 1234 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1235 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1236 ia = (struct in_ifaddr *)nh->nh_ifa; 1237 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1238 goto done; 1239 } 1240 1241 /* 1242 * 2. Check if we have any address on the outgoing interface 1243 * belonging to this jail. 1244 */ 1245 ia = NULL; 1246 ifp = nh->nh_ifp; 1247 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1248 sa = ifa->ifa_addr; 1249 if (sa->sa_family != AF_INET) 1250 continue; 1251 sin = (struct sockaddr_in *)sa; 1252 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1253 ia = (struct in_ifaddr *)ifa; 1254 break; 1255 } 1256 } 1257 if (ia != NULL) { 1258 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1259 goto done; 1260 } 1261 1262 /* 3. As a last resort return the 'default' jail address. */ 1263 error = prison_get_ip4(cred, laddr); 1264 goto done; 1265 } 1266 1267 /* 1268 * The outgoing interface is marked with 'loopback net', so a route 1269 * to ourselves is here. 1270 * Try to find the interface of the destination address and then 1271 * take the address from there. That interface is not necessarily 1272 * a loopback interface. 1273 * In case of jails, check that it is an address of the jail 1274 * and if we cannot find, fall back to the 'default' jail address. 
1275 */ 1276 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1277 struct in_ifaddr *ia; 1278 1279 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1280 inp->inp_socket->so_fibnum)); 1281 if (ia == NULL) 1282 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1283 inp->inp_socket->so_fibnum)); 1284 if (ia == NULL) 1285 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1286 1287 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1288 if (ia == NULL) { 1289 error = ENETUNREACH; 1290 goto done; 1291 } 1292 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1293 goto done; 1294 } 1295 1296 /* Jailed. */ 1297 if (ia != NULL) { 1298 struct ifnet *ifp; 1299 1300 ifp = ia->ia_ifp; 1301 ia = NULL; 1302 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1303 sa = ifa->ifa_addr; 1304 if (sa->sa_family != AF_INET) 1305 continue; 1306 sin = (struct sockaddr_in *)sa; 1307 if (prison_check_ip4(cred, 1308 &sin->sin_addr) == 0) { 1309 ia = (struct in_ifaddr *)ifa; 1310 break; 1311 } 1312 } 1313 if (ia != NULL) { 1314 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1315 goto done; 1316 } 1317 } 1318 1319 /* 3. As a last resort return the 'default' jail address. */ 1320 error = prison_get_ip4(cred, laddr); 1321 goto done; 1322 } 1323 1324 done: 1325 return (error); 1326 } 1327 1328 /* 1329 * Set up for a connect from a socket to the specified address. 1330 * On entry, *laddrp and *lportp should contain the current local 1331 * address and port for the PCB; these are updated to the values 1332 * that should be placed in inp_laddr and inp_lport to complete 1333 * the connect. 1334 * 1335 * On success, *faddrp and *fportp will be set to the remote address 1336 * and port. These are not updated in the error case. 1337 * 1338 * If the operation fails because the connection already exists, 1339 * *oinpp will be set to the PCB of that connection so that the 1340 * caller can decide to override it. In all other cases, *oinpp 1341 * is set to NULL. 
1342 */ 1343 int 1344 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, 1345 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, 1346 struct inpcb **oinpp, struct ucred *cred) 1347 { 1348 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1349 struct in_ifaddr *ia; 1350 struct inpcb *oinp; 1351 struct in_addr laddr, faddr; 1352 u_short lport, fport; 1353 int error; 1354 1355 KASSERT(sin->sin_family == AF_INET, 1356 ("%s: invalid address family for %p", __func__, sin)); 1357 KASSERT(sin->sin_len == sizeof(*sin), 1358 ("%s: invalid address length for %p", __func__, sin)); 1359 1360 /* 1361 * Because a global state change doesn't actually occur here, a read 1362 * lock is sufficient. 1363 */ 1364 NET_EPOCH_ASSERT(); 1365 INP_LOCK_ASSERT(inp); 1366 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1367 1368 if (oinpp != NULL) 1369 *oinpp = NULL; 1370 if (sin->sin_port == 0) 1371 return (EADDRNOTAVAIL); 1372 laddr.s_addr = *laddrp; 1373 lport = *lportp; 1374 faddr = sin->sin_addr; 1375 fport = sin->sin_port; 1376 #ifdef ROUTE_MPATH 1377 if (CALC_FLOWID_OUTBOUND) { 1378 uint32_t hash_val, hash_type; 1379 1380 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, 1381 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1382 1383 inp->inp_flowid = hash_val; 1384 inp->inp_flowtype = hash_type; 1385 } 1386 #endif 1387 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { 1388 /* 1389 * If the destination address is INADDR_ANY, 1390 * use the primary local address. 1391 * If the supplied address is INADDR_BROADCAST, 1392 * and the primary interface supports broadcast, 1393 * choose the broadcast address for that interface. 
1394 */ 1395 if (faddr.s_addr == INADDR_ANY) { 1396 faddr = 1397 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1398 if (cred != NULL && 1399 (error = prison_get_ip4(cred, &faddr)) != 0) 1400 return (error); 1401 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { 1402 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & 1403 IFF_BROADCAST) 1404 faddr = satosin(&CK_STAILQ_FIRST( 1405 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1406 } 1407 } 1408 if (laddr.s_addr == INADDR_ANY) { 1409 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1410 /* 1411 * If the destination address is multicast and an outgoing 1412 * interface has been set as a multicast option, prefer the 1413 * address of that interface as our source address. 1414 */ 1415 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 1416 inp->inp_moptions != NULL) { 1417 struct ip_moptions *imo; 1418 struct ifnet *ifp; 1419 1420 imo = inp->inp_moptions; 1421 if (imo->imo_multicast_ifp != NULL) { 1422 ifp = imo->imo_multicast_ifp; 1423 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1424 if ((ia->ia_ifp == ifp) && 1425 (cred == NULL || 1426 prison_check_ip4(cred, 1427 &ia->ia_addr.sin_addr) == 0)) 1428 break; 1429 } 1430 if (ia == NULL) 1431 error = EADDRNOTAVAIL; 1432 else { 1433 laddr = ia->ia_addr.sin_addr; 1434 error = 0; 1435 } 1436 } 1437 } 1438 if (error) 1439 return (error); 1440 } 1441 1442 if (lport != 0) { 1443 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1444 fport, laddr, lport, 0, NULL, M_NODOM); 1445 if (oinp != NULL) { 1446 if (oinpp != NULL) 1447 *oinpp = oinp; 1448 return (EADDRINUSE); 1449 } 1450 } else { 1451 struct sockaddr_in lsin, fsin; 1452 1453 bzero(&lsin, sizeof(lsin)); 1454 bzero(&fsin, sizeof(fsin)); 1455 lsin.sin_family = AF_INET; 1456 lsin.sin_addr = laddr; 1457 fsin.sin_family = AF_INET; 1458 fsin.sin_addr = faddr; 1459 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, 1460 &lport, (struct sockaddr *)& fsin, fport, cred, 1461 INPLOOKUP_WILDCARD); 1462 if 
(error) 1463 return (error); 1464 } 1465 *laddrp = laddr.s_addr; 1466 *lportp = lport; 1467 *faddrp = faddr.s_addr; 1468 *fportp = fport; 1469 return (0); 1470 } 1471 1472 void 1473 in_pcbdisconnect(struct inpcb *inp) 1474 { 1475 1476 INP_WLOCK_ASSERT(inp); 1477 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1478 1479 inp->inp_faddr.s_addr = INADDR_ANY; 1480 inp->inp_fport = 0; 1481 in_pcbrehash(inp); 1482 } 1483 #endif /* INET */ 1484 1485 /* 1486 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. 1487 * For most protocols, this will be invoked immediately prior to calling 1488 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the 1489 * socket, in which case in_pcbfree() is deferred. 1490 */ 1491 void 1492 in_pcbdetach(struct inpcb *inp) 1493 { 1494 1495 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1496 1497 #ifdef RATELIMIT 1498 if (inp->inp_snd_tag != NULL) 1499 in_pcbdetach_txrtlmt(inp); 1500 #endif 1501 inp->inp_socket->so_pcb = NULL; 1502 inp->inp_socket = NULL; 1503 } 1504 1505 /* 1506 * inpcb hash lookups are protected by SMR section. 1507 * 1508 * Once desired pcb has been found, switching from SMR section to a pcb 1509 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1510 * here because SMR is a critical section. 1511 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1512 */ 1513 static inline void 1514 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1515 { 1516 1517 lock == INPLOOKUP_RLOCKPCB ? 1518 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1519 } 1520 1521 static inline void 1522 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1523 { 1524 1525 lock == INPLOOKUP_RLOCKPCB ? 1526 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1527 } 1528 1529 static inline int 1530 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1531 { 1532 1533 return (lock == INPLOOKUP_RLOCKPCB ? 
1534 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1535 } 1536 1537 static inline bool 1538 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1539 { 1540 1541 return (lock == INPLOOKUP_RLOCKPCB ? 1542 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); 1543 } 1544 1545 bool 1546 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1547 { 1548 1549 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1550 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1551 1552 if (__predict_true(inp_trylock(inp, lock))) { 1553 if (__predict_false(inp->inp_flags & INP_FREED)) { 1554 smr_exit(inp->inp_pcbinfo->ipi_smr); 1555 inp_unlock(inp, lock); 1556 return (false); 1557 } 1558 smr_exit(inp->inp_pcbinfo->ipi_smr); 1559 return (true); 1560 } 1561 1562 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1563 smr_exit(inp->inp_pcbinfo->ipi_smr); 1564 inp_lock(inp, lock); 1565 if (__predict_false(in_pcbrele(inp, lock))) 1566 return (false); 1567 /* 1568 * inp acquired through refcount & lock for sure didn't went 1569 * through uma_zfree(). However, it may have already went 1570 * through in_pcbfree() and has another reference, that 1571 * prevented its release by our in_pcbrele(). 1572 */ 1573 if (__predict_false(inp->inp_flags & INP_FREED)) { 1574 inp_unlock(inp, lock); 1575 return (false); 1576 } 1577 return (true); 1578 } else { 1579 smr_exit(inp->inp_pcbinfo->ipi_smr); 1580 return (false); 1581 } 1582 } 1583 1584 /* 1585 * inp_next() - inpcb hash/list traversal iterator 1586 * 1587 * Requires initialized struct inpcb_iterator for context. 1588 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1589 * 1590 * - Iterator can have either write-lock or read-lock semantics, that can not 1591 * be changed later. 1592 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through 1593 * a single hash slot. Note: only rip_input() does the latter. 
1594 * - Iterator may have optional bool matching function. The matching function 1595 * will be executed for each inpcb in the SMR context, so it can not acquire 1596 * locks and can safely access only immutable fields of inpcb. 1597 * 1598 * A fresh initialized iterator has NULL inpcb in its context and that 1599 * means that inp_next() call would return the very first inpcb on the list 1600 * locked with desired semantic. In all following calls the context pointer 1601 * shall hold the current inpcb pointer. The KPI user is not supposed to 1602 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL 1603 * and write NULL to its context. After end of traversal an iterator can be 1604 * reused. 1605 * 1606 * List traversals have the following features/constraints: 1607 * - New entries won't be seen, as they are always added to the head of a list. 1608 * - Removed entries won't stop traversal as long as they are not added to 1609 * a different list. This is violated by in_pcbrehash(). 1610 */ 1611 #define II_LIST_FIRST(ipi, hash) \ 1612 (((hash) == INP_ALL_LIST) ? \ 1613 CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ 1614 CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) 1615 #define II_LIST_NEXT(inp, hash) \ 1616 (((hash) == INP_ALL_LIST) ? \ 1617 CK_LIST_NEXT((inp), inp_list) : \ 1618 CK_LIST_NEXT((inp), inp_hash)) 1619 #define II_LOCK_ASSERT(inp, lock) \ 1620 rw_assert(&(inp)->inp_lock, \ 1621 (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) 1622 struct inpcb * 1623 inp_next(struct inpcb_iterator *ii) 1624 { 1625 const struct inpcbinfo *ipi = ii->ipi; 1626 inp_match_t *match = ii->match; 1627 void *ctx = ii->ctx; 1628 inp_lookup_t lock = ii->lock; 1629 int hash = ii->hash; 1630 struct inpcb *inp; 1631 1632 if (ii->inp == NULL) { /* First call. */ 1633 smr_enter(ipi->ipi_smr); 1634 /* This is unrolled CK_LIST_FOREACH(). 
*/ 1635 for (inp = II_LIST_FIRST(ipi, hash); 1636 inp != NULL; 1637 inp = II_LIST_NEXT(inp, hash)) { 1638 if (match != NULL && (match)(inp, ctx) == false) 1639 continue; 1640 if (__predict_true(inp_smr_lock(inp, lock))) 1641 break; 1642 else { 1643 smr_enter(ipi->ipi_smr); 1644 MPASS(inp != II_LIST_FIRST(ipi, hash)); 1645 inp = II_LIST_FIRST(ipi, hash); 1646 } 1647 } 1648 1649 if (inp == NULL) 1650 smr_exit(ipi->ipi_smr); 1651 else 1652 ii->inp = inp; 1653 1654 return (inp); 1655 } 1656 1657 /* Not a first call. */ 1658 smr_enter(ipi->ipi_smr); 1659 restart: 1660 inp = ii->inp; 1661 II_LOCK_ASSERT(inp, lock); 1662 next: 1663 inp = II_LIST_NEXT(inp, hash); 1664 if (inp == NULL) { 1665 smr_exit(ipi->ipi_smr); 1666 goto found; 1667 } 1668 1669 if (match != NULL && (match)(inp, ctx) == false) 1670 goto next; 1671 1672 if (__predict_true(inp_trylock(inp, lock))) { 1673 if (__predict_false(inp->inp_flags & INP_FREED)) { 1674 /* 1675 * Entries are never inserted in middle of a list, thus 1676 * as long as we are in SMR, we can continue traversal. 1677 * Jump to 'restart' should yield in the same result, 1678 * but could produce unnecessary looping. Could this 1679 * looping be unbound? 1680 */ 1681 inp_unlock(inp, lock); 1682 goto next; 1683 } else { 1684 smr_exit(ipi->ipi_smr); 1685 goto found; 1686 } 1687 } 1688 1689 /* 1690 * Can't obtain lock immediately, thus going hard. Once we exit the 1691 * SMR section we can no longer jump to 'next', and our only stable 1692 * anchoring point is ii->inp, which we keep locked for this case, so 1693 * we jump to 'restart'. 1694 */ 1695 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1696 smr_exit(ipi->ipi_smr); 1697 inp_lock(inp, lock); 1698 if (__predict_false(in_pcbrele(inp, lock))) { 1699 smr_enter(ipi->ipi_smr); 1700 goto restart; 1701 } 1702 /* 1703 * See comment in inp_smr_lock(). 
1704 */ 1705 if (__predict_false(inp->inp_flags & INP_FREED)) { 1706 inp_unlock(inp, lock); 1707 smr_enter(ipi->ipi_smr); 1708 goto restart; 1709 } 1710 } else 1711 goto next; 1712 1713 found: 1714 inp_unlock(ii->inp, lock); 1715 ii->inp = inp; 1716 1717 return (ii->inp); 1718 } 1719 1720 /* 1721 * in_pcbref() bumps the reference count on an inpcb in order to maintain 1722 * stability of an inpcb pointer despite the inpcb lock being released or 1723 * SMR section exited. 1724 * 1725 * To free a reference later in_pcbrele_(r|w)locked() must be performed. 1726 */ 1727 void 1728 in_pcbref(struct inpcb *inp) 1729 { 1730 u_int old __diagused; 1731 1732 old = refcount_acquire(&inp->inp_refcount); 1733 KASSERT(old > 0, ("%s: refcount 0", __func__)); 1734 } 1735 1736 /* 1737 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially 1738 * freeing the pcb, if the reference was very last. 1739 */ 1740 bool 1741 in_pcbrele_rlocked(struct inpcb *inp) 1742 { 1743 1744 INP_RLOCK_ASSERT(inp); 1745 1746 if (refcount_release(&inp->inp_refcount) == 0) 1747 return (false); 1748 1749 MPASS(inp->inp_flags & INP_FREED); 1750 MPASS(inp->inp_socket == NULL); 1751 MPASS(inp->inp_in_hpts == 0); 1752 MPASS(inp->inp_in_dropq == 0); 1753 INP_RUNLOCK(inp); 1754 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1755 return (true); 1756 } 1757 1758 bool 1759 in_pcbrele_wlocked(struct inpcb *inp) 1760 { 1761 1762 INP_WLOCK_ASSERT(inp); 1763 1764 if (refcount_release(&inp->inp_refcount) == 0) 1765 return (false); 1766 1767 MPASS(inp->inp_flags & INP_FREED); 1768 MPASS(inp->inp_socket == NULL); 1769 MPASS(inp->inp_in_hpts == 0); 1770 MPASS(inp->inp_in_dropq == 0); 1771 INP_WUNLOCK(inp); 1772 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1773 return (true); 1774 } 1775 1776 /* 1777 * Unconditionally schedule an inpcb to be freed by decrementing its 1778 * reference count, which should occur only after the inpcb has been detached 1779 * from its socket. 
If another thread holds a temporary reference (acquired 1780 * using in_pcbref()) then the free is deferred until that reference is 1781 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. 1782 * Almost all work, including removal from global lists, is done in this 1783 * context, where the pcbinfo lock is held. 1784 */ 1785 void 1786 in_pcbfree(struct inpcb *inp) 1787 { 1788 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1789 #ifdef INET 1790 struct ip_moptions *imo; 1791 #endif 1792 #ifdef INET6 1793 struct ip6_moptions *im6o; 1794 #endif 1795 1796 INP_WLOCK_ASSERT(inp); 1797 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); 1798 KASSERT((inp->inp_flags & INP_FREED) == 0, 1799 ("%s: called twice for pcb %p", __func__, inp)); 1800 1801 inp->inp_flags |= INP_FREED; 1802 INP_INFO_WLOCK(pcbinfo); 1803 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 1804 pcbinfo->ipi_count--; 1805 CK_LIST_REMOVE(inp, inp_list); 1806 INP_INFO_WUNLOCK(pcbinfo); 1807 1808 if (inp->inp_flags & INP_INHASHLIST) { 1809 struct inpcbport *phd = inp->inp_phd; 1810 1811 INP_HASH_WLOCK(pcbinfo); 1812 /* XXX: Only do if SO_REUSEPORT_LB set? 
*/ 1813 in_pcbremlbgrouphash(inp); 1814 1815 CK_LIST_REMOVE(inp, inp_hash); 1816 CK_LIST_REMOVE(inp, inp_portlist); 1817 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 1818 CK_LIST_REMOVE(phd, phd_hash); 1819 uma_zfree_smr(pcbinfo->ipi_portzone, phd); 1820 } 1821 INP_HASH_WUNLOCK(pcbinfo); 1822 inp->inp_flags &= ~INP_INHASHLIST; 1823 } 1824 1825 RO_INVALIDATE_CACHE(&inp->inp_route); 1826 #ifdef MAC 1827 mac_inpcb_destroy(inp); 1828 #endif 1829 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 1830 if (inp->inp_sp != NULL) 1831 ipsec_delete_pcbpolicy(inp); 1832 #endif 1833 #ifdef INET 1834 if (inp->inp_options) 1835 (void)m_free(inp->inp_options); 1836 imo = inp->inp_moptions; 1837 #endif 1838 #ifdef INET6 1839 if (inp->inp_vflag & INP_IPV6PROTO) { 1840 ip6_freepcbopts(inp->in6p_outputopts); 1841 im6o = inp->in6p_moptions; 1842 } else 1843 im6o = NULL; 1844 #endif 1845 1846 if (__predict_false(in_pcbrele_wlocked(inp) == false)) { 1847 INP_WUNLOCK(inp); 1848 } 1849 #ifdef INET6 1850 ip6_freemoptions(im6o); 1851 #endif 1852 #ifdef INET 1853 inp_freemoptions(imo); 1854 #endif 1855 /* Destruction is finalized in inpcb_dtor(). */ 1856 } 1857 1858 static void 1859 inpcb_dtor(void *mem, int size, void *arg) 1860 { 1861 struct inpcb *inp = mem; 1862 1863 crfree(inp->inp_cred); 1864 #ifdef INVARIANTS 1865 inp->inp_cred = NULL; 1866 #endif 1867 } 1868 1869 /* 1870 * Different protocols initialize their inpcbs differently - giving 1871 * different name to the lock. But they all are disposed the same. 1872 */ 1873 static void 1874 inpcb_fini(void *mem, int size) 1875 { 1876 struct inpcb *inp = mem; 1877 1878 INP_LOCK_DESTROY(inp); 1879 } 1880 1881 /* 1882 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 1883 * port reservation, and preventing it from being returned by inpcb lookups. 1884 * 1885 * It is used by TCP to mark an inpcb as unused and avoid future packet 1886 * delivery or event notification when a socket remains open but TCP has 1887 * closed. 
This might occur as a result of a shutdown()-initiated TCP close 1888 * or a RST on the wire, and allows the port binding to be reused while still 1889 * maintaining the invariant that so_pcb always points to a valid inpcb until 1890 * in_pcbdetach(). 1891 * 1892 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1893 * in_pcbnotifyall() and in_pcbpurgeif0()? 1894 */ 1895 void 1896 in_pcbdrop(struct inpcb *inp) 1897 { 1898 1899 INP_WLOCK_ASSERT(inp); 1900 #ifdef INVARIANTS 1901 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) 1902 MPASS(inp->inp_refcount > 1); 1903 #endif 1904 1905 /* 1906 * XXXRW: Possibly we should protect the setting of INP_DROPPED with 1907 * the hash lock...? 1908 */ 1909 inp->inp_flags |= INP_DROPPED; 1910 if (inp->inp_flags & INP_INHASHLIST) { 1911 struct inpcbport *phd = inp->inp_phd; 1912 1913 INP_HASH_WLOCK(inp->inp_pcbinfo); 1914 in_pcbremlbgrouphash(inp); 1915 CK_LIST_REMOVE(inp, inp_hash); 1916 CK_LIST_REMOVE(inp, inp_portlist); 1917 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 1918 CK_LIST_REMOVE(phd, phd_hash); 1919 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 1920 } 1921 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1922 inp->inp_flags &= ~INP_INHASHLIST; 1923 } 1924 } 1925 1926 #ifdef INET 1927 /* 1928 * Common routines to return the socket addresses associated with inpcbs. 
1929 */ 1930 struct sockaddr * 1931 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1932 { 1933 struct sockaddr_in *sin; 1934 1935 sin = malloc(sizeof *sin, M_SONAME, 1936 M_WAITOK | M_ZERO); 1937 sin->sin_family = AF_INET; 1938 sin->sin_len = sizeof(*sin); 1939 sin->sin_addr = *addr_p; 1940 sin->sin_port = port; 1941 1942 return (struct sockaddr *)sin; 1943 } 1944 1945 int 1946 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1947 { 1948 struct inpcb *inp; 1949 struct in_addr addr; 1950 in_port_t port; 1951 1952 inp = sotoinpcb(so); 1953 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1954 1955 INP_RLOCK(inp); 1956 port = inp->inp_lport; 1957 addr = inp->inp_laddr; 1958 INP_RUNLOCK(inp); 1959 1960 *nam = in_sockaddr(port, &addr); 1961 return 0; 1962 } 1963 1964 int 1965 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 1966 { 1967 struct inpcb *inp; 1968 struct in_addr addr; 1969 in_port_t port; 1970 1971 inp = sotoinpcb(so); 1972 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1973 1974 INP_RLOCK(inp); 1975 port = inp->inp_fport; 1976 addr = inp->inp_faddr; 1977 INP_RUNLOCK(inp); 1978 1979 *nam = in_sockaddr(port, &addr); 1980 return 0; 1981 } 1982 1983 void 1984 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1985 struct inpcb *(*notify)(struct inpcb *, int)) 1986 { 1987 struct inpcb *inp, *inp_temp; 1988 1989 INP_INFO_WLOCK(pcbinfo); 1990 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 1991 INP_WLOCK(inp); 1992 #ifdef INET6 1993 if ((inp->inp_vflag & INP_IPV4) == 0) { 1994 INP_WUNLOCK(inp); 1995 continue; 1996 } 1997 #endif 1998 if (inp->inp_faddr.s_addr != faddr.s_addr || 1999 inp->inp_socket == NULL) { 2000 INP_WUNLOCK(inp); 2001 continue; 2002 } 2003 if ((*notify)(inp, errno)) 2004 INP_WUNLOCK(inp); 2005 } 2006 INP_INFO_WUNLOCK(pcbinfo); 2007 } 2008 2009 static bool 2010 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 2011 { 2012 2013 if ((inp->inp_vflag & 
INP_IPV4) && inp->inp_moptions != NULL) 2014 return (true); 2015 else 2016 return (false); 2017 } 2018 2019 void 2020 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 2021 { 2022 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 2023 inp_v4_multi_match, NULL); 2024 struct inpcb *inp; 2025 struct in_multi *inm; 2026 struct in_mfilter *imf; 2027 struct ip_moptions *imo; 2028 2029 IN_MULTI_LOCK_ASSERT(); 2030 2031 while ((inp = inp_next(&inpi)) != NULL) { 2032 INP_WLOCK_ASSERT(inp); 2033 2034 imo = inp->inp_moptions; 2035 /* 2036 * Unselect the outgoing interface if it is being 2037 * detached. 2038 */ 2039 if (imo->imo_multicast_ifp == ifp) 2040 imo->imo_multicast_ifp = NULL; 2041 2042 /* 2043 * Drop multicast group membership if we joined 2044 * through the interface being detached. 2045 * 2046 * XXX This can all be deferred to an epoch_call 2047 */ 2048 restart: 2049 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2050 if ((inm = imf->imf_inm) == NULL) 2051 continue; 2052 if (inm->inm_ifp != ifp) 2053 continue; 2054 ip_mfilter_remove(&imo->imo_head, imf); 2055 in_leavegroup_locked(inm, NULL); 2056 ip_mfilter_free(imf); 2057 goto restart; 2058 } 2059 } 2060 } 2061 2062 /* 2063 * Lookup a PCB based on the local address and port. Caller must hold the 2064 * hash lock. No inpcb locks or references are acquired. 
*/
/* Extra "wildcard" cost charged to a PCB that also has INP_IPV6 set. */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
	/* Best (lowest) cost seen so far; starts above any reachable cost. */
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found?  Only if no credential was given or
				 * prison_equal_ip4() accepts the two prisons.
				 */
				if (cred == NULL ||
				    prison_equal_ip4(cred->cr_prison,
					inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				/* Cost of this candidate; 0 is a perfect fit. */
				wildcard = 0;
				if (cred != NULL &&
				    !prison_equal_ip4(inp->inp_cred->cr_prison,
					cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				/* A connected PCB is a worse fit. */
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				/*
				 * Local address: a wildcard on exactly one
				 * side costs one point; two different
				 * specific addresses rule the PCB out.
				 */
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					/* Cannot do better than cost 0. */
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * Pick a PCB from the load-balance groups hashed under "lport".
 *
 * Each matching group contributes the member selected by hashing
 * (faddr, lport, fport) modulo the group's member count.  Returns NULL when
 * no group matches.  No inpcb locks or references are acquired here.
 */
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
    uint16_t fport, int lookupflags, int numa_domain)
{
	struct inpcb *local_wild, *numa_wild;
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Order of socket selection:
	 * 1. non-wild.
	 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD).
	 *
	 * NOTE:
	 * - Load balanced group does not contain jailed sockets
	 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets
	 */
	local_wild = NULL;
	numa_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		/* NOTE(review): assumes il_inpcnt is never 0 here — confirm. */
		idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
		    grp->il_inpcnt;
		if (grp->il_laddr.s_addr == laddr->s_addr) {
			/*
			 * Exact local address: return at once when the NUMA
			 * domain is unspecified or matches, otherwise keep
			 * the member as a fallback.
			 */
			if (numa_domain == M_NODOM ||
			    grp->il_numa_domain == numa_domain) {
				return (grp->il_inp[idx]);
			} else {
				numa_wild = grp->il_inp[idx];
			}
		}
		/*
		 * Wildcard-bound group: remember it if none is held yet, or
		 * overwrite the held one when the NUMA domain is unspecified
		 * or matches.
		 */
		if (grp->il_laddr.s_addr == INADDR_ANY &&
		    (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
		    (local_wild == NULL || numa_domain == M_NODOM ||
			grp->il_numa_domain == numa_domain)) {
			local_wild = grp->il_inp[idx];
		}
	}
	/* An exact-address group in another domain beats a wildcard group. */
	if (numa_wild != NULL)
		return (numa_wild);

	return (local_wild);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has either locked the hash list, which usually happens
 * for bind(2) operations, or is in SMR section, which happens when sorting
 * out incoming packets.
 *
 * The search runs in three stages: exact 4-tuple match, load-balance group
 * (wildcard lookups only), then wildcard-bound PCBs (wildcard lookups only).
 * The "ifp" argument is not referenced in this function.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp, uint8_t numa_domain)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	/* Ports arrive as u_int and are narrowed to 16 bits here. */
	u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			/* A PCB with PR_IP4 set wins at once. */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			/* Otherwise remember the first match only. */
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look in lb group (for wildcard match).
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags, numa_domain);
		if (inp != NULL)
			return (inp);
	}

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				/* Skip jails for which the check fails. */
				if (prison_check_ip4_locked(
				    inp->inp_cred->cr_prison, &laddr) != 0)
					continue;
			} else {
				/*
				 * A non-jailed exact match is already held;
				 * only jailed PCBs can still change the
				 * outcome.
				 */
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		/* Return candidates in the priority order listed above. */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
 * hash list lock, and will return the inpcb locked (i.e., requires
 * INPLOOKUP_LOCKPCB).
2378 */ 2379 static struct inpcb * 2380 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2381 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2382 struct ifnet *ifp, uint8_t numa_domain) 2383 { 2384 struct inpcb *inp; 2385 2386 smr_enter(pcbinfo->ipi_smr); 2387 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2388 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2389 if (inp != NULL) { 2390 if (__predict_false(inp_smr_lock(inp, 2391 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2392 inp = NULL; 2393 } else 2394 smr_exit(pcbinfo->ipi_smr); 2395 2396 return (inp); 2397 } 2398 2399 /* 2400 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2401 * from which a pre-calculated hash value may be extracted. 2402 */ 2403 struct inpcb * 2404 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2405 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2406 { 2407 2408 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2409 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2410 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2411 ("%s: LOCKPCB not set", __func__)); 2412 2413 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2414 lookupflags, ifp, M_NODOM)); 2415 } 2416 2417 struct inpcb * 2418 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2419 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2420 struct ifnet *ifp, struct mbuf *m) 2421 { 2422 2423 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2424 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2425 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2426 ("%s: LOCKPCB not set", __func__)); 2427 2428 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2429 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2430 } 2431 #endif /* INET */ 2432 2433 /* 2434 * Insert PCB onto various hash lists. 
*/
int
in_pcbinshash(struct inpcb *inp)
{
	struct inpcbhead *pcbhash;
	struct inpcbporthead *pcbporthash;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbport *phd;
	u_int32_t hashkey_faddr;
	int so_options;

	/* Both the PCB and the hash must be write-locked by the caller. */
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	KASSERT((inp->inp_flags & INP_INHASHLIST) == 0,
	    ("in_pcbinshash: INP_INHASHLIST"));

	/* The foreign-address part of the hash key depends on the family. */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
	else
#endif
	hashkey_faddr = inp->inp_faddr.s_addr;

	/* Connection hash bucket. */
	pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];

	/* Port hash bucket, keyed by local port only. */
	pcbporthash = &pcbinfo->ipi_porthashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)];

	/*
	 * Add entry to load balance group.
	 * Only do this if SO_REUSEPORT_LB is set.
	 */
	so_options = inp_so_options(inp);
	if (so_options & SO_REUSEPORT_LB) {
		int ret = in_pcbinslbgrouphash(inp, M_NODOM);
		if (ret) {
			/* pcb lb group malloc fail (ret=ENOBUFS). */
			return (ret);
		}
	}

	/*
	 * Go through port list and look for a head for this lport.
	 */
	CK_LIST_FOREACH(phd, pcbporthash, phd_hash) {
		if (phd->phd_port == inp->inp_lport)
			break;
	}
	/*
	 * If none exists, malloc one and tack it on.
	 */
	if (phd == NULL) {
		phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT);
		if (phd == NULL) {
			/*
			 * NOTE(review): this returns without undoing the
			 * in_pcbinslbgrouphash() insertion made above when
			 * SO_REUSEPORT_LB is set - confirm whether the
			 * caller cleans that up.
			 */
			return (ENOBUFS); /* XXX */
		}
		phd->phd_port = inp->inp_lport;
		CK_LIST_INIT(&phd->phd_pcblist);
		CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash);
	}
	/* Link the PCB onto the port list first, then the connection hash. */
	inp->inp_phd = phd;
	CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist);
	CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash);
	inp->inp_flags |= INP_INHASHLIST;

	return (0);
}

/*
 * Move PCB to the proper hash bucket when { faddr, fport } have been
 * changed. NOTE: This does not handle the case of the lport changing (the
 * hashed port list would have to be updated as well), so the lport must
 * not change after in_pcbinshash() has been called.
 *
 * XXXGL: a race between this function and SMR-protected hash iterator
 * will lead to iterator traversing a possibly wrong hash list. However,
 * this race should have been here since change from rwlock to epoch.
 */
void
in_pcbrehash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct inpcbhead *head;
	u_int32_t hashkey_faddr;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	KASSERT(inp->inp_flags & INP_INHASHLIST,
	    ("in_pcbrehash: !INP_INHASHLIST"));

	/* Same key derivation as in_pcbinshash(). */
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6)
		hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr);
	else
#endif
	hashkey_faddr = inp->inp_faddr.s_addr;

	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr,
	    inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)];

	/* Unlink from the old bucket, then link into the new one. */
	CK_LIST_REMOVE(inp, inp_hash);
	CK_LIST_INSERT_HEAD(head, inp, inp_hash);
}

/*
 * Check for alternatives when higher level complains
 * about service problems.  For now, invalidate cached
 * routing information.
If the route was created dynamically 2546 * (by a redirect), time to try a default gateway again. 2547 */ 2548 void 2549 in_losing(struct inpcb *inp) 2550 { 2551 2552 RO_INVALIDATE_CACHE(&inp->inp_route); 2553 return; 2554 } 2555 2556 /* 2557 * A set label operation has occurred at the socket layer, propagate the 2558 * label change into the in_pcb for the socket. 2559 */ 2560 void 2561 in_pcbsosetlabel(struct socket *so) 2562 { 2563 #ifdef MAC 2564 struct inpcb *inp; 2565 2566 inp = sotoinpcb(so); 2567 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2568 2569 INP_WLOCK(inp); 2570 SOCK_LOCK(so); 2571 mac_inpcb_sosetlabel(so, inp); 2572 SOCK_UNLOCK(so); 2573 INP_WUNLOCK(inp); 2574 #endif 2575 } 2576 2577 /* 2578 * ipport_tick runs once per second, determining if random port allocation 2579 * should be continued. If more than ipport_randomcps ports have been 2580 * allocated in the last second, then we return to sequential port 2581 * allocation. We return to random allocation only once we drop below 2582 * ipport_randomcps for at least ipport_randomtime seconds. 2583 */ 2584 static void 2585 ipport_tick(void *xtp) 2586 { 2587 VNET_ITERATOR_DECL(vnet_iter); 2588 2589 VNET_LIST_RLOCK_NOSLEEP(); 2590 VNET_FOREACH(vnet_iter) { 2591 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ 2592 if (V_ipport_tcpallocs - V_ipport_tcplastcount <= 2593 V_ipport_randomcps) { 2594 if (V_ipport_stoprandom > 0) 2595 V_ipport_stoprandom--; 2596 } else 2597 V_ipport_stoprandom = V_ipport_randomtime; 2598 V_ipport_tcplastcount = V_ipport_tcpallocs; 2599 CURVNET_RESTORE(); 2600 } 2601 VNET_LIST_RUNLOCK_NOSLEEP(); 2602 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); 2603 } 2604 2605 static void 2606 ip_fini(void *xtp) 2607 { 2608 2609 callout_stop(&ipport_tick_callout); 2610 } 2611 2612 /* 2613 * The ipport_callout should start running at about the time we attach the 2614 * inet or inet6 domains. 
2615 */ 2616 static void 2617 ipport_tick_init(const void *unused __unused) 2618 { 2619 2620 /* Start ipport_tick. */ 2621 callout_init(&ipport_tick_callout, 1); 2622 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); 2623 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 2624 SHUTDOWN_PRI_DEFAULT); 2625 } 2626 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 2627 ipport_tick_init, NULL); 2628 2629 void 2630 inp_wlock(struct inpcb *inp) 2631 { 2632 2633 INP_WLOCK(inp); 2634 } 2635 2636 void 2637 inp_wunlock(struct inpcb *inp) 2638 { 2639 2640 INP_WUNLOCK(inp); 2641 } 2642 2643 void 2644 inp_rlock(struct inpcb *inp) 2645 { 2646 2647 INP_RLOCK(inp); 2648 } 2649 2650 void 2651 inp_runlock(struct inpcb *inp) 2652 { 2653 2654 INP_RUNLOCK(inp); 2655 } 2656 2657 #ifdef INVARIANT_SUPPORT 2658 void 2659 inp_lock_assert(struct inpcb *inp) 2660 { 2661 2662 INP_WLOCK_ASSERT(inp); 2663 } 2664 2665 void 2666 inp_unlock_assert(struct inpcb *inp) 2667 { 2668 2669 INP_UNLOCK_ASSERT(inp); 2670 } 2671 #endif 2672 2673 void 2674 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) 2675 { 2676 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, 2677 INPLOOKUP_WLOCKPCB); 2678 struct inpcb *inp; 2679 2680 while ((inp = inp_next(&inpi)) != NULL) 2681 func(inp, arg); 2682 } 2683 2684 struct socket * 2685 inp_inpcbtosocket(struct inpcb *inp) 2686 { 2687 2688 INP_WLOCK_ASSERT(inp); 2689 return (inp->inp_socket); 2690 } 2691 2692 struct tcpcb * 2693 inp_inpcbtotcpcb(struct inpcb *inp) 2694 { 2695 2696 INP_WLOCK_ASSERT(inp); 2697 return ((struct tcpcb *)inp->inp_ppcb); 2698 } 2699 2700 int 2701 inp_ip_tos_get(const struct inpcb *inp) 2702 { 2703 2704 return (inp->inp_ip_tos); 2705 } 2706 2707 void 2708 inp_ip_tos_set(struct inpcb *inp, int val) 2709 { 2710 2711 inp->inp_ip_tos = val; 2712 } 2713 2714 void 2715 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2716 uint32_t *faddr, uint16_t *fp) 2717 { 2718 2719 INP_LOCK_ASSERT(inp); 
2720 *laddr = inp->inp_laddr.s_addr; 2721 *faddr = inp->inp_faddr.s_addr; 2722 *lp = inp->inp_lport; 2723 *fp = inp->inp_fport; 2724 } 2725 2726 struct inpcb * 2727 so_sotoinpcb(struct socket *so) 2728 { 2729 2730 return (sotoinpcb(so)); 2731 } 2732 2733 struct tcpcb * 2734 so_sototcpcb(struct socket *so) 2735 { 2736 2737 return (sototcpcb(so)); 2738 } 2739 2740 /* 2741 * Create an external-format (``xinpcb'') structure using the information in 2742 * the kernel-format in_pcb structure pointed to by inp. This is done to 2743 * reduce the spew of irrelevant information over this interface, to isolate 2744 * user code from changes in the kernel structure, and potentially to provide 2745 * information-hiding if we decide that some of this information should be 2746 * hidden from users. 2747 */ 2748 void 2749 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2750 { 2751 2752 bzero(xi, sizeof(*xi)); 2753 xi->xi_len = sizeof(struct xinpcb); 2754 if (inp->inp_socket) 2755 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2756 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2757 xi->inp_gencnt = inp->inp_gencnt; 2758 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2759 xi->inp_flow = inp->inp_flow; 2760 xi->inp_flowid = inp->inp_flowid; 2761 xi->inp_flowtype = inp->inp_flowtype; 2762 xi->inp_flags = inp->inp_flags; 2763 xi->inp_flags2 = inp->inp_flags2; 2764 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2765 xi->in6p_cksum = inp->in6p_cksum; 2766 xi->in6p_hops = inp->in6p_hops; 2767 xi->inp_ip_tos = inp->inp_ip_tos; 2768 xi->inp_vflag = inp->inp_vflag; 2769 xi->inp_ip_ttl = inp->inp_ip_ttl; 2770 xi->inp_ip_p = inp->inp_ip_p; 2771 xi->inp_ip_minttl = inp->inp_ip_minttl; 2772 } 2773 2774 #ifdef DDB 2775 static void 2776 db_print_indent(int indent) 2777 { 2778 int i; 2779 2780 for (i = 0; i < indent; i++) 2781 db_printf(" "); 2782 } 2783 2784 static void 2785 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2786 { 2787 char 
faddr_str[48], laddr_str[48]; 2788 2789 db_print_indent(indent); 2790 db_printf("%s at %p\n", name, inc); 2791 2792 indent += 2; 2793 2794 #ifdef INET6 2795 if (inc->inc_flags & INC_ISIPV6) { 2796 /* IPv6. */ 2797 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2798 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2799 } else 2800 #endif 2801 { 2802 /* IPv4. */ 2803 inet_ntoa_r(inc->inc_laddr, laddr_str); 2804 inet_ntoa_r(inc->inc_faddr, faddr_str); 2805 } 2806 db_print_indent(indent); 2807 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2808 ntohs(inc->inc_lport)); 2809 db_print_indent(indent); 2810 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2811 ntohs(inc->inc_fport)); 2812 } 2813 2814 static void 2815 db_print_inpflags(int inp_flags) 2816 { 2817 int comma; 2818 2819 comma = 0; 2820 if (inp_flags & INP_RECVOPTS) { 2821 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2822 comma = 1; 2823 } 2824 if (inp_flags & INP_RECVRETOPTS) { 2825 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2826 comma = 1; 2827 } 2828 if (inp_flags & INP_RECVDSTADDR) { 2829 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2830 comma = 1; 2831 } 2832 if (inp_flags & INP_ORIGDSTADDR) { 2833 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2834 comma = 1; 2835 } 2836 if (inp_flags & INP_HDRINCL) { 2837 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2838 comma = 1; 2839 } 2840 if (inp_flags & INP_HIGHPORT) { 2841 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2842 comma = 1; 2843 } 2844 if (inp_flags & INP_LOWPORT) { 2845 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 2846 comma = 1; 2847 } 2848 if (inp_flags & INP_ANONPORT) { 2849 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2850 comma = 1; 2851 } 2852 if (inp_flags & INP_RECVIF) { 2853 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2854 comma = 1; 2855 } 2856 if (inp_flags & INP_MTUDISC) { 2857 db_printf("%sINP_MTUDISC", comma ? 
", " : ""); 2858 comma = 1; 2859 } 2860 if (inp_flags & INP_RECVTTL) { 2861 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2862 comma = 1; 2863 } 2864 if (inp_flags & INP_DONTFRAG) { 2865 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2866 comma = 1; 2867 } 2868 if (inp_flags & INP_RECVTOS) { 2869 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2870 comma = 1; 2871 } 2872 if (inp_flags & IN6P_IPV6_V6ONLY) { 2873 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2874 comma = 1; 2875 } 2876 if (inp_flags & IN6P_PKTINFO) { 2877 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2878 comma = 1; 2879 } 2880 if (inp_flags & IN6P_HOPLIMIT) { 2881 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2882 comma = 1; 2883 } 2884 if (inp_flags & IN6P_HOPOPTS) { 2885 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2886 comma = 1; 2887 } 2888 if (inp_flags & IN6P_DSTOPTS) { 2889 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2890 comma = 1; 2891 } 2892 if (inp_flags & IN6P_RTHDR) { 2893 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2894 comma = 1; 2895 } 2896 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2897 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2898 comma = 1; 2899 } 2900 if (inp_flags & IN6P_TCLASS) { 2901 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2902 comma = 1; 2903 } 2904 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2905 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2906 comma = 1; 2907 } 2908 if (inp_flags & INP_TIMEWAIT) { 2909 db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); 2910 comma = 1; 2911 } 2912 if (inp_flags & INP_ONESBCAST) { 2913 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2914 comma = 1; 2915 } 2916 if (inp_flags & INP_DROPPED) { 2917 db_printf("%sINP_DROPPED", comma ? ", " : ""); 2918 comma = 1; 2919 } 2920 if (inp_flags & INP_SOCKREF) { 2921 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2922 comma = 1; 2923 } 2924 if (inp_flags & IN6P_RFC2292) { 2925 db_printf("%sIN6P_RFC2292", comma ? 
", " : ""); 2926 comma = 1; 2927 } 2928 if (inp_flags & IN6P_MTU) { 2929 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2930 comma = 1; 2931 } 2932 } 2933 2934 static void 2935 db_print_inpvflag(u_char inp_vflag) 2936 { 2937 int comma; 2938 2939 comma = 0; 2940 if (inp_vflag & INP_IPV4) { 2941 db_printf("%sINP_IPV4", comma ? ", " : ""); 2942 comma = 1; 2943 } 2944 if (inp_vflag & INP_IPV6) { 2945 db_printf("%sINP_IPV6", comma ? ", " : ""); 2946 comma = 1; 2947 } 2948 if (inp_vflag & INP_IPV6PROTO) { 2949 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 2950 comma = 1; 2951 } 2952 } 2953 2954 static void 2955 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2956 { 2957 2958 db_print_indent(indent); 2959 db_printf("%s at %p\n", name, inp); 2960 2961 indent += 2; 2962 2963 db_print_indent(indent); 2964 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2965 2966 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2967 2968 db_print_indent(indent); 2969 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2970 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 2971 2972 db_print_indent(indent); 2973 db_printf("inp_label: %p inp_flags: 0x%x (", 2974 inp->inp_label, inp->inp_flags); 2975 db_print_inpflags(inp->inp_flags); 2976 db_printf(")\n"); 2977 2978 db_print_indent(indent); 2979 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 2980 inp->inp_vflag); 2981 db_print_inpvflag(inp->inp_vflag); 2982 db_printf(")\n"); 2983 2984 db_print_indent(indent); 2985 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 2986 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 2987 2988 db_print_indent(indent); 2989 #ifdef INET6 2990 if (inp->inp_vflag & INP_IPV6) { 2991 db_printf("in6p_options: %p in6p_outputopts: %p " 2992 "in6p_moptions: %p\n", inp->in6p_options, 2993 inp->in6p_outputopts, inp->in6p_moptions); 2994 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 2995 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 2996 inp->in6p_hops); 2997 
} else 2998 #endif 2999 { 3000 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3001 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3002 inp->inp_options, inp->inp_moptions); 3003 } 3004 3005 db_print_indent(indent); 3006 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3007 (uintmax_t)inp->inp_gencnt); 3008 } 3009 3010 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3011 { 3012 struct inpcb *inp; 3013 3014 if (!have_addr) { 3015 db_printf("usage: show inpcb <addr>\n"); 3016 return; 3017 } 3018 inp = (struct inpcb *)addr; 3019 3020 db_print_inpcb(inp, "inpcb", 0); 3021 } 3022 #endif /* DDB */ 3023 3024 #ifdef RATELIMIT 3025 /* 3026 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3027 * if any. 3028 */ 3029 int 3030 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3031 { 3032 union if_snd_tag_modify_params params = { 3033 .rate_limit.max_rate = max_pacing_rate, 3034 .rate_limit.flags = M_NOWAIT, 3035 }; 3036 struct m_snd_tag *mst; 3037 int error; 3038 3039 mst = inp->inp_snd_tag; 3040 if (mst == NULL) 3041 return (EINVAL); 3042 3043 if (mst->sw->snd_tag_modify == NULL) { 3044 error = EOPNOTSUPP; 3045 } else { 3046 error = mst->sw->snd_tag_modify(mst, ¶ms); 3047 } 3048 return (error); 3049 } 3050 3051 /* 3052 * Query existing TX rate limit based on the existing 3053 * "inp->inp_snd_tag", if any. 3054 */ 3055 int 3056 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3057 { 3058 union if_snd_tag_query_params params = { }; 3059 struct m_snd_tag *mst; 3060 int error; 3061 3062 mst = inp->inp_snd_tag; 3063 if (mst == NULL) 3064 return (EINVAL); 3065 3066 if (mst->sw->snd_tag_query == NULL) { 3067 error = EOPNOTSUPP; 3068 } else { 3069 error = mst->sw->snd_tag_query(mst, ¶ms); 3070 if (error == 0 && p_max_pacing_rate != NULL) 3071 *p_max_pacing_rate = params.rate_limit.max_rate; 3072 } 3073 return (error); 3074 } 3075 3076 /* 3077 * Query existing TX queue level based on the existing 3078 * "inp->inp_snd_tag", if any. 
3079 */ 3080 int 3081 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3082 { 3083 union if_snd_tag_query_params params = { }; 3084 struct m_snd_tag *mst; 3085 int error; 3086 3087 mst = inp->inp_snd_tag; 3088 if (mst == NULL) 3089 return (EINVAL); 3090 3091 if (mst->sw->snd_tag_query == NULL) 3092 return (EOPNOTSUPP); 3093 3094 error = mst->sw->snd_tag_query(mst, ¶ms); 3095 if (error == 0 && p_txqueue_level != NULL) 3096 *p_txqueue_level = params.rate_limit.queue_level; 3097 return (error); 3098 } 3099 3100 /* 3101 * Allocate a new TX rate limit send tag from the network interface 3102 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3103 */ 3104 int 3105 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3106 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3107 3108 { 3109 union if_snd_tag_alloc_params params = { 3110 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3111 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3112 .rate_limit.hdr.flowid = flowid, 3113 .rate_limit.hdr.flowtype = flowtype, 3114 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3115 .rate_limit.max_rate = max_pacing_rate, 3116 .rate_limit.flags = M_NOWAIT, 3117 }; 3118 int error; 3119 3120 INP_WLOCK_ASSERT(inp); 3121 3122 /* 3123 * If there is already a send tag, or the INP is being torn 3124 * down, allocating a new send tag is not allowed. Else send 3125 * tags may leak. 
3126 */ 3127 if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0) 3128 return (EINVAL); 3129 3130 error = m_snd_tag_alloc(ifp, ¶ms, st); 3131 #ifdef INET 3132 if (error == 0) { 3133 counter_u64_add(rate_limit_set_ok, 1); 3134 counter_u64_add(rate_limit_active, 1); 3135 } else if (error != EOPNOTSUPP) 3136 counter_u64_add(rate_limit_alloc_fail, 1); 3137 #endif 3138 return (error); 3139 } 3140 3141 void 3142 in_pcbdetach_tag(struct m_snd_tag *mst) 3143 { 3144 3145 m_snd_tag_rele(mst); 3146 #ifdef INET 3147 counter_u64_add(rate_limit_active, -1); 3148 #endif 3149 } 3150 3151 /* 3152 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3153 * if any: 3154 */ 3155 void 3156 in_pcbdetach_txrtlmt(struct inpcb *inp) 3157 { 3158 struct m_snd_tag *mst; 3159 3160 INP_WLOCK_ASSERT(inp); 3161 3162 mst = inp->inp_snd_tag; 3163 inp->inp_snd_tag = NULL; 3164 3165 if (mst == NULL) 3166 return; 3167 3168 m_snd_tag_rele(mst); 3169 #ifdef INET 3170 counter_u64_add(rate_limit_active, -1); 3171 #endif 3172 } 3173 3174 int 3175 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3176 { 3177 int error; 3178 3179 /* 3180 * If the existing send tag is for the wrong interface due to 3181 * a route change, first drop the existing tag. Set the 3182 * CHANGED flag so that we will keep trying to allocate a new 3183 * tag if we fail to allocate one this time. 3184 */ 3185 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3186 in_pcbdetach_txrtlmt(inp); 3187 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3188 } 3189 3190 /* 3191 * NOTE: When attaching to a network interface a reference is 3192 * made to ensure the network interface doesn't go away until 3193 * all ratelimit connections are gone. The network interface 3194 * pointers compared below represent valid network interfaces, 3195 * except when comparing towards NULL. 
3196 */ 3197 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3198 error = 0; 3199 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3200 if (inp->inp_snd_tag != NULL) 3201 in_pcbdetach_txrtlmt(inp); 3202 error = 0; 3203 } else if (inp->inp_snd_tag == NULL) { 3204 /* 3205 * In order to utilize packet pacing with RSS, we need 3206 * to wait until there is a valid RSS hash before we 3207 * can proceed: 3208 */ 3209 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3210 error = EAGAIN; 3211 } else { 3212 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3213 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3214 } 3215 } else { 3216 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3217 } 3218 if (error == 0 || error == EOPNOTSUPP) 3219 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3220 3221 return (error); 3222 } 3223 3224 /* 3225 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3226 * is set in the fast path and will attach/detach/modify the TX rate 3227 * limit send tag based on the socket's so_max_pacing_rate value. 3228 */ 3229 void 3230 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3231 { 3232 struct socket *socket; 3233 uint32_t max_pacing_rate; 3234 bool did_upgrade; 3235 int error; 3236 3237 if (inp == NULL) 3238 return; 3239 3240 socket = inp->inp_socket; 3241 if (socket == NULL) 3242 return; 3243 3244 if (!INP_WLOCKED(inp)) { 3245 /* 3246 * NOTE: If the write locking fails, we need to bail 3247 * out and use the non-ratelimited ring for the 3248 * transmit until there is a new chance to get the 3249 * write lock. 3250 */ 3251 if (!INP_TRY_UPGRADE(inp)) 3252 return; 3253 did_upgrade = 1; 3254 } else { 3255 did_upgrade = 0; 3256 } 3257 3258 /* 3259 * NOTE: The so_max_pacing_rate value is read unlocked, 3260 * because atomic updates are not required since the variable 3261 * is checked at every mbuf we send. It is assumed that the 3262 * variable read itself will be atomic. 
3263 */ 3264 max_pacing_rate = socket->so_max_pacing_rate; 3265 3266 error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3267 3268 if (did_upgrade) 3269 INP_DOWNGRADE(inp); 3270 } 3271 3272 /* 3273 * Track route changes for TX rate limiting. 3274 */ 3275 void 3276 in_pcboutput_eagain(struct inpcb *inp) 3277 { 3278 bool did_upgrade; 3279 3280 if (inp == NULL) 3281 return; 3282 3283 if (inp->inp_snd_tag == NULL) 3284 return; 3285 3286 if (!INP_WLOCKED(inp)) { 3287 /* 3288 * NOTE: If the write locking fails, we need to bail 3289 * out and use the non-ratelimited ring for the 3290 * transmit until there is a new chance to get the 3291 * write lock. 3292 */ 3293 if (!INP_TRY_UPGRADE(inp)) 3294 return; 3295 did_upgrade = 1; 3296 } else { 3297 did_upgrade = 0; 3298 } 3299 3300 /* detach rate limiting */ 3301 in_pcbdetach_txrtlmt(inp); 3302 3303 /* make sure new mbuf send tag allocation is made */ 3304 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3305 3306 if (did_upgrade) 3307 INP_DOWNGRADE(inp); 3308 } 3309 3310 #ifdef INET 3311 static void 3312 rl_init(void *st) 3313 { 3314 rate_limit_new = counter_u64_alloc(M_WAITOK); 3315 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3316 rate_limit_active = counter_u64_alloc(M_WAITOK); 3317 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3318 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3319 } 3320 3321 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3322 #endif 3323 #endif /* RATELIMIT */ 3324