/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1982, 1986, 1991, 1993, 1995
 *	The Regents of the University of California.
 * Copyright (c) 2007-2009 Robert N. M. Watson
 * Copyright (c) 2010-2011 Juniper Networks, Inc.
 * All rights reserved.
 *
 * Portions of this software were developed by Robert N. M. Watson under
 * contract to Juniper Networks, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 *	@(#)in_pcb.c	8.4 (Berkeley) 5/24/95
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/callout.h>
#include <sys/eventhandler.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_pcb_var.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
#endif
#include <netinet/ip_var.h>
#include <netinet/tcp_var.h>
#ifdef TCPHPTS
#include <netinet/tcp_hpts.h>
#endif
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#include <net/route/nhop.h>
#endif

#include <netipsec/ipsec_support.h>

#include <security/mac/mac_framework.h>

#define	INPCBLBGROUP_SIZMIN	8
#define	INPCBLBGROUP_SIZMAX	256
#define	INP_FREED	0x00000200	/* See in_pcb.h. */

static struct callout	ipport_tick_callout;

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);
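
/*
 * Example: the ranges above are exported through the net.inet.ip.portrange
 * sysctl tree defined below, so the ephemeral range can be tuned from
 * userland with sysctl(8), e.g.:
 *
 *	# sysctl net.inet.ip.portrange.first=10000
 *	# sysctl net.inet.ip.portrange.last=65535
 */
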
/* Variables dealing with random ephemeral port allocation. */
VNET_DEFINE(int, ipport_randomized) = 1;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomcps) = 10;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_randomtime) = 45;	/* user controlled via sysctl */
VNET_DEFINE(int, ipport_stoprandom);		/* toggled by ipport_tick */
VNET_DEFINE(int, ipport_tcpallocs);
VNET_DEFINE_STATIC(int, ipport_tcplastcount);

#define	V_ipport_tcplastcount		VNET(ipport_tcplastcount)

#ifdef INET
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
			    int lookupflags, struct ifnet *ifp,
			    uint8_t numa_domain);

#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }

static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}

#undef RANGECHK

static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedhigh), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
    CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port "
    "allocations before switching to a sequential one");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomtime), 0,
    "Minimum time to keep sequential port "
    "allocation before switching to a random one");

#ifdef RATELIMIT
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg, "Total Rate limited change attempts");

#endif /* RATELIMIT */

#endif /* INET */

/*
 * in_pcb.c: manage the Protocol Control Blocks.
 *
 * NOTE: It is assumed that most of these functions will be called with
 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
 * functions often modify hash chains or addresses in pcbs.
 */

static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
    uint16_t port, const union in_dependaddr *addr, int size,
    uint8_t numa_domain)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (!grp)
		return (NULL);
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	return (grp);
}

static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	free(grp, M_PCB);
}

static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}

static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	in_pcblbgroup_free(old_grp);
	return (grp);
}
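
/*
 * Worked example of the sizing policy (illustrative): a group created with
 * INPCBLBGROUP_SIZMIN (8) slots is doubled to 16 by in_pcbinslbgrouphash()
 * when a 9th PCB arrives; if removals later leave that 16-slot group with
 * 4 or fewer members (il_inpcnt <= il_inpsiz / 4), in_pcblbgroup_reorder()
 * below halves it back to 8.  Groups never grow past INPCBLBGROUP_SIZMAX.
 */
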
/*
 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
 * and shrink group if possible.
 */
static void
in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
    int i)
{
	struct inpcblbgroup *grp, *new_grp;

	grp = *grpp;
	for (; i + 1 < grp->il_inpcnt; ++i)
		grp->il_inp[i] = grp->il_inp[i + 1];
	grp->il_inpcnt--;

	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
		/* Shrink this group. */
		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
		if (new_grp != NULL)
			*grpp = new_grp;
	}
}

/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	/*
	 * Don't allow jailed socket to join local group.
	 */
	if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred))
		return (0);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0)
			break;
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain);
		if (grp == NULL)
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));

	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	return (0);
}
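
/*
 * Userland sketch (illustrative): several processes can share one listening
 * port, with connections spread over the group built above, by setting
 * SO_REUSEPORT_LB before bind(2):
 *
 *	int one = 1;
 *
 *	s = socket(AF_INET, SOCK_STREAM, 0);
 *	setsockopt(s, SOL_SOCKET, SO_REUSEPORT_LB, &one, sizeof(one));
 *	bind(s, (struct sockaddr *)&sin, sizeof(sin));
 *	listen(s, 128);
 */
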

/*
 * Remove PCB from load balance group.
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Pull up inpcbs, shrink group if possible. */
				in_pcblbgroup_reorder(hdr, &grp, i);
			}
			return;
		}
	}
}

int
in_pcblbgroup_numa(struct inpcb *inp, int arg)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int err, i;
	uint8_t numa_domain;

	switch (arg) {
	case TCP_REUSPORT_LB_NUMA_NODOM:
		numa_domain = M_NODOM;
		break;
	case TCP_REUSPORT_LB_NUMA_CURDOM:
		numa_domain = PCPU_GET(domain);
		break;
	default:
		if (arg < 0 || arg >= vm_ndomains)
			return (EINVAL);
		numa_domain = arg;
	}

	err = 0;
	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(pcbinfo);
	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_numa_domain == numa_domain) {
				goto abort_with_hash_wlock;
			}

			/* Remove it from the old group. */
			in_pcbremlbgrouphash(inp);

			/* Add it to the new group based on numa domain. */
			in_pcbinslbgrouphash(inp, numa_domain);
			goto abort_with_hash_wlock;
		}
	}
	err = ENOENT;
abort_with_hash_wlock:
	INP_HASH_WUNLOCK(pcbinfo);
	return (err);
}

/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));

/*
 * Initialize an inpcbinfo -- we should be able to reduce the number of
 * arguments in time.
 */
static void inpcb_dtor(void *, int, void *);
static void inpcb_fini(void *, int);
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name,
    u_int hash_nelements, int porthash_nelements, char *inpcbzone_name,
    uma_init inpcbzone_init)
{

	mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb),
	    NULL, inpcb_dtor, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR,
	    UMA_ZONE_SMR);
	uma_zone_set_max(pcbinfo->ipi_zone, maxsockets);
	uma_zone_set_warning(pcbinfo->ipi_zone,
	    "kern.ipc.maxsockets limit reached");
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
	pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr);
}

/*
 * Destroy an inpcbinfo.
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	uma_zdestroy(pcbinfo->ipi_zone);
	uma_zdestroy(pcbinfo->ipi_portzone);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}

/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
	int error;

	error = 0;
	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
	}
	if (V_ip6_auto_flowlabel)
		inp->inp_flags |= IN6P_AUTOFLOWLABEL;
#endif
	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
#ifdef TCPHPTS
	/*
	 * If using hpts, drop in a random number so that not all new
	 * connections fall on the same CPU.
	 */
	inp->inp_hpts_cpu = inp->inp_dropq_cpu = hpts_random_cpu(inp);
#endif
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}

#ifdef INET
int
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{
	int anonport, error;

	KASSERT(nam == NULL || nam->sa_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, nam));
	KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, nam));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0;
	error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (error)
		return (error);
	if (in_pcbinshash(inp) != 0) {
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		return (EAGAIN);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif
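
/*
 * Caller sketch (illustrative, matching the lock assertions above): a
 * protocol's bind routine is expected to wrap in_pcbbind() roughly as:
 *
 *	INP_WLOCK(inp);
 *	INP_HASH_WLOCK(inp->inp_pcbinfo);
 *	error = in_pcbbind(inp, nam, td->td_ucred);
 *	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 *	INP_WUNLOCK(inp);
 */
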

#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 */
int
in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
    struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, dorandom, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}
	/*
	 * For UDP(-Lite), use random port allocation as long as the user
	 * allows it.  For TCP (and as of yet unknown) connections,
	 * use random port allocation only if the user allows it AND
	 * ipport_tick() allows it.
	 */
	if (V_ipport_randomized &&
	    (!V_ipport_stoprandom || pcbinfo == &V_udbinfo ||
	    pcbinfo == &V_ulitecbinfo))
		dorandom = 1;
	else
		dorandom = 0;
	/*
	 * It makes no sense to do random port allocation if
	 * we have the only port available.
	 */
	if (first == last)
		dorandom = 0;
	/* Make sure to not include UDP(-Lite) packets in the count. */
	if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo)
		V_ipport_tcpallocs++;
	/*
	 * Instead of having two loops further down counting up or down,
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY;
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;
	lport = *lportp;

	if (dorandom)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    NULL, M_NODOM);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    NULL, M_NODOM);
			}
#endif
		} else {
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0)
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, lookupflags, cred);
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}

/*
 * Select a local port (number) to use.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int lookupflags)
{
	struct sockaddr_in laddr;

	if (laddrp) {
		bzero(&laddr, sizeof(laddr));
		laddr.sin_family = AF_INET;
		laddr.sin_addr = *laddrp;
	}
	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *)&laddr :
	    NULL, lportp, NULL, 0, cred, lookupflags));
}
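
/*
 * Worked example of the search in in_pcb_lport_dest() (illustrative): with
 * first == 10000, last == 65535 and randomization enabled, *lastport starts
 * at a random point in the range; each probe increments it, wrapping back
 * to first once it passes last, and the loop gives up with EADDRNOTAVAIL
 * only after last - first + 1 candidate ports were all found in use.
 */
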

/*
 * Return cached socket options.
 */
int
inp_so_options(const struct inpcb *inp)
{
	int so_options;

	so_options = 0;

	if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0)
		so_options |= SO_REUSEPORT_LB;
	if ((inp->inp_flags2 & INP_REUSEPORT) != 0)
		so_options |= SO_REUSEPORT;
	if ((inp->inp_flags2 & INP_REUSEADDR) != 0)
		so_options |= SO_REUSEADDR;
	return (so_options);
}
#endif /* INET || INET6 */

/*
 * Check if a new BINDMULTI socket is allowed to be created.
 *
 * ni points to the new inp.
 * oi points to the existing inp.
 *
 * This checks whether the existing inp also has BINDMULTI and
 * whether the credentials match.
 */
int
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{
	/* Check permissions match */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    (ni->inp_cred->cr_uid != oi->inp_cred->cr_uid))
		return (0);

	/* Check the existing inp has BINDMULTI set */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
		return (0);

	/*
	 * We're okay - either INP_BINDMULTI isn't set on ni, or
	 * it is and it matches the checks.
	 */
	return (1);
}
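
/*
 * Example (illustrative): if 'ni' has INP_BINDMULTI set for uid 100, the
 * check above rejects the bind when 'oi' either lacks INP_BINDMULTI or is
 * owned by a different uid; two INP_BINDMULTI sockets with matching
 * credentials may share the exact same local address and port.
 */
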

#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;
			struct tcptw *tw;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
				/*
				 * XXX
				 * This entire block sorely needs a rewrite.
				 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) ||
				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && (t->inp_flags & INP_TIMEWAIT)) {
				/*
				 * XXXRW: If an inpcb has had its timewait
				 * state recycled, we treat the address as
				 * being in use (for now).  This is better
				 * than a panic, but not desirable.
				 */
				tw = intotw(t);
				if (tw == NULL ||
				    ((reuseport & tw->tw_so_options) == 0 &&
				    (reuseport_lb &
				    tw->tw_so_options) == 0)) {
					return (EADDRINUSE);
				}
			} else if (t &&
			    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
			    (reuseport & inp_so_options(t)) == 0 &&
			    (reuseport_lb & inp_so_options(t)) == 0) {
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
					return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If we don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred,
    bool rehash)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
	    NULL, cred);
	if (error)
		return (error);

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		KASSERT(rehash == true,
		    ("Rehashing required for unbound inps"));
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			inp->inp_lport = 0;
			return (EAGAIN);
		}
	}

	/* Commit the remaining changes. */
	inp->inp_lport = lport;
	inp->inp_laddr.s_addr = laddr;
	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;
	if (rehash) {
		in_pcbrehash(inp);
	} else {
		in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
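
/*
 * Caller sketch (illustrative, matching the assertions above): a datagram
 * protocol's connect routine would typically invoke this as:
 *
 *	INP_WLOCK(inp);
 *	INP_HASH_WLOCK(inp->inp_pcbinfo);
 *	error = in_pcbconnect(inp, nam, td->td_ucred, true);
 *	INP_HASH_WUNLOCK(inp->inp_pcbinfo);
 *	INP_WUNLOCK(inp);
 */
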

/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do these three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find one, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (cred == NULL || !prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	return (error);
}

/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port. These are not updated in the error case.
 *
 * If the operation fails because the connection already exists,
 * *oinpp will be set to the PCB of that connection so that the
 * caller can decide to override it. In all other cases, *oinpp
 * is set to NULL.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if (cred != NULL &&
			    (error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if ((ia->ia_ifp == ifp) &&
					    (cred == NULL ||
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0))
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}
1415 */ 1416 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 1417 inp->inp_moptions != NULL) { 1418 struct ip_moptions *imo; 1419 struct ifnet *ifp; 1420 1421 imo = inp->inp_moptions; 1422 if (imo->imo_multicast_ifp != NULL) { 1423 ifp = imo->imo_multicast_ifp; 1424 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1425 if ((ia->ia_ifp == ifp) && 1426 (cred == NULL || 1427 prison_check_ip4(cred, 1428 &ia->ia_addr.sin_addr) == 0)) 1429 break; 1430 } 1431 if (ia == NULL) 1432 error = EADDRNOTAVAIL; 1433 else { 1434 laddr = ia->ia_addr.sin_addr; 1435 error = 0; 1436 } 1437 } 1438 } 1439 if (error) 1440 return (error); 1441 } 1442 1443 if (lport != 0) { 1444 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1445 fport, laddr, lport, 0, NULL, M_NODOM); 1446 if (oinp != NULL) { 1447 if (oinpp != NULL) 1448 *oinpp = oinp; 1449 return (EADDRINUSE); 1450 } 1451 } else { 1452 struct sockaddr_in lsin, fsin; 1453 1454 bzero(&lsin, sizeof(lsin)); 1455 bzero(&fsin, sizeof(fsin)); 1456 lsin.sin_family = AF_INET; 1457 lsin.sin_addr = laddr; 1458 fsin.sin_family = AF_INET; 1459 fsin.sin_addr = faddr; 1460 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, 1461 &lport, (struct sockaddr *)& fsin, fport, cred, 1462 INPLOOKUP_WILDCARD); 1463 if (error) 1464 return (error); 1465 } 1466 *laddrp = laddr.s_addr; 1467 *lportp = lport; 1468 *faddrp = faddr.s_addr; 1469 *fportp = fport; 1470 return (0); 1471 } 1472 1473 void 1474 in_pcbdisconnect(struct inpcb *inp) 1475 { 1476 1477 INP_WLOCK_ASSERT(inp); 1478 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1479 1480 inp->inp_faddr.s_addr = INADDR_ANY; 1481 inp->inp_fport = 0; 1482 in_pcbrehash(inp); 1483 } 1484 #endif /* INET */ 1485 1486 /* 1487 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. 1488 * For most protocols, this will be invoked immediately prior to calling 1489 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the 1490 * socket, in which case in_pcbfree() is deferred. 1491 */ 1492 void 1493 in_pcbdetach(struct inpcb *inp) 1494 { 1495 1496 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1497 1498 #ifdef RATELIMIT 1499 if (inp->inp_snd_tag != NULL) 1500 in_pcbdetach_txrtlmt(inp); 1501 #endif 1502 inp->inp_socket->so_pcb = NULL; 1503 inp->inp_socket = NULL; 1504 } 1505 1506 /* 1507 * inpcb hash lookups are protected by SMR section. 1508 * 1509 * Once desired pcb has been found, switching from SMR section to a pcb 1510 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1511 * here because SMR is a critical section. 1512 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1513 */ 1514 static inline void 1515 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1516 { 1517 1518 lock == INPLOOKUP_RLOCKPCB ? 1519 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1520 } 1521 1522 static inline void 1523 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1524 { 1525 1526 lock == INPLOOKUP_RLOCKPCB ? 1527 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1528 } 1529 1530 static inline int 1531 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1532 { 1533 1534 return (lock == INPLOOKUP_RLOCKPCB ? 1535 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1536 } 1537 1538 static inline bool 1539 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1540 { 1541 1542 return (lock == INPLOOKUP_RLOCKPCB ? 
static inline void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

static inline void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

static inline int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

static inline bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, which cannot
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot. Note: only rip_input() does the latter.
 * - Iterator may have an optional bool matching function. The matching
 *   function will be executed for each inpcb in the SMR context, so it can
 *   not acquire locks and can safely access only immutable fields of inpcb.
 *
 * A freshly initialized iterator has a NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic. In all following calls the context pointer
 * shall hold the current inpcb pointer. The KPI user is not supposed to
 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL
 * and write NULL to its context. After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
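
/*
 * Typical traversal with the iterator (illustrative sketch, modeled on
 * in_pcbpurgeif0() below):
 *
 *	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo,
 *	    INPLOOKUP_WLOCKPCB, match_fn, match_ctx);
 *	struct inpcb *inp;
 *
 *	while ((inp = inp_next(&inpi)) != NULL) {
 *		INP_WLOCK_ASSERT(inp);
 *		<inspect or modify inp, but do not unlock it>
 *	}
 */
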
#define	II_LIST_FIRST(ipi, hash)					\
	(((hash) == INP_ALL_LIST) ?					\
	    CK_LIST_FIRST(&(ipi)->ipi_listhead) :			\
	    CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
	(((hash) == INP_ALL_LIST) ?					\
	    CK_LIST_NEXT((inp), inp_list) :				\
	    CK_LIST_NEXT((inp), inp_hash))
#define	II_LOCK_ASSERT(inp, lock)					\
	rw_assert(&(inp)->inp_lock,					\
	    (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(inp_smr_lock(inp, lock)))
				break;
			else {
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * A jump to 'restart' should yield the same result,
			 * but could produce unnecessary looping. Could this
			 * looping be unbounded?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard. Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}
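
/*
 * Usage sketch for the reference pattern (illustrative):
 *
 *	in_pcbref(inp);
 *	INP_WUNLOCK(inp);
 *	<sleep, or acquire some other lock out of order>
 *	INP_WLOCK(inp);
 *	if (in_pcbrele_wlocked(inp))
 *		return;
 *
 * If in_pcbrele_wlocked() returned true, the pcb was freed and the lock is
 * no longer held; otherwise the pcb is still valid and write-locked.
 */
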

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was the very last one.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	MPASS(inp->inp_in_dropq == 0);
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0)
		return (false);

	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	MPASS(inp->inp_in_hpts == 0);
	MPASS(inp->inp_in_dropq == 0);
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket. If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	inp->inp_flags |= INP_FREED;
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

	if (inp->inp_flags & INP_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		INP_HASH_WLOCK(pcbinfo);
		/* XXX: Only do if SO_REUSEPORT_LB set? */
		in_pcbremlbgrouphash(inp);

		CK_LIST_REMOVE(inp, inp_hash);
		CK_LIST_REMOVE(inp, inp_portlist);
		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
			CK_LIST_REMOVE(phd, phd_hash);
			uma_zfree_smr(pcbinfo->ipi_portzone, phd);
		}
		INP_HASH_WUNLOCK(pcbinfo);
		inp->inp_flags &= ~INP_INHASHLIST;
	}

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	imo = inp->inp_moptions;
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
	} else
		im6o = NULL;
#endif

	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	/* Destruction is finalized in inpcb_dtor(). */
}
*/ 1857 } 1858 1859 static void 1860 inpcb_dtor(void *mem, int size, void *arg) 1861 { 1862 struct inpcb *inp = mem; 1863 1864 crfree(inp->inp_cred); 1865 #ifdef INVARIANTS 1866 inp->inp_cred = NULL; 1867 #endif 1868 } 1869 1870 /* 1871 * Different protocols initialize their inpcbs differently - giving a 1872 * different name to the lock. But they are all disposed of the same way. 1873 */ 1874 static void 1875 inpcb_fini(void *mem, int size) 1876 { 1877 struct inpcb *inp = mem; 1878 1879 INP_LOCK_DESTROY(inp); 1880 } 1881 1882 /* 1883 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 1884 * port reservation, and preventing it from being returned by inpcb lookups. 1885 * 1886 * It is used by TCP to mark an inpcb as unused and avoid future packet 1887 * delivery or event notification when a socket remains open but TCP has 1888 * closed. This might occur as a result of a shutdown()-initiated TCP close 1889 * or a RST on the wire, and allows the port binding to be reused while still 1890 * maintaining the invariant that so_pcb always points to a valid inpcb until 1891 * in_pcbdetach(). 1892 * 1893 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1894 * in_pcbnotifyall() and in_pcbpurgeif0()? 1895 */ 1896 void 1897 in_pcbdrop(struct inpcb *inp) 1898 { 1899 1900 INP_WLOCK_ASSERT(inp); 1901 #ifdef INVARIANTS 1902 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) 1903 MPASS(inp->inp_refcount > 1); 1904 #endif 1905 1906 /* 1907 * XXXRW: Possibly we should protect the setting of INP_DROPPED with 1908 * the hash lock...? 1909 */ 1910 inp->inp_flags |= INP_DROPPED; 1911 if (inp->inp_flags & INP_INHASHLIST) { 1912 struct inpcbport *phd = inp->inp_phd; 1913 1914 INP_HASH_WLOCK(inp->inp_pcbinfo); 1915 in_pcbremlbgrouphash(inp); 1916 CK_LIST_REMOVE(inp, inp_hash); 1917 CK_LIST_REMOVE(inp, inp_portlist); 1918 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 1919 CK_LIST_REMOVE(phd, phd_hash); 1920 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 1921 } 1922 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1923 inp->inp_flags &= ~INP_INHASHLIST; 1924 } 1925 } 1926 1927 #ifdef INET 1928 /* 1929 * Common routines to return the socket addresses associated with inpcbs.
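 *
 * Note on ownership: in_sockaddr() allocates the sockaddr from M_SONAME
 * with M_WAITOK, so a caller of in_getsockaddr()/in_getpeeraddr() owns
 * *nam and must free it. A hypothetical caller:
 *
 *	struct sockaddr *sa;
 *	if (in_getpeeraddr(so, &sa) == 0) {
 *		... use sa ...
 *		free(sa, M_SONAME);
 *	}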
1930 */ 1931 struct sockaddr * 1932 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1933 { 1934 struct sockaddr_in *sin; 1935 1936 sin = malloc(sizeof *sin, M_SONAME, 1937 M_WAITOK | M_ZERO); 1938 sin->sin_family = AF_INET; 1939 sin->sin_len = sizeof(*sin); 1940 sin->sin_addr = *addr_p; 1941 sin->sin_port = port; 1942 1943 return (struct sockaddr *)sin; 1944 } 1945 1946 int 1947 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1948 { 1949 struct inpcb *inp; 1950 struct in_addr addr; 1951 in_port_t port; 1952 1953 inp = sotoinpcb(so); 1954 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1955 1956 INP_RLOCK(inp); 1957 port = inp->inp_lport; 1958 addr = inp->inp_laddr; 1959 INP_RUNLOCK(inp); 1960 1961 *nam = in_sockaddr(port, &addr); 1962 return 0; 1963 } 1964 1965 int 1966 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 1967 { 1968 struct inpcb *inp; 1969 struct in_addr addr; 1970 in_port_t port; 1971 1972 inp = sotoinpcb(so); 1973 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1974 1975 INP_RLOCK(inp); 1976 port = inp->inp_fport; 1977 addr = inp->inp_faddr; 1978 INP_RUNLOCK(inp); 1979 1980 *nam = in_sockaddr(port, &addr); 1981 return 0; 1982 } 1983 1984 void 1985 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1986 struct inpcb *(*notify)(struct inpcb *, int)) 1987 { 1988 struct inpcb *inp, *inp_temp; 1989 1990 INP_INFO_WLOCK(pcbinfo); 1991 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 1992 INP_WLOCK(inp); 1993 #ifdef INET6 1994 if ((inp->inp_vflag & INP_IPV4) == 0) { 1995 INP_WUNLOCK(inp); 1996 continue; 1997 } 1998 #endif 1999 if (inp->inp_faddr.s_addr != faddr.s_addr || 2000 inp->inp_socket == NULL) { 2001 INP_WUNLOCK(inp); 2002 continue; 2003 } 2004 if ((*notify)(inp, errno)) 2005 INP_WUNLOCK(inp); 2006 } 2007 INP_INFO_WUNLOCK(pcbinfo); 2008 } 2009 2010 static bool 2011 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 2012 { 2013 2014 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 2015 return (true); 2016 else 2017 return (false); 2018 } 2019 2020 void 2021 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 2022 { 2023 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 2024 inp_v4_multi_match, NULL); 2025 struct inpcb *inp; 2026 struct in_multi *inm; 2027 struct in_mfilter *imf; 2028 struct ip_moptions *imo; 2029 2030 IN_MULTI_LOCK_ASSERT(); 2031 2032 while ((inp = inp_next(&inpi)) != NULL) { 2033 INP_WLOCK_ASSERT(inp); 2034 2035 imo = inp->inp_moptions; 2036 /* 2037 * Unselect the outgoing interface if it is being 2038 * detached. 2039 */ 2040 if (imo->imo_multicast_ifp == ifp) 2041 imo->imo_multicast_ifp = NULL; 2042 2043 /* 2044 * Drop multicast group membership if we joined 2045 * through the interface being detached. 2046 * 2047 * XXX This can all be deferred to an epoch_call 2048 */ 2049 restart: 2050 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2051 if ((inm = imf->imf_inm) == NULL) 2052 continue; 2053 if (inm->inm_ifp != ifp) 2054 continue; 2055 ip_mfilter_remove(&imo->imo_head, imf); 2056 in_leavegroup_locked(inm, NULL); 2057 ip_mfilter_free(imf); 2058 goto restart; 2059 } 2060 } 2061 } 2062 2063 /* 2064 * Lookup a PCB based on the local address and port. Caller must hold the 2065 * hash lock. No inpcb locks or references are acquired. 
2066 */ 2067 #define INP_LOOKUP_MAPPED_PCB_COST 3 2068 struct inpcb * 2069 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2070 u_short lport, int lookupflags, struct ucred *cred) 2071 { 2072 struct inpcb *inp; 2073 #ifdef INET6 2074 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2075 #else 2076 int matchwild = 3; 2077 #endif 2078 int wildcard; 2079 2080 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2081 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2082 INP_HASH_LOCK_ASSERT(pcbinfo); 2083 2084 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2085 struct inpcbhead *head; 2086 /* 2087 * Look for an unconnected (wildcard foreign addr) PCB that 2088 * matches the local address and port we're looking for. 2089 */ 2090 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 2091 0, pcbinfo->ipi_hashmask)]; 2092 CK_LIST_FOREACH(inp, head, inp_hash) { 2093 #ifdef INET6 2094 /* XXX inp locking */ 2095 if ((inp->inp_vflag & INP_IPV4) == 0) 2096 continue; 2097 #endif 2098 if (inp->inp_faddr.s_addr == INADDR_ANY && 2099 inp->inp_laddr.s_addr == laddr.s_addr && 2100 inp->inp_lport == lport) { 2101 /* 2102 * Found? 2103 */ 2104 if (cred == NULL || 2105 prison_equal_ip4(cred->cr_prison, 2106 inp->inp_cred->cr_prison)) 2107 return (inp); 2108 } 2109 } 2110 /* 2111 * Not found. 2112 */ 2113 return (NULL); 2114 } else { 2115 struct inpcbporthead *porthash; 2116 struct inpcbport *phd; 2117 struct inpcb *match = NULL; 2118 /* 2119 * Best fit PCB lookup. 2120 * 2121 * First see if this local port is in use by looking on the 2122 * port hash list. 2123 */ 2124 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2125 pcbinfo->ipi_porthashmask)]; 2126 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2127 if (phd->phd_port == lport) 2128 break; 2129 } 2130 if (phd != NULL) { 2131 /* 2132 * Port is in use by one or more PCBs. Look for best 2133 * fit. 2134 */ 2135 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2136 wildcard = 0; 2137 if (cred != NULL && 2138 !prison_equal_ip4(inp->inp_cred->cr_prison, 2139 cred->cr_prison)) 2140 continue; 2141 #ifdef INET6 2142 /* XXX inp locking */ 2143 if ((inp->inp_vflag & INP_IPV4) == 0) 2144 continue; 2145 /* 2146 * We never select the PCB that has 2147 * INP_IPV6 flag and is bound to :: if 2148 * we have another PCB which is bound 2149 * to 0.0.0.0. If a PCB has the 2150 * INP_IPV6 flag, then we set its cost 2151 * higher than IPv4 only PCBs. 2152 * 2153 * Note that the case only happens 2154 * when a socket is bound to ::, under 2155 * the condition that the use of the 2156 * mapped address is allowed. 
2157 */ 2158 if ((inp->inp_vflag & INP_IPV6) != 0) 2159 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2160 #endif 2161 if (inp->inp_faddr.s_addr != INADDR_ANY) 2162 wildcard++; 2163 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2164 if (laddr.s_addr == INADDR_ANY) 2165 wildcard++; 2166 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2167 continue; 2168 } else { 2169 if (laddr.s_addr != INADDR_ANY) 2170 wildcard++; 2171 } 2172 if (wildcard < matchwild) { 2173 match = inp; 2174 matchwild = wildcard; 2175 if (matchwild == 0) 2176 break; 2177 } 2178 } 2179 } 2180 return (match); 2181 } 2182 } 2183 #undef INP_LOOKUP_MAPPED_PCB_COST 2184 2185 static struct inpcb * 2186 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2187 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 2188 uint16_t fport, int lookupflags, int numa_domain) 2189 { 2190 struct inpcb *local_wild, *numa_wild; 2191 const struct inpcblbgrouphead *hdr; 2192 struct inpcblbgroup *grp; 2193 uint32_t idx; 2194 2195 INP_HASH_LOCK_ASSERT(pcbinfo); 2196 2197 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2198 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 2199 2200 /* 2201 * Order of socket selection: 2202 * 1. non-wild. 2203 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). 2204 * 2205 * NOTE: 2206 * - Load balanced group does not contain jailed sockets 2207 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets 2208 */ 2209 local_wild = NULL; 2210 numa_wild = NULL; 2211 CK_LIST_FOREACH(grp, hdr, il_list) { 2212 #ifdef INET6 2213 if (!(grp->il_vflag & INP_IPV4)) 2214 continue; 2215 #endif 2216 if (grp->il_lport != lport) 2217 continue; 2218 2219 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % 2220 grp->il_inpcnt; 2221 if (grp->il_laddr.s_addr == laddr->s_addr) { 2222 if (numa_domain == M_NODOM || 2223 grp->il_numa_domain == numa_domain) { 2224 return (grp->il_inp[idx]); 2225 } else { 2226 numa_wild = grp->il_inp[idx]; 2227 } 2228 } 2229 if (grp->il_laddr.s_addr == INADDR_ANY && 2230 (lookupflags & INPLOOKUP_WILDCARD) != 0 && 2231 (local_wild == NULL || numa_domain == M_NODOM || 2232 grp->il_numa_domain == numa_domain)) { 2233 local_wild = grp->il_inp[idx]; 2234 } 2235 } 2236 if (numa_wild != NULL) 2237 return (numa_wild); 2238 2239 return (local_wild); 2240 } 2241 2242 /* 2243 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2244 * that the caller has either locked the hash list, which usually happens 2245 * for bind(2) operations, or is in SMR section, which happens when sorting 2246 * out incoming packets. 2247 */ 2248 static struct inpcb * 2249 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2250 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2251 struct ifnet *ifp, uint8_t numa_domain) 2252 { 2253 struct inpcbhead *head; 2254 struct inpcb *inp, *tmpinp; 2255 u_short fport = fport_arg, lport = lport_arg; 2256 2257 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2258 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2259 INP_HASH_LOCK_ASSERT(pcbinfo); 2260 2261 /* 2262 * First look for an exact match. 
2263 */ 2264 tmpinp = NULL; 2265 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, 2266 pcbinfo->ipi_hashmask)]; 2267 CK_LIST_FOREACH(inp, head, inp_hash) { 2268 #ifdef INET6 2269 /* XXX inp locking */ 2270 if ((inp->inp_vflag & INP_IPV4) == 0) 2271 continue; 2272 #endif 2273 if (inp->inp_faddr.s_addr == faddr.s_addr && 2274 inp->inp_laddr.s_addr == laddr.s_addr && 2275 inp->inp_fport == fport && 2276 inp->inp_lport == lport) { 2277 /* 2278 * XXX We should be able to directly return 2279 * the inp here, without any checks. 2280 * Well unless both bound with SO_REUSEPORT? 2281 */ 2282 if (prison_flag(inp->inp_cred, PR_IP4)) 2283 return (inp); 2284 if (tmpinp == NULL) 2285 tmpinp = inp; 2286 } 2287 } 2288 if (tmpinp != NULL) 2289 return (tmpinp); 2290 2291 /* 2292 * Then look in lb group (for wildcard match). 2293 */ 2294 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2295 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, 2296 fport, lookupflags, numa_domain); 2297 if (inp != NULL) 2298 return (inp); 2299 } 2300 2301 /* 2302 * Then look for a wildcard match, if requested. 2303 */ 2304 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2305 struct inpcb *local_wild = NULL, *local_exact = NULL; 2306 #ifdef INET6 2307 struct inpcb *local_wild_mapped = NULL; 2308 #endif 2309 struct inpcb *jail_wild = NULL; 2310 int injail; 2311 2312 /* 2313 * Order of socket selection - we always prefer jails. 2314 * 1. jailed, non-wild. 2315 * 2. jailed, wild. 2316 * 3. non-jailed, non-wild. 2317 * 4. non-jailed, wild. 2318 */ 2319 2320 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 2321 0, pcbinfo->ipi_hashmask)]; 2322 CK_LIST_FOREACH(inp, head, inp_hash) { 2323 #ifdef INET6 2324 /* XXX inp locking */ 2325 if ((inp->inp_vflag & INP_IPV4) == 0) 2326 continue; 2327 #endif 2328 if (inp->inp_faddr.s_addr != INADDR_ANY || 2329 inp->inp_lport != lport) 2330 continue; 2331 2332 injail = prison_flag(inp->inp_cred, PR_IP4); 2333 if (injail) { 2334 if (prison_check_ip4_locked( 2335 inp->inp_cred->cr_prison, &laddr) != 0) 2336 continue; 2337 } else { 2338 if (local_exact != NULL) 2339 continue; 2340 } 2341 2342 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2343 if (injail) 2344 return (inp); 2345 else 2346 local_exact = inp; 2347 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2348 #ifdef INET6 2349 /* XXX inp locking, NULL check */ 2350 if (inp->inp_vflag & INP_IPV6PROTO) 2351 local_wild_mapped = inp; 2352 else 2353 #endif 2354 if (injail) 2355 jail_wild = inp; 2356 else 2357 local_wild = inp; 2358 } 2359 } /* LIST_FOREACH */ 2360 if (jail_wild != NULL) 2361 return (jail_wild); 2362 if (local_exact != NULL) 2363 return (local_exact); 2364 if (local_wild != NULL) 2365 return (local_wild); 2366 #ifdef INET6 2367 if (local_wild_mapped != NULL) 2368 return (local_wild_mapped); 2369 #endif 2370 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ 2371 2372 return (NULL); 2373 } 2374 2375 /* 2376 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the 2377 * hash list lock, and will return the inpcb locked (i.e., requires 2378 * INPLOOKUP_LOCKPCB). 
2379 */ 2380 static struct inpcb * 2381 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2382 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2383 struct ifnet *ifp, uint8_t numa_domain) 2384 { 2385 struct inpcb *inp; 2386 2387 smr_enter(pcbinfo->ipi_smr); 2388 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2389 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2390 if (inp != NULL) { 2391 if (__predict_false(inp_smr_lock(inp, 2392 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2393 inp = NULL; 2394 } else 2395 smr_exit(pcbinfo->ipi_smr); 2396 2397 return (inp); 2398 } 2399 2400 /* 2401 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2402 * from which a pre-calculated hash value may be extracted. 2403 */ 2404 struct inpcb * 2405 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2406 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2407 { 2408 2409 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2410 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2411 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2412 ("%s: LOCKPCB not set", __func__)); 2413 2414 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2415 lookupflags, ifp, M_NODOM)); 2416 } 2417 2418 struct inpcb * 2419 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2420 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2421 struct ifnet *ifp, struct mbuf *m) 2422 { 2423 2424 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2425 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2426 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2427 ("%s: LOCKPCB not set", __func__)); 2428 2429 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2430 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2431 } 2432 #endif /* INET */ 2433 2434 /* 2435 * Insert PCB onto various hash lists. 2436 */ 2437 int 2438 in_pcbinshash(struct inpcb *inp) 2439 { 2440 struct inpcbhead *pcbhash; 2441 struct inpcbporthead *pcbporthash; 2442 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2443 struct inpcbport *phd; 2444 u_int32_t hashkey_faddr; 2445 int so_options; 2446 2447 INP_WLOCK_ASSERT(inp); 2448 INP_HASH_WLOCK_ASSERT(pcbinfo); 2449 2450 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2451 ("in_pcbinshash: INP_INHASHLIST")); 2452 2453 #ifdef INET6 2454 if (inp->inp_vflag & INP_IPV6) 2455 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 2456 else 2457 #endif 2458 hashkey_faddr = inp->inp_faddr.s_addr; 2459 2460 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 2461 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2462 2463 pcbporthash = &pcbinfo->ipi_porthashbase[ 2464 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2465 2466 /* 2467 * Add entry to load balance group. 2468 * Only do this if SO_REUSEPORT_LB is set. 2469 */ 2470 so_options = inp_so_options(inp); 2471 if (so_options & SO_REUSEPORT_LB) { 2472 int ret = in_pcbinslbgrouphash(inp, M_NODOM); 2473 if (ret) { 2474 /* pcb lb group malloc fail (ret=ENOBUFS). */ 2475 return (ret); 2476 } 2477 } 2478 2479 /* 2480 * Go through port list and look for a head for this lport. 2481 */ 2482 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2483 if (phd->phd_port == inp->inp_lport) 2484 break; 2485 } 2486 /* 2487 * If none exists, malloc one and tack it on. 
2488 */ 2489 if (phd == NULL) { 2490 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2491 if (phd == NULL) { 2492 return (ENOBUFS); /* XXX */ 2493 } 2494 phd->phd_port = inp->inp_lport; 2495 CK_LIST_INIT(&phd->phd_pcblist); 2496 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2497 } 2498 inp->inp_phd = phd; 2499 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2500 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2501 inp->inp_flags |= INP_INHASHLIST; 2502 2503 return (0); 2504 } 2505 2506 /* 2507 * Move PCB to the proper hash bucket when { faddr, fport } have been 2508 * changed. NOTE: This does not handle the case of the lport changing (the 2509 * hashed port list would have to be updated as well), so the lport must 2510 * not change after in_pcbinshash() has been called. 2511 * 2512 * XXXGL: a race between this function and SMR-protected hash iterator 2513 * will lead to iterator traversing a possibly wrong hash list. However, 2514 * this race should have been here since change from rwlock to epoch. 2515 */ 2516 void 2517 in_pcbrehash(struct inpcb *inp) 2518 { 2519 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2520 struct inpcbhead *head; 2521 u_int32_t hashkey_faddr; 2522 2523 INP_WLOCK_ASSERT(inp); 2524 INP_HASH_WLOCK_ASSERT(pcbinfo); 2525 2526 KASSERT(inp->inp_flags & INP_INHASHLIST, 2527 ("in_pcbrehash: !INP_INHASHLIST")); 2528 2529 #ifdef INET6 2530 if (inp->inp_vflag & INP_IPV6) 2531 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 2532 else 2533 #endif 2534 hashkey_faddr = inp->inp_faddr.s_addr; 2535 2536 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 2537 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2538 2539 CK_LIST_REMOVE(inp, inp_hash); 2540 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2541 } 2542 2543 /* 2544 * Check for alternatives when higher level complains 2545 * about service problems. For now, invalidate cached 2546 * routing information. If the route was created dynamically 2547 * (by a redirect), time to try a default gateway again. 2548 */ 2549 void 2550 in_losing(struct inpcb *inp) 2551 { 2552 2553 RO_INVALIDATE_CACHE(&inp->inp_route); 2554 return; 2555 } 2556 2557 /* 2558 * A set label operation has occurred at the socket layer, propagate the 2559 * label change into the in_pcb for the socket. 2560 */ 2561 void 2562 in_pcbsosetlabel(struct socket *so) 2563 { 2564 #ifdef MAC 2565 struct inpcb *inp; 2566 2567 inp = sotoinpcb(so); 2568 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2569 2570 INP_WLOCK(inp); 2571 SOCK_LOCK(so); 2572 mac_inpcb_sosetlabel(so, inp); 2573 SOCK_UNLOCK(so); 2574 INP_WUNLOCK(inp); 2575 #endif 2576 } 2577 2578 /* 2579 * ipport_tick runs once per second, determining if random port allocation 2580 * should be continued. If more than ipport_randomcps ports have been 2581 * allocated in the last second, then we return to sequential port 2582 * allocation. We return to random allocation only once we drop below 2583 * ipport_randomcps for at least ipport_randomtime seconds. 
2584 */ 2585 static void 2586 ipport_tick(void *xtp) 2587 { 2588 VNET_ITERATOR_DECL(vnet_iter); 2589 2590 VNET_LIST_RLOCK_NOSLEEP(); 2591 VNET_FOREACH(vnet_iter) { 2592 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ 2593 if (V_ipport_tcpallocs - V_ipport_tcplastcount <= 2594 V_ipport_randomcps) { 2595 if (V_ipport_stoprandom > 0) 2596 V_ipport_stoprandom--; 2597 } else 2598 V_ipport_stoprandom = V_ipport_randomtime; 2599 V_ipport_tcplastcount = V_ipport_tcpallocs; 2600 CURVNET_RESTORE(); 2601 } 2602 VNET_LIST_RUNLOCK_NOSLEEP(); 2603 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); 2604 } 2605 2606 static void 2607 ip_fini(void *xtp) 2608 { 2609 2610 callout_stop(&ipport_tick_callout); 2611 } 2612 2613 /* 2614 * The ipport_callout should start running at about the time we attach the 2615 * inet or inet6 domains. 2616 */ 2617 static void 2618 ipport_tick_init(const void *unused __unused) 2619 { 2620 2621 /* Start ipport_tick. */ 2622 callout_init(&ipport_tick_callout, 1); 2623 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); 2624 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 2625 SHUTDOWN_PRI_DEFAULT); 2626 } 2627 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 2628 ipport_tick_init, NULL); 2629 2630 void 2631 inp_wlock(struct inpcb *inp) 2632 { 2633 2634 INP_WLOCK(inp); 2635 } 2636 2637 void 2638 inp_wunlock(struct inpcb *inp) 2639 { 2640 2641 INP_WUNLOCK(inp); 2642 } 2643 2644 void 2645 inp_rlock(struct inpcb *inp) 2646 { 2647 2648 INP_RLOCK(inp); 2649 } 2650 2651 void 2652 inp_runlock(struct inpcb *inp) 2653 { 2654 2655 INP_RUNLOCK(inp); 2656 } 2657 2658 #ifdef INVARIANT_SUPPORT 2659 void 2660 inp_lock_assert(struct inpcb *inp) 2661 { 2662 2663 INP_WLOCK_ASSERT(inp); 2664 } 2665 2666 void 2667 inp_unlock_assert(struct inpcb *inp) 2668 { 2669 2670 INP_UNLOCK_ASSERT(inp); 2671 } 2672 #endif 2673 2674 void 2675 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) 2676 { 2677 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, 2678 INPLOOKUP_WLOCKPCB); 2679 struct inpcb *inp; 2680 2681 while ((inp = inp_next(&inpi)) != NULL) 2682 func(inp, arg); 2683 } 2684 2685 struct socket * 2686 inp_inpcbtosocket(struct inpcb *inp) 2687 { 2688 2689 INP_WLOCK_ASSERT(inp); 2690 return (inp->inp_socket); 2691 } 2692 2693 struct tcpcb * 2694 inp_inpcbtotcpcb(struct inpcb *inp) 2695 { 2696 2697 INP_WLOCK_ASSERT(inp); 2698 return ((struct tcpcb *)inp->inp_ppcb); 2699 } 2700 2701 int 2702 inp_ip_tos_get(const struct inpcb *inp) 2703 { 2704 2705 return (inp->inp_ip_tos); 2706 } 2707 2708 void 2709 inp_ip_tos_set(struct inpcb *inp, int val) 2710 { 2711 2712 inp->inp_ip_tos = val; 2713 } 2714 2715 void 2716 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2717 uint32_t *faddr, uint16_t *fp) 2718 { 2719 2720 INP_LOCK_ASSERT(inp); 2721 *laddr = inp->inp_laddr.s_addr; 2722 *faddr = inp->inp_faddr.s_addr; 2723 *lp = inp->inp_lport; 2724 *fp = inp->inp_fport; 2725 } 2726 2727 struct inpcb * 2728 so_sotoinpcb(struct socket *so) 2729 { 2730 2731 return (sotoinpcb(so)); 2732 } 2733 2734 struct tcpcb * 2735 so_sototcpcb(struct socket *so) 2736 { 2737 2738 return (sototcpcb(so)); 2739 } 2740 2741 /* 2742 * Create an external-format (``xinpcb'') structure using the information in 2743 * the kernel-format in_pcb structure pointed to by inp. 
This is done to 2744 * reduce the spew of irrelevant information over this interface, to isolate 2745 * user code from changes in the kernel structure, and potentially to provide 2746 * information-hiding if we decide that some of this information should be 2747 * hidden from users. 2748 */ 2749 void 2750 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2751 { 2752 2753 bzero(xi, sizeof(*xi)); 2754 xi->xi_len = sizeof(struct xinpcb); 2755 if (inp->inp_socket) 2756 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2757 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2758 xi->inp_gencnt = inp->inp_gencnt; 2759 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2760 xi->inp_flow = inp->inp_flow; 2761 xi->inp_flowid = inp->inp_flowid; 2762 xi->inp_flowtype = inp->inp_flowtype; 2763 xi->inp_flags = inp->inp_flags; 2764 xi->inp_flags2 = inp->inp_flags2; 2765 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2766 xi->in6p_cksum = inp->in6p_cksum; 2767 xi->in6p_hops = inp->in6p_hops; 2768 xi->inp_ip_tos = inp->inp_ip_tos; 2769 xi->inp_vflag = inp->inp_vflag; 2770 xi->inp_ip_ttl = inp->inp_ip_ttl; 2771 xi->inp_ip_p = inp->inp_ip_p; 2772 xi->inp_ip_minttl = inp->inp_ip_minttl; 2773 } 2774 2775 #ifdef DDB 2776 static void 2777 db_print_indent(int indent) 2778 { 2779 int i; 2780 2781 for (i = 0; i < indent; i++) 2782 db_printf(" "); 2783 } 2784 2785 static void 2786 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2787 { 2788 char faddr_str[48], laddr_str[48]; 2789 2790 db_print_indent(indent); 2791 db_printf("%s at %p\n", name, inc); 2792 2793 indent += 2; 2794 2795 #ifdef INET6 2796 if (inc->inc_flags & INC_ISIPV6) { 2797 /* IPv6. */ 2798 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2799 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2800 } else 2801 #endif 2802 { 2803 /* IPv4. */ 2804 inet_ntoa_r(inc->inc_laddr, laddr_str); 2805 inet_ntoa_r(inc->inc_faddr, faddr_str); 2806 } 2807 db_print_indent(indent); 2808 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2809 ntohs(inc->inc_lport)); 2810 db_print_indent(indent); 2811 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2812 ntohs(inc->inc_fport)); 2813 } 2814 2815 static void 2816 db_print_inpflags(int inp_flags) 2817 { 2818 int comma; 2819 2820 comma = 0; 2821 if (inp_flags & INP_RECVOPTS) { 2822 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2823 comma = 1; 2824 } 2825 if (inp_flags & INP_RECVRETOPTS) { 2826 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2827 comma = 1; 2828 } 2829 if (inp_flags & INP_RECVDSTADDR) { 2830 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2831 comma = 1; 2832 } 2833 if (inp_flags & INP_ORIGDSTADDR) { 2834 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2835 comma = 1; 2836 } 2837 if (inp_flags & INP_HDRINCL) { 2838 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2839 comma = 1; 2840 } 2841 if (inp_flags & INP_HIGHPORT) { 2842 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2843 comma = 1; 2844 } 2845 if (inp_flags & INP_LOWPORT) { 2846 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 2847 comma = 1; 2848 } 2849 if (inp_flags & INP_ANONPORT) { 2850 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2851 comma = 1; 2852 } 2853 if (inp_flags & INP_RECVIF) { 2854 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2855 comma = 1; 2856 } 2857 if (inp_flags & INP_MTUDISC) { 2858 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2859 comma = 1; 2860 } 2861 if (inp_flags & INP_RECVTTL) { 2862 db_printf("%sINP_RECVTTL", comma ? 
", " : ""); 2863 comma = 1; 2864 } 2865 if (inp_flags & INP_DONTFRAG) { 2866 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2867 comma = 1; 2868 } 2869 if (inp_flags & INP_RECVTOS) { 2870 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2871 comma = 1; 2872 } 2873 if (inp_flags & IN6P_IPV6_V6ONLY) { 2874 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2875 comma = 1; 2876 } 2877 if (inp_flags & IN6P_PKTINFO) { 2878 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2879 comma = 1; 2880 } 2881 if (inp_flags & IN6P_HOPLIMIT) { 2882 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2883 comma = 1; 2884 } 2885 if (inp_flags & IN6P_HOPOPTS) { 2886 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 2887 comma = 1; 2888 } 2889 if (inp_flags & IN6P_DSTOPTS) { 2890 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 2891 comma = 1; 2892 } 2893 if (inp_flags & IN6P_RTHDR) { 2894 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 2895 comma = 1; 2896 } 2897 if (inp_flags & IN6P_RTHDRDSTOPTS) { 2898 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 2899 comma = 1; 2900 } 2901 if (inp_flags & IN6P_TCLASS) { 2902 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 2903 comma = 1; 2904 } 2905 if (inp_flags & IN6P_AUTOFLOWLABEL) { 2906 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 2907 comma = 1; 2908 } 2909 if (inp_flags & INP_TIMEWAIT) { 2910 db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); 2911 comma = 1; 2912 } 2913 if (inp_flags & INP_ONESBCAST) { 2914 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 2915 comma = 1; 2916 } 2917 if (inp_flags & INP_DROPPED) { 2918 db_printf("%sINP_DROPPED", comma ? ", " : ""); 2919 comma = 1; 2920 } 2921 if (inp_flags & INP_SOCKREF) { 2922 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 2923 comma = 1; 2924 } 2925 if (inp_flags & IN6P_RFC2292) { 2926 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 2927 comma = 1; 2928 } 2929 if (inp_flags & IN6P_MTU) { 2930 db_printf("IN6P_MTU%s", comma ? ", " : ""); 2931 comma = 1; 2932 } 2933 } 2934 2935 static void 2936 db_print_inpvflag(u_char inp_vflag) 2937 { 2938 int comma; 2939 2940 comma = 0; 2941 if (inp_vflag & INP_IPV4) { 2942 db_printf("%sINP_IPV4", comma ? ", " : ""); 2943 comma = 1; 2944 } 2945 if (inp_vflag & INP_IPV6) { 2946 db_printf("%sINP_IPV6", comma ? ", " : ""); 2947 comma = 1; 2948 } 2949 if (inp_vflag & INP_IPV6PROTO) { 2950 db_printf("%sINP_IPV6PROTO", comma ? 
", " : ""); 2951 comma = 1; 2952 } 2953 } 2954 2955 static void 2956 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 2957 { 2958 2959 db_print_indent(indent); 2960 db_printf("%s at %p\n", name, inp); 2961 2962 indent += 2; 2963 2964 db_print_indent(indent); 2965 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 2966 2967 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 2968 2969 db_print_indent(indent); 2970 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 2971 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 2972 2973 db_print_indent(indent); 2974 db_printf("inp_label: %p inp_flags: 0x%x (", 2975 inp->inp_label, inp->inp_flags); 2976 db_print_inpflags(inp->inp_flags); 2977 db_printf(")\n"); 2978 2979 db_print_indent(indent); 2980 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 2981 inp->inp_vflag); 2982 db_print_inpvflag(inp->inp_vflag); 2983 db_printf(")\n"); 2984 2985 db_print_indent(indent); 2986 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 2987 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 2988 2989 db_print_indent(indent); 2990 #ifdef INET6 2991 if (inp->inp_vflag & INP_IPV6) { 2992 db_printf("in6p_options: %p in6p_outputopts: %p " 2993 "in6p_moptions: %p\n", inp->in6p_options, 2994 inp->in6p_outputopts, inp->in6p_moptions); 2995 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 2996 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 2997 inp->in6p_hops); 2998 } else 2999 #endif 3000 { 3001 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3002 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3003 inp->inp_options, inp->inp_moptions); 3004 } 3005 3006 db_print_indent(indent); 3007 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3008 (uintmax_t)inp->inp_gencnt); 3009 } 3010 3011 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3012 { 3013 struct inpcb *inp; 3014 3015 if (!have_addr) { 3016 db_printf("usage: show inpcb <addr>\n"); 3017 return; 3018 } 3019 inp = (struct inpcb *)addr; 3020 3021 db_print_inpcb(inp, "inpcb", 0); 3022 } 3023 #endif /* DDB */ 3024 3025 #ifdef RATELIMIT 3026 /* 3027 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3028 * if any. 3029 */ 3030 int 3031 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3032 { 3033 union if_snd_tag_modify_params params = { 3034 .rate_limit.max_rate = max_pacing_rate, 3035 .rate_limit.flags = M_NOWAIT, 3036 }; 3037 struct m_snd_tag *mst; 3038 int error; 3039 3040 mst = inp->inp_snd_tag; 3041 if (mst == NULL) 3042 return (EINVAL); 3043 3044 if (mst->sw->snd_tag_modify == NULL) { 3045 error = EOPNOTSUPP; 3046 } else { 3047 error = mst->sw->snd_tag_modify(mst, ¶ms); 3048 } 3049 return (error); 3050 } 3051 3052 /* 3053 * Query existing TX rate limit based on the existing 3054 * "inp->inp_snd_tag", if any. 3055 */ 3056 int 3057 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3058 { 3059 union if_snd_tag_query_params params = { }; 3060 struct m_snd_tag *mst; 3061 int error; 3062 3063 mst = inp->inp_snd_tag; 3064 if (mst == NULL) 3065 return (EINVAL); 3066 3067 if (mst->sw->snd_tag_query == NULL) { 3068 error = EOPNOTSUPP; 3069 } else { 3070 error = mst->sw->snd_tag_query(mst, ¶ms); 3071 if (error == 0 && p_max_pacing_rate != NULL) 3072 *p_max_pacing_rate = params.rate_limit.max_rate; 3073 } 3074 return (error); 3075 } 3076 3077 /* 3078 * Query existing TX queue level based on the existing 3079 * "inp->inp_snd_tag", if any. 
3080 */ 3081 int 3082 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3083 { 3084 union if_snd_tag_query_params params = { }; 3085 struct m_snd_tag *mst; 3086 int error; 3087 3088 mst = inp->inp_snd_tag; 3089 if (mst == NULL) 3090 return (EINVAL); 3091 3092 if (mst->sw->snd_tag_query == NULL) 3093 return (EOPNOTSUPP); 3094 3095 error = mst->sw->snd_tag_query(mst, ¶ms); 3096 if (error == 0 && p_txqueue_level != NULL) 3097 *p_txqueue_level = params.rate_limit.queue_level; 3098 return (error); 3099 } 3100 3101 /* 3102 * Allocate a new TX rate limit send tag from the network interface 3103 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3104 */ 3105 int 3106 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3107 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3108 3109 { 3110 union if_snd_tag_alloc_params params = { 3111 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3112 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3113 .rate_limit.hdr.flowid = flowid, 3114 .rate_limit.hdr.flowtype = flowtype, 3115 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3116 .rate_limit.max_rate = max_pacing_rate, 3117 .rate_limit.flags = M_NOWAIT, 3118 }; 3119 int error; 3120 3121 INP_WLOCK_ASSERT(inp); 3122 3123 /* 3124 * If there is already a send tag, or the INP is being torn 3125 * down, allocating a new send tag is not allowed. Else send 3126 * tags may leak. 3127 */ 3128 if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0) 3129 return (EINVAL); 3130 3131 error = m_snd_tag_alloc(ifp, ¶ms, st); 3132 #ifdef INET 3133 if (error == 0) { 3134 counter_u64_add(rate_limit_set_ok, 1); 3135 counter_u64_add(rate_limit_active, 1); 3136 } else if (error != EOPNOTSUPP) 3137 counter_u64_add(rate_limit_alloc_fail, 1); 3138 #endif 3139 return (error); 3140 } 3141 3142 void 3143 in_pcbdetach_tag(struct m_snd_tag *mst) 3144 { 3145 3146 m_snd_tag_rele(mst); 3147 #ifdef INET 3148 counter_u64_add(rate_limit_active, -1); 3149 #endif 3150 } 3151 3152 /* 3153 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3154 * if any: 3155 */ 3156 void 3157 in_pcbdetach_txrtlmt(struct inpcb *inp) 3158 { 3159 struct m_snd_tag *mst; 3160 3161 INP_WLOCK_ASSERT(inp); 3162 3163 mst = inp->inp_snd_tag; 3164 inp->inp_snd_tag = NULL; 3165 3166 if (mst == NULL) 3167 return; 3168 3169 m_snd_tag_rele(mst); 3170 #ifdef INET 3171 counter_u64_add(rate_limit_active, -1); 3172 #endif 3173 } 3174 3175 int 3176 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3177 { 3178 int error; 3179 3180 /* 3181 * If the existing send tag is for the wrong interface due to 3182 * a route change, first drop the existing tag. Set the 3183 * CHANGED flag so that we will keep trying to allocate a new 3184 * tag if we fail to allocate one this time. 3185 */ 3186 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3187 in_pcbdetach_txrtlmt(inp); 3188 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3189 } 3190 3191 /* 3192 * NOTE: When attaching to a network interface a reference is 3193 * made to ensure the network interface doesn't go away until 3194 * all ratelimit connections are gone. The network interface 3195 * pointers compared below represent valid network interfaces, 3196 * except when comparing towards NULL. 
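 *
 * A sketch of the cases handled below:
 *	no rate requested, no tag attached	- nothing to do;
 *	interface lacks IFCAP_TXRTLMT		- drop any stale tag;
 *	no tag attached yet			- allocate one, which
 *						  requires a valid RSS hash;
 *	tag already attached			- modify it in place.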
3197 */ 3198 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3199 error = 0; 3200 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3201 if (inp->inp_snd_tag != NULL) 3202 in_pcbdetach_txrtlmt(inp); 3203 error = 0; 3204 } else if (inp->inp_snd_tag == NULL) { 3205 /* 3206 * In order to utilize packet pacing with RSS, we need 3207 * to wait until there is a valid RSS hash before we 3208 * can proceed: 3209 */ 3210 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3211 error = EAGAIN; 3212 } else { 3213 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3214 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3215 } 3216 } else { 3217 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3218 } 3219 if (error == 0 || error == EOPNOTSUPP) 3220 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3221 3222 return (error); 3223 } 3224 3225 /* 3226 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3227 * is set in the fast path and will attach/detach/modify the TX rate 3228 * limit send tag based on the socket's so_max_pacing_rate value. 3229 */ 3230 void 3231 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3232 { 3233 struct socket *socket; 3234 uint32_t max_pacing_rate; 3235 bool did_upgrade; 3236 int error; 3237 3238 if (inp == NULL) 3239 return; 3240 3241 socket = inp->inp_socket; 3242 if (socket == NULL) 3243 return; 3244 3245 if (!INP_WLOCKED(inp)) { 3246 /* 3247 * NOTE: If the write locking fails, we need to bail 3248 * out and use the non-ratelimited ring for the 3249 * transmit until there is a new chance to get the 3250 * write lock. 3251 */ 3252 if (!INP_TRY_UPGRADE(inp)) 3253 return; 3254 did_upgrade = 1; 3255 } else { 3256 did_upgrade = 0; 3257 } 3258 3259 /* 3260 * NOTE: The so_max_pacing_rate value is read unlocked, 3261 * because atomic updates are not required since the variable 3262 * is checked at every mbuf we send. It is assumed that the 3263 * variable read itself will be atomic. 3264 */ 3265 max_pacing_rate = socket->so_max_pacing_rate; 3266 3267 error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3268 3269 if (did_upgrade) 3270 INP_DOWNGRADE(inp); 3271 } 3272 3273 /* 3274 * Track route changes for TX rate limiting. 3275 */ 3276 void 3277 in_pcboutput_eagain(struct inpcb *inp) 3278 { 3279 bool did_upgrade; 3280 3281 if (inp == NULL) 3282 return; 3283 3284 if (inp->inp_snd_tag == NULL) 3285 return; 3286 3287 if (!INP_WLOCKED(inp)) { 3288 /* 3289 * NOTE: If the write locking fails, we need to bail 3290 * out and use the non-ratelimited ring for the 3291 * transmit until there is a new chance to get the 3292 * write lock. 3293 */ 3294 if (!INP_TRY_UPGRADE(inp)) 3295 return; 3296 did_upgrade = 1; 3297 } else { 3298 did_upgrade = 0; 3299 } 3300 3301 /* detach rate limiting */ 3302 in_pcbdetach_txrtlmt(inp); 3303 3304 /* make sure new mbuf send tag allocation is made */ 3305 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3306 3307 if (did_upgrade) 3308 INP_DOWNGRADE(inp); 3309 } 3310 3311 #ifdef INET 3312 static void 3313 rl_init(void *st) 3314 { 3315 rate_limit_new = counter_u64_alloc(M_WAITOK); 3316 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3317 rate_limit_active = counter_u64_alloc(M_WAITOK); 3318 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3319 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3320 } 3321 3322 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3323 #endif 3324 #endif /* RATELIMIT */ 3325