1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
 */

#include <sys/cdefs.h>
#include "opt_ddb.h"
#include "opt_ipsec.h"
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"
#include "opt_route.h"
#include "opt_rss.h"

#include <sys/param.h>
#include <sys/hash.h>
#include <sys/systm.h>
#include <sys/libkern.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/eventhandler.h>
#include <sys/domain.h>
#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/smp.h>
#include <sys/smr.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/priv.h>
#include <sys/proc.h>
#include <sys/refcount.h>
#include <sys/jail.h>
#include <sys/kernel.h>
#include <sys/sysctl.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <vm/uma.h>
#include <vm/vm.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_types.h>
#include <net/if_llatbl.h>
#include <net/route.h>
#include <net/rss_config.h>
#include <net/vnet.h>

#if defined(INET) || defined(INET6)
#include <netinet/in.h>
#include <netinet/in_pcb.h>
#include <netinet/in_pcb_var.h>
#include <netinet/tcp.h>
#ifdef INET
#include <netinet/in_var.h>
#include <netinet/in_fib.h>
#endif
#include <netinet/ip_var.h>
#ifdef INET6
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/in6_var.h>
#include <netinet6/ip6_var.h>
#endif /* INET6 */
#include <net/route/nhop.h>
#endif

#include <netipsec/ipsec_support.h>

#include <security/mac/mac_framework.h>

/*
 * Bounds on the inpcb array size of a load-balance (SO_REUSEPORT_LB)
 * group; groups grow and shrink by a factor of two between these limits
 * (see in_pcbinslbgrouphash() / in_pcblbgroup_reorder()).
 */
#define	INPCBLBGROUP_SIZMIN	8
#define	INPCBLBGROUP_SIZMAX	256

#define	INP_FREED	0x00000200	/* Went through in_pcbfree(). */
#define	INP_INLBGROUP	0x01000000	/* Inserted into inpcblbgroup. */

/*
 * These configure the range of local port addresses assigned to
 * "unspecified" outgoing connections/packets/whatever.
 */
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART;	/* 600 */
VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST;	/* 10000 */
VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST;	/* 65535 */
VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO;	/* 49152 */
VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO;	/* 65535 */

/*
 * Reserved ports accessible only to root. There are significant
 * security considerations that must be accounted for when changing these,
 * but the security benefits can be great. Please be careful.
 */
VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1;	/* 1023 */
VNET_DEFINE(int, ipport_reservedlow);

/* Enable random ephemeral port allocation by default. */
VNET_DEFINE(int, ipport_randomized) = 1;

#ifdef INET
static struct inpcb	*in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
			    struct in_addr faddr, u_int fport_arg,
			    struct in_addr laddr, u_int lport_arg,
			    int lookupflags, uint8_t numa_domain);

/*
 * Clamp 'var' into [min, max].  Multi-statement body; kept as an
 * if/else chain (not do { } while (0)) per the existing usage below,
 * where each invocation stands alone as a full statement.
 */
#define RANGECHK(var, min, max) \
	if ((var) < (min)) { (var) = (min); } \
	else if ((var) > (max)) { (var) = (max); }

/*
 * Common sysctl handler for all net.inet.ip.portrange.* knobs: accept
 * the new value, then clamp every range variable back into its legal
 * window so an out-of-range write cannot leave the ranges inconsistent.
 */
static int
sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS)
{
	int error;

	error = sysctl_handle_int(oidp, arg1, arg2, req);
	if (error == 0) {
		RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1);
		RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX);
		RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX);
	}
	return (error);
}

#undef RANGECHK

static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange,
    CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "IP Ports");

SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast,
    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
    &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I",
    "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh,
    CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE,
    &VNET_NAME(ipport_reservedhigh), 0, "");
/* NOTE(review): reservedlow lacks CTLFLAG_VNET although the variable is
 * VNET_DEFINE'd above — confirm whether this is intentional. */
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow,
    CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, "");
SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized,
    CTLFLAG_VNET | CTLFLAG_RW,
    &VNET_NAME(ipport_randomized), 0, "Enable random port allocation");

#ifdef RATELIMIT
counter_u64_t rate_limit_new;
counter_u64_t rate_limit_chg;
counter_u64_t rate_limit_active;
counter_u64_t rate_limit_alloc_fail;
counter_u64_t rate_limit_set_ok;

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "IP Rate Limiting");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD,
    &rate_limit_active, "Active rate limited connections");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD,
    &rate_limit_alloc_fail, "Rate limited connection failures");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD,
    &rate_limit_set_ok, "Rate limited setting succeeded");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD,
    &rate_limit_new, "Total Rate limit new attempts");
SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD,
    &rate_limit_chg, "Total Rate limited change attempts");
#endif /* RATELIMIT */

#endif /* INET */

/* Per-VNET seed mixed into the inpcb hash computations. */
VNET_DEFINE(uint32_t, in_pcbhashseed);

/* Seed the per-VNET hash at VNET start-up with a random value. */
static void
in_pcbhashseed_init(void)
{

	V_in_pcbhashseed = arc4random();
}
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, 0);

VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 1;
#define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
    "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");

static void in_pcbremhash(struct inpcb *);

/*
 * in_pcb.c: manage the Protocol Control Blocks.
 *
 * NOTE: It is assumed that most of these functions will be called with
 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
 * functions often modify hash chains or addresses in pcbs.
 */

/*
 * Allocate a load-balance group with room for 'size' inpcb pointers and
 * insert it at the head of hash chain 'hdr'.  Takes a reference on 'cred'.
 * Returns NULL on (M_NOWAIT) allocation failure.
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, struct ucred *cred,
    u_char vflag, uint16_t port, const union in_dependaddr *addr, int size,
    uint8_t numa_domain)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	/* Header plus 'size' trailing il_inp[] slots in one allocation. */
	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (grp == NULL)
		return (NULL);
	grp->il_cred = crhold(cred);
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	return (grp);
}

/*
 * Deferred destructor for a load-balance group; runs once the network
 * epoch guarantees no readers still hold a reference to the group.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	crfree(grp->il_cred);
	free(grp, M_PCB);
}

/*
 * Unlink a group from its hash chain and schedule its destruction after
 * the current network epoch drains.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}

/*
 * Replace 'old_grp' with a new group of 'size' slots carrying the same
 * identity and the same member inpcbs; the old group is freed.  Returns
 * NULL and leaves 'old_grp' untouched if allocation fails.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(hdr, old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	in_pcblbgroup_free(old_grp);
	return (grp);
}

/*
 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i]
 * and shrink group if possible.
 */
static void
in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp,
    int i)
{
	struct inpcblbgroup *grp, *new_grp;

	grp = *grpp;
	/* Compact the array over the removed slot. */
	for (; i + 1 < grp->il_inpcnt; ++i)
		grp->il_inp[i] = grp->il_inp[i + 1];
	grp->il_inpcnt--;

	/* Halve capacity when occupancy drops to a quarter (hysteresis). */
	if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN &&
	    grp->il_inpcnt <= grp->il_inpsiz / 4) {
		/* Shrink this group. */
		new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2);
		if (new_grp != NULL)
			*grpp = new_grp;
	}
}

/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * Finds (or creates) the group matching the PCB's prison, vflag, local
 * port, NUMA domain and local address, growing it by 2x if full.  On
 * success sets INP_INLBGROUP on the PCB.  Returns 0 on success (including
 * the rate-limited "group full at INPCBLBGROUP_SIZMAX" soft-failure case,
 * where the PCB is simply not added) or ENOBUFS on allocation failure.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(hdr, inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain);
		if (grp == NULL)
			return (ENOBUFS);
	} else if (grp->il_inpcnt == grp->il_inpsiz) {
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* At hard cap: warn (at most once a minute), drop. */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOBUFS);
	}

	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));

	grp->il_inp[grp->il_inpcnt] = inp;
	grp->il_inpcnt++;
	inp->inp_flags |= INP_INLBGROUP;
	return (0);
}

/*
 * Remove PCB from load balance group.
 *
 * The PCB must carry INP_INLBGROUP; the flag is cleared on removal.
 * Frees the group when this was its last member, otherwise compacts
 * (and possibly shrinks) it via in_pcblbgroup_reorder().
 */
static void
in_pcbremlbgrouphash(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int i;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	MPASS(inp->inp_flags & INP_INLBGROUP);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_inpcnt == 1) {
				/* We are the last, free this local group. */
				in_pcblbgroup_free(grp);
			} else {
				/* Pull up inpcbs, shrink group if possible. */
				in_pcblbgroup_reorder(hdr, &grp, i);
			}
			inp->inp_flags &= ~INP_INLBGROUP;
			return;
		}
	}
	/* INP_INLBGROUP was set, so the PCB must have been found above. */
	KASSERT(0, ("%s: did not find %p", __func__, inp));
}

/*
 * Move a PCB to the load-balance group for the requested NUMA domain
 * (TCP_REUSPORT_LB_NUMA socket option).  'arg' is either one of the
 * TCP_REUSPORT_LB_NUMA_* specials or an explicit domain number.
 * Returns 0 on success, EINVAL for a bad domain, ENOENT if the PCB is
 * not in any group.
 */
int
in_pcblbgroup_numa(struct inpcb *inp, int arg)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	int err, i;
	uint8_t numa_domain;

	switch (arg) {
	case TCP_REUSPORT_LB_NUMA_NODOM:
		numa_domain = M_NODOM;
		break;
	case TCP_REUSPORT_LB_NUMA_CURDOM:
		numa_domain = PCPU_GET(domain);
		break;
	default:
		if (arg < 0 || arg >= vm_ndomains)
			return (EINVAL);
		numa_domain = arg;
	}

	err = 0;
	pcbinfo = inp->inp_pcbinfo;
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK(pcbinfo);
	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		for (i = 0; i < grp->il_inpcnt; ++i) {
			if (grp->il_inp[i] != inp)
				continue;

			if (grp->il_numa_domain == numa_domain) {
				/* Already in the right domain; nothing to do. */
				goto abort_with_hash_wlock;
			}

			/* Remove it from the old group. */
			in_pcbremlbgrouphash(inp);

			/*
			 * Add it to the new group based on numa domain.
			 * NOTE(review): the return value (possibly ENOBUFS)
			 * is discarded here, so a failed re-insert leaves the
			 * PCB in no group while 0 is returned — confirm this
			 * is intentional.
			 */
			in_pcbinslbgrouphash(inp, numa_domain);
			goto abort_with_hash_wlock;
		}
	}
	err = ENOENT;
abort_with_hash_wlock:
	INP_HASH_WUNLOCK(pcbinfo);
	return (err);
}

/* Make sure it is safe to use hashinit(9) on CK_LIST. */
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb)));

/*
 * Initialize an inpcbinfo - a per-VNET instance of connections db.
 */
void
in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor,
    u_int hash_nelements, u_int porthash_nelements)
{

	mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF);
	mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name,
	    NULL, MTX_DEF);
#ifdef VIMAGE
	pcbinfo->ipi_vnet = curvnet;
#endif
	CK_LIST_INIT(&pcbinfo->ipi_listhead);
	pcbinfo->ipi_count = 0;
	/* Exact and wildcard hashes share size, hence one ipi_hashmask. */
	pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB,
	    &pcbinfo->ipi_hashmask);
	/* No point in a port hash wider than the port number space. */
	porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1);
	pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_porthashmask);
	pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB,
	    &pcbinfo->ipi_lbgrouphashmask);
	pcbinfo->ipi_zone = pcbstor->ips_zone;
	pcbinfo->ipi_portzone = pcbstor->ips_portzone;
	pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone);
}

/*
 * Destroy an inpcbinfo.  All PCBs must already have been freed
 * (ipi_count == 0).
 */
void
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo)
{

	KASSERT(pcbinfo->ipi_count == 0,
	    ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count));

	hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask);
	hashdestroy(pcbinfo->ipi_porthashbase, M_PCB,
	    pcbinfo->ipi_porthashmask);
	hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB,
	    pcbinfo->ipi_lbgrouphashmask);
	mtx_destroy(&pcbinfo->ipi_hash_lock);
	mtx_destroy(&pcbinfo->ipi_lock);
}

/*
 * Initialize a pcbstorage - per protocol zones to allocate inpcbs.
 */
static void inpcb_fini(void *, int);
void
in_pcbstorage_init(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name,
	    pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit,
	    inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR);
	pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name,
	    sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
	/* The port zone shares the SMR domain of the inpcb zone. */
	uma_zone_set_smr(pcbstor->ips_portzone,
	    uma_zone_get_smr(pcbstor->ips_zone));
}

/*
 * Destroy a pcbstorage - used by unloadable protocols.
 */
void
in_pcbstorage_destroy(void *arg)
{
	struct inpcbstorage *pcbstor = arg;

	uma_zdestroy(pcbstor->ips_zone);
	uma_zdestroy(pcbstor->ips_portzone);
}

/*
 * Allocate a PCB and associate it with the socket.
 * On success return with the PCB locked.
 *
 * Returns 0, ENOBUFS on zone exhaustion, or an error from the MAC or
 * IPsec initialization hooks (in which case the PCB is freed again).
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Zone init preserves the head of the structure; zero the rest. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}

#ifdef INET
/*
 * Bind an unbound PCB to the address/port in 'sin' (or to an anonymous
 * address/port if 'sin' is NULL or carries port 0) and insert it into
 * the hash tables.  Returns EINVAL if already bound, EAGAIN if the hash
 * insert fails, or an error from in_pcbbind_setup().
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, cred);
	if (error)
		return (error);
	if (in_pcbinshash(inp) != 0) {
		/* Hash insert failed: roll the binding back. */
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		return (EAGAIN);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif

#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port. If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
 */
int
in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp,
    struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags)
{
	struct inpcbinfo *pcbinfo;
	struct inpcb *tmpinp;
	unsigned short *lastport;
	int count, error;
	u_short aux, first, last, lport;
#ifdef INET
	struct in_addr laddr, faddr;
#endif
#ifdef INET6
	struct in6_addr *laddr6, *faddr6;
#endif

	pcbinfo = inp->inp_pcbinfo;

	/*
	 * Because no actual state changes occur here, a global write lock on
	 * the pcbinfo isn't required.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/* Pick the sysctl-configured range matching the socket's flags. */
	if (inp->inp_flags & INP_HIGHPORT) {
		first = V_ipport_hifirstauto;	/* sysctl */
		last  = V_ipport_hilastauto;
		lastport = &pcbinfo->ipi_lasthi;
	} else if (inp->inp_flags & INP_LOWPORT) {
		/* Low (reserved) ports require privilege. */
		error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT);
		if (error)
			return (error);
		first = V_ipport_lowfirstauto;	/* 1023 */
		last  = V_ipport_lowlastauto;	/* 600 */
		lastport = &pcbinfo->ipi_lastlow;
	} else {
		first = V_ipport_firstauto;	/* sysctl */
		last  = V_ipport_lastauto;
		lastport = &pcbinfo->ipi_lastport;
	}

	/*
	 * Instead of having two loops further down counting up or down
	 * make sure that first is always <= last and go with only one
	 * code path implementing all logic.
	 */
	if (first > last) {
		aux = first;
		first = last;
		last = aux;
	}

#ifdef INET
	laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */
	if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) {
		if (lsa != NULL)
			laddr = ((struct sockaddr_in *)lsa)->sin_addr;
		if (fsa != NULL)
			faddr = ((struct sockaddr_in *)fsa)->sin_addr;
	}
#endif
#ifdef INET6
	laddr6 = NULL;
	if ((inp->inp_vflag & INP_IPV6) != 0) {
		if (lsa != NULL)
			laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr;
		if (fsa != NULL)
			faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr;
	}
#endif

	tmpinp = NULL;
	lport = *lportp;

	/*
	 * Randomize the starting point of the search.
	 * NOTE(review): if the sysctls set first == last, (last - first)
	 * is 0 and this modulo faults — confirm the sysctl handlers make
	 * that configuration impossible.
	 */
	if (V_ipport_randomized)
		*lastport = first + (arc4random() % (last - first));

	count = last - first;

	do {
		if (count-- < 0)	/* completely used? */
			return (EADDRNOTAVAIL);
		++*lastport;
		/* Wrap the rover back into [first, last]. */
		if (*lastport < first || *lastport > last)
			*lastport = first;
		lport = htons(*lastport);

		if (fsa != NULL) {
			/* Port must be unused against the given peer. */
#ifdef INET
			if (lsa->sa_family == AF_INET) {
				tmpinp = in_pcblookup_hash_locked(pcbinfo,
				    faddr, fport, laddr, lport, lookupflags,
				    M_NODOM);
			}
#endif
#ifdef INET6
			if (lsa->sa_family == AF_INET6) {
				tmpinp = in6_pcblookup_hash_locked(pcbinfo,
				    faddr6, fport, laddr6, lport, lookupflags,
				    M_NODOM);
			}
#endif
		} else {
			/* Port must be completely unused locally. */
#ifdef INET6
			if ((inp->inp_vflag & INP_IPV6) != 0) {
				tmpinp = in6_pcblookup_local(pcbinfo,
				    &inp->in6p_laddr, lport, lookupflags, cred);
#ifdef INET
				if (tmpinp == NULL &&
				    (inp->inp_vflag & INP_IPV4))
					tmpinp = in_pcblookup_local(pcbinfo,
					    laddr, lport, lookupflags, cred);
#endif
			}
#endif
#if defined(INET) && defined(INET6)
			else
#endif
#ifdef INET
				tmpinp = in_pcblookup_local(pcbinfo, laddr,
				    lport, lookupflags, cred);
#endif
		}
	} while (tmpinp != NULL);

	*lportp = lport;

	return (0);
}

/*
 * Select a local port (number) to use.
 *
 * Thin wrapper over in_pcb_lport_dest() with no foreign address; 'laddrp'
 * may be NULL, in which case no local address constrains the search.
 */
int
in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp,
    struct ucred *cred, int lookupflags)
{
	struct sockaddr_in laddr;

	if (laddrp) {
		bzero(&laddr, sizeof(laddr));
		laddr.sin_family = AF_INET;
		laddr.sin_addr = *laddrp;
	}
	return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr :
	    NULL, lportp, NULL, 0, cred, lookupflags));
}
#endif /* INET || INET6 */

#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	laddr.s_addr = *laddrp;
	/* A caller-supplied address must not conflict with an existing one. */
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, conflict checks include wildcard PCBs. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		if (lport) {
			struct inpcb *t;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			/*
			 * Unprivileged, non-multicast binds may not take over
			 * a port owned by a different uid.
			 */
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
				/*
				 * XXX
				 * This entire block sorely needs a rewrite.
				 */
				if (t != NULL &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_socket->so_options & SO_REUSEPORT) ||
				     (t->inp_socket->so_options & SO_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);
			}
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t != NULL && (reuseport & t->inp_socket->so_options) == 0 &&
			    (reuseport_lb & t->inp_socket->so_options) == 0) {
#ifdef INET6
				/*
				 * Permit the conflict only when it is between
				 * a v6-capable and a v4-only PCB bound to
				 * specific (non-wildcard) addresses.
				 */
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
					return (EADDRINUSE);
			}
		}
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested: allocate an ephemeral one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred,
    bool rehash __unused)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, sin, &laddr, &lport, &faddr, &fport,
	    cred);
	if (error)
		return (error);

	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			/* Insert failed: undo both local and foreign side. */
			inp->inp_laddr.s_addr = inp->inp_faddr.s_addr =
			    INADDR_ANY;
			inp->inp_lport = inp->inp_fport = 0;
			return (EAGAIN);
		}
	} else {
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if ((inp->inp_flags & INP_INHASHLIST) != 0)
			in_pcbrehash(inp);
		else
			in_pcbinshash(inp);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}

/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));

	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (!prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
	 */
	if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0)
		nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr,
		    0, NHR_NONE, 0);

	/*
	 * If we found a route, use the address corresponding to
	 * the outgoing interface.
	 *
	 * Otherwise assume faddr is reachable on a directly connected
	 * network and try to find a corresponding interface to take
	 * the source address from.
	 */
	if (nh == NULL || nh->nh_ifp == NULL) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin,
		    inp->inp_socket->so_fibnum));
		if (ia == NULL) {
			ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0,
			    inp->inp_socket->so_fibnum));
		}
		if (ia == NULL) {
			error = ENETUNREACH;
			goto done;
		}

		/* If not jailed, take the address of the found interface. */
		if (!prison_flag(cred, PR_IP4)) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * Jailed: scan the interface's addresses for one that
		 * belongs to this jail.
		 */
		ifp = ia->ia_ifp;
		ia = NULL;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * If the outgoing interface on the route found is not
	 * a loopback interface, use the address from that interface.
	 * In case of jails do those three steps:
	 * 1. check if the interface address belongs to the jail. If so use it.
	 * 2. check if we have any address on the outgoing interface
	 *    belonging to this jail. If so use it.
	 * 3. as a last resort return the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) {
		struct in_ifaddr *ia;
		struct ifnet *ifp;

		/* If not jailed, use the default returned. */
		if (!prison_flag(cred, PR_IP4)) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		/* 1. Check if the iface address belongs to the jail. */
		sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr;
		if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
			ia = (struct in_ifaddr *)nh->nh_ifa;
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/*
		 * 2. Check if we have any address on the outgoing interface
		 *    belonging to this jail.
		 */
		ia = NULL;
		ifp = nh->nh_ifp;
		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
			sa = ifa->ifa_addr;
			if (sa->sa_family != AF_INET)
				continue;
			sin = (struct sockaddr_in *)sa;
			if (prison_check_ip4(cred, &sin->sin_addr) == 0) {
				ia = (struct in_ifaddr *)ifa;
				break;
			}
		}
		if (ia != NULL) {
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

	/*
	 * The outgoing interface is marked with 'loopback net', so a route
	 * to ourselves is here.
	 * Try to find the interface of the destination address and then
	 * take the address from there. That interface is not necessarily
	 * a loopback interface.
	 * In case of jails, check that it is an address of the jail
	 * and if we cannot find, fall back to the 'default' jail address.
	 */
	if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) {
		struct in_ifaddr *ia;

		ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst),
		    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0,
			    inp->inp_socket->so_fibnum));
		if (ia == NULL)
			ia = ifatoia(ifa_ifwithaddr(sintosa(&dst)));

		if (!prison_flag(cred, PR_IP4)) {
			if (ia == NULL) {
				error = ENETUNREACH;
				goto done;
			}
			laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
			goto done;
		}

		/* Jailed. */
		if (ia != NULL) {
			struct ifnet *ifp;

			/* Scan the interface for an address in this jail. */
			ifp = ia->ia_ifp;
			ia = NULL;
			CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
				sa = ifa->ifa_addr;
				if (sa->sa_family != AF_INET)
					continue;
				sin = (struct sockaddr_in *)sa;
				if (prison_check_ip4(cred,
				    &sin->sin_addr) == 0) {
					ia = (struct in_ifaddr *)ifa;
					break;
				}
			}
			if (ia != NULL) {
				laddr->s_addr = ia->ia_addr.sin_addr.s_addr;
				goto done;
			}
		}

		/* 3. As a last resort return the 'default' jail address. */
		error = prison_get_ip4(cred, laddr);
		goto done;
	}

done:
	if (error == 0 && laddr->s_addr == INADDR_ANY)
		return (EHOSTUNREACH);
	return (error);
}

/*
 * Set up for a connect from a socket to the specified address.
 * On entry, *laddrp and *lportp should contain the current local
 * address and port for the PCB; these are updated to the values
 * that should be placed in inp_laddr and inp_lport to complete
 * the connect.
 *
 * On success, *faddrp and *fportp will be set to the remote address
 * and port. These are not updated in the error case.
 */
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr_in *sin,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct ucred *cred)
{
	struct in_ifaddr *ia;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;
#ifdef ROUTE_MPATH
	if (CALC_FLOWID_OUTBOUND) {
		uint32_t hash_val, hash_type;

		/* Record a software flow id for multipath route selection. */
		hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport,
		    inp->inp_socket->so_proto->pr_protocol, &hash_type);

		inp->inp_flowid = hash_val;
		inp->inp_flowtype = hash_type;
	}
#endif
	if (V_connect_inaddr_wild && !CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			if ((error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
		}
	}
	if (laddr.s_addr == INADDR_ANY) {
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if (ia->ia_ifp == ifp &&
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0)
						break;
				}
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
			}
		}
		if (error)
			return (error);
	}

	if (lport != 0) {
		/*
		 * Already bound to a port: the connect fails if the
		 * resulting 4-tuple is already in use.
		 */
		if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
		    fport, laddr, lport, 0, M_NODOM) != NULL)
			return (EADDRINUSE);
	} else {
		/* No local port yet: pick an ephemeral one. */
		struct sockaddr_in lsin, fsin;

		bzero(&lsin, sizeof(lsin));
		bzero(&fsin, sizeof(fsin));
		lsin.sin_family = AF_INET;
		lsin.sin_addr = laddr;
		fsin.sin_family = AF_INET;
		fsin.sin_addr = faddr;
		error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin,
		    &lport, (struct sockaddr *)& fsin, fport, cred,
		    INPLOOKUP_WILDCARD);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Disconnect a connected pcb: remove it from the connection hash and clear
 * the local and foreign addresses and the foreign port.  The local port
 * binding is retained.
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(inp->inp_smr == SMR_SEQ_INVALID,
	    ("%s: inp %p was already disconnected", __func__, inp));

	in_pcbremhash_locked(inp);

	/* See the comment in in_pcbinshash(). */
	inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr);
	inp->inp_laddr.s_addr = INADDR_ANY;
	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
}
#endif /* INET */

/*
 * inpcb hash lookups are protected by SMR section.
 *
 * Once desired pcb has been found, switching from SMR section to a pcb
 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK
 * here because SMR is a critical section.
 * In 99%+ cases inp_smr_lock() would obtain the lock immediately.
 */

/* Acquire the pcb lock in the flavor requested by the lookup flag. */
void
inp_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock);
}

/* Release the pcb lock in the flavor requested by the lookup flag. */
void
inp_unlock(struct inpcb *inp, const inp_lookup_t lock)
{

	lock == INPLOOKUP_RLOCKPCB ?
	    rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock);
}

/* Try to acquire the pcb lock without sleeping; returns non-zero on success. */
int
inp_trylock(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock));
}

/*
 * Transition from SMR section to a locked pcb, skipping pcbs whose flags
 * intersect 'ignflags'.  On success returns true with the pcb locked and
 * the SMR section exited.  On failure returns false with the SMR section
 * exited and the pcb unlocked.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * The lock is contended: pin the pcb with a reference, leave the
	 * SMR section, and sleep for the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * inp acquired through refcount & lock for sure didn't went
		 * through uma_zfree().  However, it may have already went
		 * through in_pcbfree() and has another reference, that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}

bool
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock)
{

	/*
	 * in_pcblookup() family of functions ignore not only freed entries,
	 * that may be found due to lockless access to the hash, but dropped
	 * entries, too.
	 */
	return (_inp_smr_lock(inp, lock, INP_FREED | INP_DROPPED));
}

/*
 * inp_next() - inpcb hash/list traversal iterator
 *
 * Requires initialized struct inpcb_iterator for context.
 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR().
 *
 * - Iterator can have either write-lock or read-lock semantics, that can not
 *   be changed later.
 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through
 *   a single hash slot.  Note: only rip_input() does the latter.
 * - Iterator may have optional bool matching function.  The matching function
 *   will be executed for each inpcb in the SMR context, so it can not acquire
 *   locks and can safely access only immutable fields of inpcb.
 *
 * A fresh initialized iterator has NULL inpcb in its context and that
 * means that inp_next() call would return the very first inpcb on the list
 * locked with desired semantic.  In all following calls the context pointer
 * shall hold the current inpcb pointer.  The KPI user is not supposed to
 * unlock the current inpcb!  Upon end of traversal inp_next() will return NULL
 * and write NULL to its context.  After end of traversal an iterator can be
 * reused.
 *
 * List traversals have the following features/constraints:
 * - New entries won't be seen, as they are always added to the head of a list.
 * - Removed entries won't stop traversal as long as they are not added to
 *   a different list. This is violated by in_pcbrehash().
 */
#define	II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
#define	II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
#define	II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED )
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() failed and exited the SMR
				 * section; re-enter and restart from the
				 * list head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in middle of a list, thus
			 * as long as we are in SMR, we can continue traversal.
			 * Jump to 'restart' should yield in the same result,
			 * but could produce unnecessary looping.  Could this
			 * looping be unbound?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * See comment in inp_smr_lock().
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;

found:
	/* Lock handoff: release the previous pcb, remember the new one. */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released or
 * SMR section exited.
 *
 * To free a reference later in_pcbrele_(r|w)locked() must be performed.
 */
void
in_pcbref(struct inpcb *inp)
{
	u_int old __diagused;

	old = refcount_acquire(&inp->inp_refcount);
	KASSERT(old > 0, ("%s: refcount 0", __func__));
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially
 * freeing the pcb, if the reference was very last.
 */
bool
in_pcbrele_rlocked(struct inpcb *inp)
{

	INP_RLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be through in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_RUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/* Write-locked variant of in_pcbrele_rlocked(). */
bool
in_pcbrele_wlocked(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	if (!refcount_release(&inp->inp_refcount))
		return (false);

	/* Last reference: the pcb must already be through in_pcbfree(). */
	MPASS(inp->inp_flags & INP_FREED);
	MPASS(inp->inp_socket == NULL);
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	INP_WUNLOCK(inp);
	uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp);
	return (true);
}

/* Dispatch to the read- or write-locked release based on 'lock'. */
bool
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock)
{

	return (lock == INPLOOKUP_RLOCKPCB ?
	    in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp));
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked.
 * Almost all work, including removal from global lists, is done in this
 * context, where the pcbinfo lock is held.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Multicast options are freed only after the pcb is unlocked below. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/* in_pcbrele_wlocked() unlocks (and may free) the pcb on success. */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}

/*
 * Different protocols initialize their inpcbs differently - giving
 * different name to the lock. But they all are disposed the same.
 */
static void
inpcb_fini(void *mem, int size)
{
	struct inpcb *inp = mem;

	INP_LOCK_DESTROY(inp);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);

	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
}

#ifdef INET
/*
 * Common routines to return the socket addresses associated with inpcbs.
 */
int
in_getsockaddr(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));

	/* Copy out the bound local address and port as a sockaddr_in. */
	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_lport,
		.sin_addr = inp->inp_laddr,
	};

	return (0);
}

int
in_getpeeraddr(struct socket *so, struct sockaddr *sa)
{
	struct inpcb *inp;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));

	/* Copy out the foreign address and port as a sockaddr_in. */
	*(struct sockaddr_in *)sa = (struct sockaddr_in ){
		.sin_len = sizeof(struct sockaddr_in),
		.sin_family = AF_INET,
		.sin_port = inp->inp_fport,
		.sin_addr = inp->inp_faddr,
	};

	return (0);
}

/*
 * Iterator match callback: select IPv4 pcbs that carry multicast options.
 */
static bool
inp_v4_multi_match(const struct inpcb *inp, void *v __unused)
{

	if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL)
		return (true);
	else
		return (false);
}

/*
 * Purge per-pcb multicast state that references the interface 'ifp',
 * which is being detached.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB,
	    inp_v4_multi_match, NULL);
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	IN_MULTI_LOCK_ASSERT();

	while ((inp = inp_next(&inpi)) != NULL) {
		INP_WLOCK_ASSERT(inp);

		imo = inp->inp_moptions;
		/*
		 * Unselect the outgoing interface if it is being
		 * detached.
		 */
		if (imo->imo_multicast_ifp == ifp)
			imo->imo_multicast_ifp = NULL;

		/*
		 * Drop multicast group membership if we joined
		 * through the interface being detached.
		 *
		 * XXX This can all be deferred to an epoch_call
		 */
restart:
		IP_MFILTER_FOREACH(imf, &imo->imo_head) {
			if ((inm = imf->imf_inm) == NULL)
				continue;
			if (inm->inm_ifp != ifp)
				continue;
			ip_mfilter_remove(&imo->imo_head, imf);
			in_leavegroup_locked(inm, NULL);
			ip_mfilter_free(imf);
			/* The filter list was modified; rescan it. */
			goto restart;
		}
	}
}

/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
 */
#define INP_LOOKUP_MAPPED_PCB_COST	3
struct inpcb *
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr,
    u_short lport, int lookupflags, struct ucred *cred)
{
	struct inpcb *inp;
#ifdef INET6
	int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST;
#else
	int matchwild = 3;
#endif
	int wildcard;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if ((lookupflags & INPLOOKUP_WILDCARD) == 0) {
		struct inpcbhead *head;
		/*
		 * Look for an unconnected (wildcard foreign addr) PCB that
		 * matches the local address and port we're looking for.
		 */
		head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
		    pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash_wild) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			if (inp->inp_faddr.s_addr == INADDR_ANY &&
			    inp->inp_laddr.s_addr == laddr.s_addr &&
			    inp->inp_lport == lport) {
				/*
				 * Found?
				 */
				if (prison_equal_ip4(cred->cr_prison,
				    inp->inp_cred->cr_prison))
					return (inp);
			}
		}
		/*
		 * Not found.
		 */
		return (NULL);
	} else {
		struct inpcbporthead *porthash;
		struct inpcbport *phd;
		struct inpcb *match = NULL;
		/*
		 * Best fit PCB lookup.
		 *
		 * First see if this local port is in use by looking on the
		 * port hash list.
		 */
		porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport,
		    pcbinfo->ipi_porthashmask)];
		CK_LIST_FOREACH(phd, porthash, phd_hash) {
			if (phd->phd_port == lport)
				break;
		}
		if (phd != NULL) {
			/*
			 * Port is in use by one or more PCBs. Look for best
			 * fit.  Lower 'wildcard' cost wins; zero cost is a
			 * perfect match and terminates the scan.
			 */
			CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) {
				wildcard = 0;
				if (!prison_equal_ip4(inp->inp_cred->cr_prison,
				    cred->cr_prison))
					continue;
#ifdef INET6
				/* XXX inp locking */
				if ((inp->inp_vflag & INP_IPV4) == 0)
					continue;
				/*
				 * We never select the PCB that has
				 * INP_IPV6 flag and is bound to :: if
				 * we have another PCB which is bound
				 * to 0.0.0.0.  If a PCB has the
				 * INP_IPV6 flag, then we set its cost
				 * higher than IPv4 only PCBs.
				 *
				 * Note that the case only happens
				 * when a socket is bound to ::, under
				 * the condition that the use of the
				 * mapped address is allowed.
				 */
				if ((inp->inp_vflag & INP_IPV6) != 0)
					wildcard += INP_LOOKUP_MAPPED_PCB_COST;
#endif
				if (inp->inp_faddr.s_addr != INADDR_ANY)
					wildcard++;
				if (inp->inp_laddr.s_addr != INADDR_ANY) {
					if (laddr.s_addr == INADDR_ANY)
						wildcard++;
					else if (inp->inp_laddr.s_addr != laddr.s_addr)
						continue;
				} else {
					if (laddr.s_addr != INADDR_ANY)
						wildcard++;
				}
				if (wildcard < matchwild) {
					match = inp;
					matchwild = wildcard;
					if (matchwild == 0)
						break;
				}
			}
		}
		return (match);
	}
}
#undef INP_LOOKUP_MAPPED_PCB_COST

/*
 * Check whether an lbgroup satisfies the requested NUMA domain;
 * M_NODOM means "any domain".
 */
static bool
in_pcblookup_lb_numa_match(const struct inpcblbgroup *grp, int domain)
{
	return (domain == M_NODOM || domain == grp->il_numa_domain);
}

static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
    const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr,
    uint16_t lport, int domain)
{
	const struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)];

	/*
	 * Search for an LB group match based on the following criteria:
	 * - prefer jailed groups to non-jailed groups
	 * - prefer exact source address matches to wildcard matches
	 * - prefer groups bound to the specified NUMA domain
	 */
	jail_exact = jail_wild = local_exact = local_wild = NULL;
	CK_LIST_FOREACH(grp, hdr, il_list) {
		bool injail;

#ifdef INET6
		if (!(grp->il_vflag & INP_IPV4))
			continue;
#endif
		if (grp->il_lport != lport)
			continue;

		injail = prison_flag(grp->il_cred, PR_IP4) != 0;
		if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison,
		    laddr) != 0)
			continue;

		if (grp->il_laddr.s_addr == laddr->s_addr) {
			if (injail) {
				jail_exact = grp;
				if (in_pcblookup_lb_numa_match(grp, domain))
					/* This is a perfect match. */
					goto out;
			} else if (local_exact == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_exact = grp;
			}
		} else if (grp->il_laddr.s_addr == INADDR_ANY) {
			if (injail) {
				if (jail_wild == NULL ||
				    in_pcblookup_lb_numa_match(grp, domain))
					jail_wild = grp;
			} else if (local_wild == NULL ||
			    in_pcblookup_lb_numa_match(grp, domain)) {
				local_wild = grp;
			}
		}
	}

	if (jail_exact != NULL)
		grp = jail_exact;
	else if (jail_wild != NULL)
		grp = jail_wild;
	else if (local_exact != NULL)
		grp = local_exact;
	else
		grp = local_wild;
	if (grp == NULL)
		return (NULL);
out:
	/* Hash the flow to pick one member socket of the chosen group. */
	return (grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) %
	    grp->il_inpcnt]);
}

/* Check a fully-specified (4-tuple) match between a pcb and a flow. */
static bool
in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (false);
#endif
	if (inp->inp_faddr.s_addr == faddr.s_addr &&
	    inp->inp_laddr.s_addr == laddr.s_addr &&
	    inp->inp_fport == fport &&
	    inp->inp_lport == lport)
		return (true);
	return (false);
}

static struct inpcb *
in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	INP_HASH_LOCK_ASSERT(pcbinfo);

	head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_exact) {
		if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport))
			return (inp);
	}
	return (NULL);
}

typedef enum {
	INPLOOKUP_MATCH_NONE = 0,
	INPLOOKUP_MATCH_WILD = 1,
	INPLOOKUP_MATCH_LADDR = 2,
} inp_lookup_match_t;

/*
 * Classify how an unconnected pcb matches a (laddr, lport) pair:
 * no match, a wildcard local address match, or an exact one.
 */
static inp_lookup_match_t
in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr,
    u_short lport)
{
#ifdef INET6
	/* XXX inp locking */
	if ((inp->inp_vflag & INP_IPV4) == 0)
		return (INPLOOKUP_MATCH_NONE);
#endif
	if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport)
		return (INPLOOKUP_MATCH_NONE);
	if (inp->inp_laddr.s_addr == INADDR_ANY)
		return (INPLOOKUP_MATCH_WILD);
	if (inp->inp_laddr.s_addr == laddr.s_addr)
		return (INPLOOKUP_MATCH_LADDR);
	return (INPLOOKUP_MATCH_NONE);
}

#define	INP_LOOKUP_AGAIN	((struct inpcb *)(uintptr_t)-1)

static struct inpcb *
in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport,
    const inp_lookup_t lockflags)
{
	struct inpcbhead *head;
	struct inpcb *inp;

	KASSERT(SMR_ENTERED(pcbinfo->ipi_smr),
	    ("%s: not in SMR read section", __func__));

	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		if (__predict_true(inp_smr_lock(inp, lockflags))) {
			/* Revalidate the match now that the pcb is locked. */
			match = in_pcblookup_wild_match(inp, laddr, lport);
			if (match != INPLOOKUP_MATCH_NONE &&
			    prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) == 0)
				return (inp);
			inp_unlock(inp, lockflags);
		}

		/*
		 * The matching socket disappeared out from under us.  Fall back
		 * to a serialized lookup.
		 */
		return (INP_LOOKUP_AGAIN);
	}
	return (NULL);
}

static struct inpcb *
in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_short fport, struct in_addr laddr, u_short lport)
{
	struct inpcbhead *head;
	struct inpcb *inp, *local_wild, *local_exact, *jail_wild;
#ifdef INET6
	struct inpcb *local_wild_mapped;
#endif

	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * Order of socket selection - we always prefer jails.
	 * 1. jailed, non-wild.
	 * 2. jailed, wild.
	 * 3. non-jailed, non-wild.
	 * 4. non-jailed, wild.
	 */
	head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport,
	    pcbinfo->ipi_hashmask)];
	local_wild = local_exact = jail_wild = NULL;
#ifdef INET6
	local_wild_mapped = NULL;
#endif
	CK_LIST_FOREACH(inp, head, inp_hash_wild) {
		inp_lookup_match_t match;
		bool injail;

		match = in_pcblookup_wild_match(inp, laddr, lport);
		if (match == INPLOOKUP_MATCH_NONE)
			continue;

		injail = prison_flag(inp->inp_cred, PR_IP4) != 0;
		if (injail) {
			if (prison_check_ip4_locked(inp->inp_cred->cr_prison,
			    &laddr) != 0)
				continue;
		} else {
			if (local_exact != NULL)
				continue;
		}

		if (match == INPLOOKUP_MATCH_LADDR) {
			if (injail)
				return (inp);
			local_exact = inp;
		} else {
#ifdef INET6
			/* XXX inp locking, NULL check */
			if (inp->inp_vflag & INP_IPV6PROTO)
				local_wild_mapped = inp;
			else
#endif
			if (injail)
				jail_wild = inp;
			else
				local_wild = inp;
		}
	}
	if (jail_wild != NULL)
		return (jail_wild);
	if (local_exact != NULL)
		return (local_exact);
	if (local_wild != NULL)
		return (local_wild);
#ifdef INET6
	if (local_wild_mapped != NULL)
		return (local_wild_mapped);
#endif
	return (NULL);
}

/*
 * Lookup PCB in hash list,
using pcbinfo tables. This variation assumes 2290 * that the caller has either locked the hash list, which usually happens 2291 * for bind(2) operations, or is in SMR section, which happens when sorting 2292 * out incoming packets. 2293 */ 2294 static struct inpcb * 2295 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2296 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2297 uint8_t numa_domain) 2298 { 2299 struct inpcb *inp; 2300 const u_short fport = fport_arg, lport = lport_arg; 2301 2302 KASSERT((lookupflags & ~INPLOOKUP_WILDCARD) == 0, 2303 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2304 KASSERT(faddr.s_addr != INADDR_ANY, 2305 ("%s: invalid foreign address", __func__)); 2306 KASSERT(laddr.s_addr != INADDR_ANY, 2307 ("%s: invalid local address", __func__)); 2308 INP_HASH_WLOCK_ASSERT(pcbinfo); 2309 2310 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2311 if (inp != NULL) 2312 return (inp); 2313 2314 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2315 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2316 &laddr, lport, numa_domain); 2317 if (inp == NULL) { 2318 inp = in_pcblookup_hash_wild_locked(pcbinfo, faddr, 2319 fport, laddr, lport); 2320 } 2321 } 2322 2323 return (inp); 2324 } 2325 2326 static struct inpcb * 2327 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2328 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2329 uint8_t numa_domain) 2330 { 2331 struct inpcb *inp; 2332 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2333 2334 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2335 ("%s: LOCKPCB not set", __func__)); 2336 2337 INP_HASH_WLOCK(pcbinfo); 2338 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2339 lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain); 2340 if (inp != NULL && !inp_trylock(inp, lockflags)) { 2341 in_pcbref(inp); 2342 INP_HASH_WUNLOCK(pcbinfo); 2343 
inp_lock(inp, lockflags); 2344 if (in_pcbrele(inp, lockflags)) 2345 /* XXX-MJ or retry until we get a negative match? */ 2346 inp = NULL; 2347 } else { 2348 INP_HASH_WUNLOCK(pcbinfo); 2349 } 2350 return (inp); 2351 } 2352 2353 static struct inpcb * 2354 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2355 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2356 uint8_t numa_domain) 2357 { 2358 struct inpcb *inp; 2359 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2360 const u_short fport = fport_arg, lport = lport_arg; 2361 2362 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2363 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2364 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2365 ("%s: LOCKPCB not set", __func__)); 2366 2367 smr_enter(pcbinfo->ipi_smr); 2368 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2369 if (inp != NULL) { 2370 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2371 /* 2372 * Revalidate the 4-tuple, the socket could have been 2373 * disconnected. 2374 */ 2375 if (__predict_true(in_pcblookup_exact_match(inp, 2376 faddr, fport, laddr, lport))) 2377 return (inp); 2378 inp_unlock(inp, lockflags); 2379 } 2380 2381 /* 2382 * We failed to lock the inpcb, or its connection state changed 2383 * out from under us. Fall back to a precise search. 
2384 */ 2385 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2386 lookupflags, numa_domain)); 2387 } 2388 2389 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2390 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2391 &laddr, lport, numa_domain); 2392 if (inp != NULL) { 2393 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2394 if (__predict_true(in_pcblookup_wild_match(inp, 2395 laddr, lport) != INPLOOKUP_MATCH_NONE)) 2396 return (inp); 2397 inp_unlock(inp, lockflags); 2398 } 2399 inp = INP_LOOKUP_AGAIN; 2400 } else { 2401 inp = in_pcblookup_hash_wild_smr(pcbinfo, faddr, fport, 2402 laddr, lport, lockflags); 2403 } 2404 if (inp == INP_LOOKUP_AGAIN) { 2405 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, 2406 lport, lookupflags, numa_domain)); 2407 } 2408 } 2409 2410 if (inp == NULL) 2411 smr_exit(pcbinfo->ipi_smr); 2412 2413 return (inp); 2414 } 2415 2416 /* 2417 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2418 * from which a pre-calculated hash value may be extracted. 
2419 */ 2420 struct inpcb * 2421 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2422 struct in_addr laddr, u_int lport, int lookupflags, 2423 struct ifnet *ifp __unused) 2424 { 2425 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2426 lookupflags, M_NODOM)); 2427 } 2428 2429 struct inpcb * 2430 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2431 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2432 struct ifnet *ifp __unused, struct mbuf *m) 2433 { 2434 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2435 lookupflags, m->m_pkthdr.numa_domain)); 2436 } 2437 #endif /* INET */ 2438 2439 static bool 2440 in_pcbjailed(const struct inpcb *inp, unsigned int flag) 2441 { 2442 return (prison_flag(inp->inp_cred, flag) != 0); 2443 } 2444 2445 /* 2446 * Insert the PCB into a hash chain using ordering rules which ensure that 2447 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. 2448 * 2449 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs 2450 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs 2451 * always appear last no matter whether they are jailed. 
2452 */ 2453 static void 2454 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2455 { 2456 struct inpcb *last; 2457 bool bound, injail; 2458 2459 INP_LOCK_ASSERT(inp); 2460 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2461 2462 last = NULL; 2463 bound = inp->inp_laddr.s_addr != INADDR_ANY; 2464 if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { 2465 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2466 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2467 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2468 return; 2469 } 2470 } 2471 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2472 return; 2473 } 2474 2475 injail = in_pcbjailed(inp, PR_IP4); 2476 if (!injail) { 2477 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2478 if (!in_pcbjailed(last, PR_IP4)) 2479 break; 2480 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2481 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2482 return; 2483 } 2484 } 2485 } else if (!CK_LIST_EMPTY(pcbhash) && 2486 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { 2487 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2488 return; 2489 } 2490 if (!bound) { 2491 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2492 if (last->inp_laddr.s_addr == INADDR_ANY) 2493 break; 2494 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2495 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2496 return; 2497 } 2498 } 2499 } 2500 if (last == NULL) 2501 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2502 else 2503 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2504 } 2505 2506 #ifdef INET6 2507 /* 2508 * See the comment above _in_pcbinshash_wild(). 
2509 */ 2510 static void 2511 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2512 { 2513 struct inpcb *last; 2514 bool bound, injail; 2515 2516 INP_LOCK_ASSERT(inp); 2517 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2518 2519 last = NULL; 2520 bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); 2521 injail = in_pcbjailed(inp, PR_IP6); 2522 if (!injail) { 2523 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2524 if (!in_pcbjailed(last, PR_IP6)) 2525 break; 2526 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2527 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2528 return; 2529 } 2530 } 2531 } else if (!CK_LIST_EMPTY(pcbhash) && 2532 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { 2533 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2534 return; 2535 } 2536 if (!bound) { 2537 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2538 if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) 2539 break; 2540 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2541 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2542 return; 2543 } 2544 } 2545 } 2546 if (last == NULL) 2547 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2548 else 2549 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2550 } 2551 #endif 2552 2553 /* 2554 * Insert PCB onto various hash lists. 
2555 */ 2556 int 2557 in_pcbinshash(struct inpcb *inp) 2558 { 2559 struct inpcbhead *pcbhash; 2560 struct inpcbporthead *pcbporthash; 2561 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2562 struct inpcbport *phd; 2563 uint32_t hash; 2564 bool connected; 2565 2566 INP_WLOCK_ASSERT(inp); 2567 INP_HASH_WLOCK_ASSERT(pcbinfo); 2568 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2569 ("in_pcbinshash: INP_INHASHLIST")); 2570 2571 #ifdef INET6 2572 if (inp->inp_vflag & INP_IPV6) { 2573 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2574 inp->inp_fport, pcbinfo->ipi_hashmask); 2575 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2576 } else 2577 #endif 2578 { 2579 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2580 inp->inp_fport, pcbinfo->ipi_hashmask); 2581 connected = !in_nullhost(inp->inp_faddr); 2582 } 2583 2584 if (connected) 2585 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2586 else 2587 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2588 2589 pcbporthash = &pcbinfo->ipi_porthashbase[ 2590 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2591 2592 /* 2593 * Add entry to load balance group. 2594 * Only do this if SO_REUSEPORT_LB is set. 2595 */ 2596 if ((inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2597 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2598 if (error != 0) 2599 return (error); 2600 } 2601 2602 /* 2603 * Go through port list and look for a head for this lport. 2604 */ 2605 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2606 if (phd->phd_port == inp->inp_lport) 2607 break; 2608 } 2609 2610 /* 2611 * If none exists, malloc one and tack it on. 
2612 */ 2613 if (phd == NULL) { 2614 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2615 if (phd == NULL) { 2616 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2617 in_pcbremlbgrouphash(inp); 2618 return (ENOMEM); 2619 } 2620 phd->phd_port = inp->inp_lport; 2621 CK_LIST_INIT(&phd->phd_pcblist); 2622 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2623 } 2624 inp->inp_phd = phd; 2625 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2626 2627 /* 2628 * The PCB may have been disconnected in the past. Before we can safely 2629 * make it visible in the hash table, we must wait for all readers which 2630 * may be traversing this PCB to finish. 2631 */ 2632 if (inp->inp_smr != SMR_SEQ_INVALID) { 2633 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2634 inp->inp_smr = SMR_SEQ_INVALID; 2635 } 2636 2637 if (connected) 2638 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2639 else { 2640 #ifdef INET6 2641 if ((inp->inp_vflag & INP_IPV6) != 0) 2642 _in6_pcbinshash_wild(pcbhash, inp); 2643 else 2644 #endif 2645 _in_pcbinshash_wild(pcbhash, inp); 2646 } 2647 inp->inp_flags |= INP_INHASHLIST; 2648 2649 return (0); 2650 } 2651 2652 void 2653 in_pcbremhash_locked(struct inpcb *inp) 2654 { 2655 struct inpcbport *phd = inp->inp_phd; 2656 2657 INP_WLOCK_ASSERT(inp); 2658 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2659 MPASS(inp->inp_flags & INP_INHASHLIST); 2660 2661 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2662 in_pcbremlbgrouphash(inp); 2663 #ifdef INET6 2664 if (inp->inp_vflag & INP_IPV6) { 2665 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2666 CK_LIST_REMOVE(inp, inp_hash_wild); 2667 else 2668 CK_LIST_REMOVE(inp, inp_hash_exact); 2669 } else 2670 #endif 2671 { 2672 if (in_nullhost(inp->inp_faddr)) 2673 CK_LIST_REMOVE(inp, inp_hash_wild); 2674 else 2675 CK_LIST_REMOVE(inp, inp_hash_exact); 2676 } 2677 CK_LIST_REMOVE(inp, inp_portlist); 2678 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2679 CK_LIST_REMOVE(phd, phd_hash); 2680 
uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 2681 } 2682 inp->inp_flags &= ~INP_INHASHLIST; 2683 } 2684 2685 static void 2686 in_pcbremhash(struct inpcb *inp) 2687 { 2688 INP_HASH_WLOCK(inp->inp_pcbinfo); 2689 in_pcbremhash_locked(inp); 2690 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2691 } 2692 2693 /* 2694 * Move PCB to the proper hash bucket when { faddr, fport } have been 2695 * changed. NOTE: This does not handle the case of the lport changing (the 2696 * hashed port list would have to be updated as well), so the lport must 2697 * not change after in_pcbinshash() has been called. 2698 */ 2699 void 2700 in_pcbrehash(struct inpcb *inp) 2701 { 2702 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2703 struct inpcbhead *head; 2704 uint32_t hash; 2705 bool connected; 2706 2707 INP_WLOCK_ASSERT(inp); 2708 INP_HASH_WLOCK_ASSERT(pcbinfo); 2709 KASSERT(inp->inp_flags & INP_INHASHLIST, 2710 ("%s: !INP_INHASHLIST", __func__)); 2711 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2712 ("%s: inp was disconnected", __func__)); 2713 2714 #ifdef INET6 2715 if (inp->inp_vflag & INP_IPV6) { 2716 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2717 inp->inp_fport, pcbinfo->ipi_hashmask); 2718 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2719 } else 2720 #endif 2721 { 2722 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2723 inp->inp_fport, pcbinfo->ipi_hashmask); 2724 connected = !in_nullhost(inp->inp_faddr); 2725 } 2726 2727 /* 2728 * When rehashing, the caller must ensure that either the new or the old 2729 * foreign address was unspecified. 
2730 */ 2731 if (connected) 2732 CK_LIST_REMOVE(inp, inp_hash_wild); 2733 else 2734 CK_LIST_REMOVE(inp, inp_hash_exact); 2735 2736 if (connected) { 2737 head = &pcbinfo->ipi_hash_exact[hash]; 2738 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2739 } else { 2740 head = &pcbinfo->ipi_hash_wild[hash]; 2741 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2742 } 2743 } 2744 2745 /* 2746 * Check for alternatives when higher level complains 2747 * about service problems. For now, invalidate cached 2748 * routing information. If the route was created dynamically 2749 * (by a redirect), time to try a default gateway again. 2750 */ 2751 void 2752 in_losing(struct inpcb *inp) 2753 { 2754 2755 RO_INVALIDATE_CACHE(&inp->inp_route); 2756 return; 2757 } 2758 2759 /* 2760 * A set label operation has occurred at the socket layer, propagate the 2761 * label change into the in_pcb for the socket. 2762 */ 2763 void 2764 in_pcbsosetlabel(struct socket *so) 2765 { 2766 #ifdef MAC 2767 struct inpcb *inp; 2768 2769 inp = sotoinpcb(so); 2770 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2771 2772 INP_WLOCK(inp); 2773 SOCK_LOCK(so); 2774 mac_inpcb_sosetlabel(so, inp); 2775 SOCK_UNLOCK(so); 2776 INP_WUNLOCK(inp); 2777 #endif 2778 } 2779 2780 void 2781 inp_wlock(struct inpcb *inp) 2782 { 2783 2784 INP_WLOCK(inp); 2785 } 2786 2787 void 2788 inp_wunlock(struct inpcb *inp) 2789 { 2790 2791 INP_WUNLOCK(inp); 2792 } 2793 2794 void 2795 inp_rlock(struct inpcb *inp) 2796 { 2797 2798 INP_RLOCK(inp); 2799 } 2800 2801 void 2802 inp_runlock(struct inpcb *inp) 2803 { 2804 2805 INP_RUNLOCK(inp); 2806 } 2807 2808 #ifdef INVARIANT_SUPPORT 2809 void 2810 inp_lock_assert(struct inpcb *inp) 2811 { 2812 2813 INP_WLOCK_ASSERT(inp); 2814 } 2815 2816 void 2817 inp_unlock_assert(struct inpcb *inp) 2818 { 2819 2820 INP_UNLOCK_ASSERT(inp); 2821 } 2822 #endif 2823 2824 void 2825 inp_apply_all(struct inpcbinfo *pcbinfo, 2826 void (*func)(struct inpcb *, void *), void *arg) 2827 { 2828 struct 
inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2829 INPLOOKUP_WLOCKPCB); 2830 struct inpcb *inp; 2831 2832 while ((inp = inp_next(&inpi)) != NULL) 2833 func(inp, arg); 2834 } 2835 2836 struct socket * 2837 inp_inpcbtosocket(struct inpcb *inp) 2838 { 2839 2840 INP_WLOCK_ASSERT(inp); 2841 return (inp->inp_socket); 2842 } 2843 2844 void 2845 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2846 uint32_t *faddr, uint16_t *fp) 2847 { 2848 2849 INP_LOCK_ASSERT(inp); 2850 *laddr = inp->inp_laddr.s_addr; 2851 *faddr = inp->inp_faddr.s_addr; 2852 *lp = inp->inp_lport; 2853 *fp = inp->inp_fport; 2854 } 2855 2856 /* 2857 * Create an external-format (``xinpcb'') structure using the information in 2858 * the kernel-format in_pcb structure pointed to by inp. This is done to 2859 * reduce the spew of irrelevant information over this interface, to isolate 2860 * user code from changes in the kernel structure, and potentially to provide 2861 * information-hiding if we decide that some of this information should be 2862 * hidden from users. 
2863 */ 2864 void 2865 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2866 { 2867 2868 bzero(xi, sizeof(*xi)); 2869 xi->xi_len = sizeof(struct xinpcb); 2870 if (inp->inp_socket) 2871 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2872 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2873 xi->inp_gencnt = inp->inp_gencnt; 2874 xi->inp_flow = inp->inp_flow; 2875 xi->inp_flowid = inp->inp_flowid; 2876 xi->inp_flowtype = inp->inp_flowtype; 2877 xi->inp_flags = inp->inp_flags; 2878 xi->inp_flags2 = inp->inp_flags2; 2879 xi->in6p_cksum = inp->in6p_cksum; 2880 xi->in6p_hops = inp->in6p_hops; 2881 xi->inp_ip_tos = inp->inp_ip_tos; 2882 xi->inp_vflag = inp->inp_vflag; 2883 xi->inp_ip_ttl = inp->inp_ip_ttl; 2884 xi->inp_ip_p = inp->inp_ip_p; 2885 xi->inp_ip_minttl = inp->inp_ip_minttl; 2886 } 2887 2888 int 2889 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2890 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2891 { 2892 struct sockopt sopt; 2893 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2894 INPLOOKUP_WLOCKPCB); 2895 struct inpcb *inp; 2896 struct sockopt_parameters *params; 2897 struct socket *so; 2898 int error; 2899 char buf[1024]; 2900 2901 if (req->oldptr != NULL || req->oldlen != 0) 2902 return (EINVAL); 2903 if (req->newptr == NULL) 2904 return (EPERM); 2905 if (req->newlen > sizeof(buf)) 2906 return (ENOMEM); 2907 error = SYSCTL_IN(req, buf, req->newlen); 2908 if (error != 0) 2909 return (error); 2910 if (req->newlen < sizeof(struct sockopt_parameters)) 2911 return (EINVAL); 2912 params = (struct sockopt_parameters *)buf; 2913 sopt.sopt_level = params->sop_level; 2914 sopt.sopt_name = params->sop_optname; 2915 sopt.sopt_dir = SOPT_SET; 2916 sopt.sopt_val = params->sop_optval; 2917 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2918 sopt.sopt_td = NULL; 2919 #ifdef INET6 2920 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2921 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 
2922 params->sop_inc.inc6_laddr.s6_addr16[1] = 2923 htons(params->sop_inc.inc6_zoneid & 0xffff); 2924 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2925 params->sop_inc.inc6_faddr.s6_addr16[1] = 2926 htons(params->sop_inc.inc6_zoneid & 0xffff); 2927 } 2928 #endif 2929 if (params->sop_inc.inc_lport != htons(0) && 2930 params->sop_inc.inc_fport != htons(0)) { 2931 #ifdef INET6 2932 if (params->sop_inc.inc_flags & INC_ISIPV6) 2933 inpi.hash = INP6_PCBHASH( 2934 ¶ms->sop_inc.inc6_faddr, 2935 params->sop_inc.inc_lport, 2936 params->sop_inc.inc_fport, 2937 pcbinfo->ipi_hashmask); 2938 else 2939 #endif 2940 inpi.hash = INP_PCBHASH( 2941 ¶ms->sop_inc.inc_faddr, 2942 params->sop_inc.inc_lport, 2943 params->sop_inc.inc_fport, 2944 pcbinfo->ipi_hashmask); 2945 } 2946 while ((inp = inp_next(&inpi)) != NULL) 2947 if (inp->inp_gencnt == params->sop_id) { 2948 if (inp->inp_flags & INP_DROPPED) { 2949 INP_WUNLOCK(inp); 2950 return (ECONNRESET); 2951 } 2952 so = inp->inp_socket; 2953 KASSERT(so != NULL, ("inp_socket == NULL")); 2954 soref(so); 2955 if (params->sop_level == SOL_SOCKET) { 2956 INP_WUNLOCK(inp); 2957 error = sosetopt(so, &sopt); 2958 } else 2959 error = (*ctloutput_set)(inp, &sopt); 2960 sorele(so); 2961 break; 2962 } 2963 if (inp == NULL) 2964 error = ESRCH; 2965 return (error); 2966 } 2967 2968 #ifdef DDB 2969 static void 2970 db_print_indent(int indent) 2971 { 2972 int i; 2973 2974 for (i = 0; i < indent; i++) 2975 db_printf(" "); 2976 } 2977 2978 static void 2979 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2980 { 2981 char faddr_str[48], laddr_str[48]; 2982 2983 db_print_indent(indent); 2984 db_printf("%s at %p\n", name, inc); 2985 2986 indent += 2; 2987 2988 #ifdef INET6 2989 if (inc->inc_flags & INC_ISIPV6) { 2990 /* IPv6. */ 2991 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2992 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2993 } else 2994 #endif 2995 { 2996 /* IPv4. 
*/ 2997 inet_ntoa_r(inc->inc_laddr, laddr_str); 2998 inet_ntoa_r(inc->inc_faddr, faddr_str); 2999 } 3000 db_print_indent(indent); 3001 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3002 ntohs(inc->inc_lport)); 3003 db_print_indent(indent); 3004 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3005 ntohs(inc->inc_fport)); 3006 } 3007 3008 static void 3009 db_print_inpflags(int inp_flags) 3010 { 3011 int comma; 3012 3013 comma = 0; 3014 if (inp_flags & INP_RECVOPTS) { 3015 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 3016 comma = 1; 3017 } 3018 if (inp_flags & INP_RECVRETOPTS) { 3019 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 3020 comma = 1; 3021 } 3022 if (inp_flags & INP_RECVDSTADDR) { 3023 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 3024 comma = 1; 3025 } 3026 if (inp_flags & INP_ORIGDSTADDR) { 3027 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 3028 comma = 1; 3029 } 3030 if (inp_flags & INP_HDRINCL) { 3031 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 3032 comma = 1; 3033 } 3034 if (inp_flags & INP_HIGHPORT) { 3035 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 3036 comma = 1; 3037 } 3038 if (inp_flags & INP_LOWPORT) { 3039 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 3040 comma = 1; 3041 } 3042 if (inp_flags & INP_ANONPORT) { 3043 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 3044 comma = 1; 3045 } 3046 if (inp_flags & INP_RECVIF) { 3047 db_printf("%sINP_RECVIF", comma ? ", " : ""); 3048 comma = 1; 3049 } 3050 if (inp_flags & INP_MTUDISC) { 3051 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 3052 comma = 1; 3053 } 3054 if (inp_flags & INP_RECVTTL) { 3055 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 3056 comma = 1; 3057 } 3058 if (inp_flags & INP_DONTFRAG) { 3059 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 3060 comma = 1; 3061 } 3062 if (inp_flags & INP_RECVTOS) { 3063 db_printf("%sINP_RECVTOS", comma ? 
", " : ""); 3064 comma = 1; 3065 } 3066 if (inp_flags & IN6P_IPV6_V6ONLY) { 3067 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3068 comma = 1; 3069 } 3070 if (inp_flags & IN6P_PKTINFO) { 3071 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3072 comma = 1; 3073 } 3074 if (inp_flags & IN6P_HOPLIMIT) { 3075 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3076 comma = 1; 3077 } 3078 if (inp_flags & IN6P_HOPOPTS) { 3079 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3080 comma = 1; 3081 } 3082 if (inp_flags & IN6P_DSTOPTS) { 3083 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3084 comma = 1; 3085 } 3086 if (inp_flags & IN6P_RTHDR) { 3087 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3088 comma = 1; 3089 } 3090 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3091 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3092 comma = 1; 3093 } 3094 if (inp_flags & IN6P_TCLASS) { 3095 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3096 comma = 1; 3097 } 3098 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3099 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3100 comma = 1; 3101 } 3102 if (inp_flags & INP_ONESBCAST) { 3103 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3104 comma = 1; 3105 } 3106 if (inp_flags & INP_DROPPED) { 3107 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3108 comma = 1; 3109 } 3110 if (inp_flags & INP_SOCKREF) { 3111 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3112 comma = 1; 3113 } 3114 if (inp_flags & IN6P_RFC2292) { 3115 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3116 comma = 1; 3117 } 3118 if (inp_flags & IN6P_MTU) { 3119 db_printf("IN6P_MTU%s", comma ? ", " : ""); 3120 comma = 1; 3121 } 3122 } 3123 3124 static void 3125 db_print_inpvflag(u_char inp_vflag) 3126 { 3127 int comma; 3128 3129 comma = 0; 3130 if (inp_vflag & INP_IPV4) { 3131 db_printf("%sINP_IPV4", comma ? ", " : ""); 3132 comma = 1; 3133 } 3134 if (inp_vflag & INP_IPV6) { 3135 db_printf("%sINP_IPV6", comma ? 
", " : ""); 3136 comma = 1; 3137 } 3138 if (inp_vflag & INP_IPV6PROTO) { 3139 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3140 comma = 1; 3141 } 3142 } 3143 3144 static void 3145 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3146 { 3147 3148 db_print_indent(indent); 3149 db_printf("%s at %p\n", name, inp); 3150 3151 indent += 2; 3152 3153 db_print_indent(indent); 3154 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3155 3156 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3157 3158 db_print_indent(indent); 3159 db_printf("inp_label: %p inp_flags: 0x%x (", 3160 inp->inp_label, inp->inp_flags); 3161 db_print_inpflags(inp->inp_flags); 3162 db_printf(")\n"); 3163 3164 db_print_indent(indent); 3165 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3166 inp->inp_vflag); 3167 db_print_inpvflag(inp->inp_vflag); 3168 db_printf(")\n"); 3169 3170 db_print_indent(indent); 3171 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3172 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3173 3174 db_print_indent(indent); 3175 #ifdef INET6 3176 if (inp->inp_vflag & INP_IPV6) { 3177 db_printf("in6p_options: %p in6p_outputopts: %p " 3178 "in6p_moptions: %p\n", inp->in6p_options, 3179 inp->in6p_outputopts, inp->in6p_moptions); 3180 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3181 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3182 inp->in6p_hops); 3183 } else 3184 #endif 3185 { 3186 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3187 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3188 inp->inp_options, inp->inp_moptions); 3189 } 3190 3191 db_print_indent(indent); 3192 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3193 (uintmax_t)inp->inp_gencnt); 3194 } 3195 3196 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3197 { 3198 struct inpcb *inp; 3199 3200 if (!have_addr) { 3201 db_printf("usage: show inpcb <addr>\n"); 3202 return; 3203 } 3204 inp = (struct inpcb *)addr; 3205 3206 db_print_inpcb(inp, "inpcb", 0); 3207 } 3208 #endif /* 
DDB */ 3209 3210 #ifdef RATELIMIT 3211 /* 3212 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3213 * if any. 3214 */ 3215 int 3216 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3217 { 3218 union if_snd_tag_modify_params params = { 3219 .rate_limit.max_rate = max_pacing_rate, 3220 .rate_limit.flags = M_NOWAIT, 3221 }; 3222 struct m_snd_tag *mst; 3223 int error; 3224 3225 mst = inp->inp_snd_tag; 3226 if (mst == NULL) 3227 return (EINVAL); 3228 3229 if (mst->sw->snd_tag_modify == NULL) { 3230 error = EOPNOTSUPP; 3231 } else { 3232 error = mst->sw->snd_tag_modify(mst, ¶ms); 3233 } 3234 return (error); 3235 } 3236 3237 /* 3238 * Query existing TX rate limit based on the existing 3239 * "inp->inp_snd_tag", if any. 3240 */ 3241 int 3242 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3243 { 3244 union if_snd_tag_query_params params = { }; 3245 struct m_snd_tag *mst; 3246 int error; 3247 3248 mst = inp->inp_snd_tag; 3249 if (mst == NULL) 3250 return (EINVAL); 3251 3252 if (mst->sw->snd_tag_query == NULL) { 3253 error = EOPNOTSUPP; 3254 } else { 3255 error = mst->sw->snd_tag_query(mst, ¶ms); 3256 if (error == 0 && p_max_pacing_rate != NULL) 3257 *p_max_pacing_rate = params.rate_limit.max_rate; 3258 } 3259 return (error); 3260 } 3261 3262 /* 3263 * Query existing TX queue level based on the existing 3264 * "inp->inp_snd_tag", if any. 
3265 */ 3266 int 3267 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3268 { 3269 union if_snd_tag_query_params params = { }; 3270 struct m_snd_tag *mst; 3271 int error; 3272 3273 mst = inp->inp_snd_tag; 3274 if (mst == NULL) 3275 return (EINVAL); 3276 3277 if (mst->sw->snd_tag_query == NULL) 3278 return (EOPNOTSUPP); 3279 3280 error = mst->sw->snd_tag_query(mst, ¶ms); 3281 if (error == 0 && p_txqueue_level != NULL) 3282 *p_txqueue_level = params.rate_limit.queue_level; 3283 return (error); 3284 } 3285 3286 /* 3287 * Allocate a new TX rate limit send tag from the network interface 3288 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3289 */ 3290 int 3291 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3292 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3293 3294 { 3295 union if_snd_tag_alloc_params params = { 3296 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3297 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3298 .rate_limit.hdr.flowid = flowid, 3299 .rate_limit.hdr.flowtype = flowtype, 3300 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3301 .rate_limit.max_rate = max_pacing_rate, 3302 .rate_limit.flags = M_NOWAIT, 3303 }; 3304 int error; 3305 3306 INP_WLOCK_ASSERT(inp); 3307 3308 /* 3309 * If there is already a send tag, or the INP is being torn 3310 * down, allocating a new send tag is not allowed. Else send 3311 * tags may leak. 
3312 */ 3313 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3314 return (EINVAL); 3315 3316 error = m_snd_tag_alloc(ifp, ¶ms, st); 3317 #ifdef INET 3318 if (error == 0) { 3319 counter_u64_add(rate_limit_set_ok, 1); 3320 counter_u64_add(rate_limit_active, 1); 3321 } else if (error != EOPNOTSUPP) 3322 counter_u64_add(rate_limit_alloc_fail, 1); 3323 #endif 3324 return (error); 3325 } 3326 3327 void 3328 in_pcbdetach_tag(struct m_snd_tag *mst) 3329 { 3330 3331 m_snd_tag_rele(mst); 3332 #ifdef INET 3333 counter_u64_add(rate_limit_active, -1); 3334 #endif 3335 } 3336 3337 /* 3338 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3339 * if any: 3340 */ 3341 void 3342 in_pcbdetach_txrtlmt(struct inpcb *inp) 3343 { 3344 struct m_snd_tag *mst; 3345 3346 INP_WLOCK_ASSERT(inp); 3347 3348 mst = inp->inp_snd_tag; 3349 inp->inp_snd_tag = NULL; 3350 3351 if (mst == NULL) 3352 return; 3353 3354 m_snd_tag_rele(mst); 3355 #ifdef INET 3356 counter_u64_add(rate_limit_active, -1); 3357 #endif 3358 } 3359 3360 int 3361 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3362 { 3363 int error; 3364 3365 /* 3366 * If the existing send tag is for the wrong interface due to 3367 * a route change, first drop the existing tag. Set the 3368 * CHANGED flag so that we will keep trying to allocate a new 3369 * tag if we fail to allocate one this time. 3370 */ 3371 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3372 in_pcbdetach_txrtlmt(inp); 3373 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3374 } 3375 3376 /* 3377 * NOTE: When attaching to a network interface a reference is 3378 * made to ensure the network interface doesn't go away until 3379 * all ratelimit connections are gone. The network interface 3380 * pointers compared below represent valid network interfaces, 3381 * except when comparing towards NULL. 
3382 */ 3383 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3384 error = 0; 3385 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3386 if (inp->inp_snd_tag != NULL) 3387 in_pcbdetach_txrtlmt(inp); 3388 error = 0; 3389 } else if (inp->inp_snd_tag == NULL) { 3390 /* 3391 * In order to utilize packet pacing with RSS, we need 3392 * to wait until there is a valid RSS hash before we 3393 * can proceed: 3394 */ 3395 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3396 error = EAGAIN; 3397 } else { 3398 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3399 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3400 } 3401 } else { 3402 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3403 } 3404 if (error == 0 || error == EOPNOTSUPP) 3405 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3406 3407 return (error); 3408 } 3409 3410 /* 3411 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3412 * is set in the fast path and will attach/detach/modify the TX rate 3413 * limit send tag based on the socket's so_max_pacing_rate value. 3414 */ 3415 void 3416 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3417 { 3418 struct socket *socket; 3419 uint32_t max_pacing_rate; 3420 bool did_upgrade; 3421 3422 if (inp == NULL) 3423 return; 3424 3425 socket = inp->inp_socket; 3426 if (socket == NULL) 3427 return; 3428 3429 if (!INP_WLOCKED(inp)) { 3430 /* 3431 * NOTE: If the write locking fails, we need to bail 3432 * out and use the non-ratelimited ring for the 3433 * transmit until there is a new chance to get the 3434 * write lock. 3435 */ 3436 if (!INP_TRY_UPGRADE(inp)) 3437 return; 3438 did_upgrade = 1; 3439 } else { 3440 did_upgrade = 0; 3441 } 3442 3443 /* 3444 * NOTE: The so_max_pacing_rate value is read unlocked, 3445 * because atomic updates are not required since the variable 3446 * is checked at every mbuf we send. It is assumed that the 3447 * variable read itself will be atomic. 
3448 */ 3449 max_pacing_rate = socket->so_max_pacing_rate; 3450 3451 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3452 3453 if (did_upgrade) 3454 INP_DOWNGRADE(inp); 3455 } 3456 3457 /* 3458 * Track route changes for TX rate limiting. 3459 */ 3460 void 3461 in_pcboutput_eagain(struct inpcb *inp) 3462 { 3463 bool did_upgrade; 3464 3465 if (inp == NULL) 3466 return; 3467 3468 if (inp->inp_snd_tag == NULL) 3469 return; 3470 3471 if (!INP_WLOCKED(inp)) { 3472 /* 3473 * NOTE: If the write locking fails, we need to bail 3474 * out and use the non-ratelimited ring for the 3475 * transmit until there is a new chance to get the 3476 * write lock. 3477 */ 3478 if (!INP_TRY_UPGRADE(inp)) 3479 return; 3480 did_upgrade = 1; 3481 } else { 3482 did_upgrade = 0; 3483 } 3484 3485 /* detach rate limiting */ 3486 in_pcbdetach_txrtlmt(inp); 3487 3488 /* make sure new mbuf send tag allocation is made */ 3489 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3490 3491 if (did_upgrade) 3492 INP_DOWNGRADE(inp); 3493 } 3494 3495 #ifdef INET 3496 static void 3497 rl_init(void *st) 3498 { 3499 rate_limit_new = counter_u64_alloc(M_WAITOK); 3500 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3501 rate_limit_active = counter_u64_alloc(M_WAITOK); 3502 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3503 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3504 } 3505 3506 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3507 #endif 3508 #endif /* RATELIMIT */ 3509