1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * Copyright (c) 2021-2022 Gleb Smirnoff <glebius@FreeBSD.org> 9 * All rights reserved. 10 * 11 * Portions of this software were developed by Robert N. M. Watson under 12 * contract to Juniper Networks, Inc. 13 * 14 * Redistribution and use in source and binary forms, with or without 15 * modification, are permitted provided that the following conditions 16 * are met: 17 * 1. Redistributions of source code must retain the above copyright 18 * notice, this list of conditions and the following disclaimer. 19 * 2. Redistributions in binary form must reproduce the above copyright 20 * notice, this list of conditions and the following disclaimer in the 21 * documentation and/or other materials provided with the distribution. 22 * 3. Neither the name of the University nor the names of its contributors 23 * may be used to endorse or promote products derived from this software 24 * without specific prior written permission. 25 * 26 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 27 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 28 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 29 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 30 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 31 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 32 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 33 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 34 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 35 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 36 * SUCH DAMAGE. 
37 */ 38 39 #include "opt_ddb.h" 40 #include "opt_ipsec.h" 41 #include "opt_inet.h" 42 #include "opt_inet6.h" 43 #include "opt_ratelimit.h" 44 #include "opt_route.h" 45 #include "opt_rss.h" 46 47 #include <sys/param.h> 48 #include <sys/hash.h> 49 #include <sys/systm.h> 50 #include <sys/libkern.h> 51 #include <sys/lock.h> 52 #include <sys/malloc.h> 53 #include <sys/mbuf.h> 54 #include <sys/eventhandler.h> 55 #include <sys/domain.h> 56 #include <sys/proc.h> 57 #include <sys/protosw.h> 58 #include <sys/smp.h> 59 #include <sys/smr.h> 60 #include <sys/socket.h> 61 #include <sys/socketvar.h> 62 #include <sys/sockio.h> 63 #include <sys/priv.h> 64 #include <sys/proc.h> 65 #include <sys/refcount.h> 66 #include <sys/jail.h> 67 #include <sys/kernel.h> 68 #include <sys/sysctl.h> 69 70 #ifdef DDB 71 #include <ddb/ddb.h> 72 #endif 73 74 #include <vm/uma.h> 75 #include <vm/vm.h> 76 77 #include <net/if.h> 78 #include <net/if_var.h> 79 #include <net/if_private.h> 80 #include <net/if_types.h> 81 #include <net/if_llatbl.h> 82 #include <net/route.h> 83 #include <net/rss_config.h> 84 #include <net/vnet.h> 85 86 #if defined(INET) || defined(INET6) 87 #include <netinet/in.h> 88 #include <netinet/in_pcb.h> 89 #include <netinet/in_pcb_var.h> 90 #include <netinet/tcp.h> 91 #ifdef INET 92 #include <netinet/in_var.h> 93 #include <netinet/in_fib.h> 94 #endif 95 #include <netinet/ip_var.h> 96 #ifdef INET6 97 #include <netinet/ip6.h> 98 #include <netinet6/in6_pcb.h> 99 #include <netinet6/in6_var.h> 100 #include <netinet6/ip6_var.h> 101 #endif /* INET6 */ 102 #include <net/route/nhop.h> 103 #endif 104 105 #include <netipsec/ipsec_support.h> 106 107 #include <security/mac/mac_framework.h> 108 109 #define INPCBLBGROUP_SIZMIN 8 110 #define INPCBLBGROUP_SIZMAX 256 111 112 #define INP_FREED 0x00000200 /* Went through in_pcbfree(). */ 113 #define INP_INLBGROUP 0x01000000 /* Inserted into inpcblbgroup. 
*/ 114 115 /* 116 * These configure the range of local port addresses assigned to 117 * "unspecified" outgoing connections/packets/whatever. 118 */ 119 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 120 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 121 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 122 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 123 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 124 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 125 126 /* 127 * Reserved ports accessible only to root. There are significant 128 * security considerations that must be accounted for when changing these, 129 * but the security benefits can be great. Please be careful. 130 */ 131 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 132 VNET_DEFINE(int, ipport_reservedlow); 133 134 /* Enable random ephemeral port allocation by default. 
*/ 135 VNET_DEFINE(int, ipport_randomized) = 1; 136 137 #ifdef INET 138 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 139 struct in_addr faddr, u_int fport_arg, 140 struct in_addr laddr, u_int lport_arg, 141 int lookupflags, uint8_t numa_domain, int fib); 142 143 #define RANGECHK(var, min, max) \ 144 if ((var) < (min)) { (var) = (min); } \ 145 else if ((var) > (max)) { (var) = (max); } 146 147 static int 148 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 149 { 150 int error; 151 152 error = sysctl_handle_int(oidp, arg1, arg2, req); 153 if (error == 0) { 154 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 155 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 156 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 157 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 158 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 159 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 160 } 161 return (error); 162 } 163 164 #undef RANGECHK 165 166 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 167 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 168 "IP Ports"); 169 170 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 171 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 172 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 173 ""); 174 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 175 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 176 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 177 ""); 178 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 179 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 180 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 181 ""); 182 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 183 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 184 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 185 ""); 186 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 
187 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 188 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 189 ""); 190 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 191 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 192 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 193 ""); 194 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 195 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 196 &VNET_NAME(ipport_reservedhigh), 0, ""); 197 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 198 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 199 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 200 CTLFLAG_VNET | CTLFLAG_RW, 201 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 202 203 #ifdef RATELIMIT 204 counter_u64_t rate_limit_new; 205 counter_u64_t rate_limit_chg; 206 counter_u64_t rate_limit_active; 207 counter_u64_t rate_limit_alloc_fail; 208 counter_u64_t rate_limit_set_ok; 209 210 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 211 "IP Rate Limiting"); 212 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 213 &rate_limit_active, "Active rate limited connections"); 214 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 215 &rate_limit_alloc_fail, "Rate limited connection failures"); 216 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 217 &rate_limit_set_ok, "Rate limited setting succeeded"); 218 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 219 &rate_limit_new, "Total Rate limit new attempts"); 220 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 221 &rate_limit_chg, "Total Rate limited change attempts"); 222 #endif /* RATELIMIT */ 223 224 #endif /* INET */ 225 226 VNET_DEFINE(uint32_t, in_pcbhashseed); 227 static void 228 in_pcbhashseed_init(void) 229 { 230 231 V_in_pcbhashseed = arc4random(); 232 } 233 
VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST,
    in_pcbhashseed_init, NULL);

#ifdef INET
VNET_DEFINE_STATIC(int, connect_inaddr_wild) = 0;
#define	V_connect_inaddr_wild	VNET(connect_inaddr_wild)
SYSCTL_INT(_net_inet_ip, OID_AUTO, connect_inaddr_wild,
    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(connect_inaddr_wild), 0,
    "Allow connecting to INADDR_ANY or INADDR_BROADCAST for connect(2)");
#endif

static void in_pcbremhash(struct inpcb *);

/*
 * in_pcb.c: manage the Protocol Control Blocks.
 *
 * NOTE: It is assumed that most of these functions will be called with
 * the pcbinfo lock held, and often, the inpcb lock held, as these utility
 * functions often modify hash chains or addresses in pcbs.
 */

/*
 * Allocate a zero-filled load balance group with room for 'size' inpcb
 * slots and initialise its identifying fields from the arguments.  A
 * reference is taken on 'cred'.  The allocation is M_NOWAIT; NULL is
 * returned when it fails.  The new group is not linked into any hash chain.
 */
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct ucred *cred, u_char vflag, uint16_t port,
    const union in_dependaddr *addr, int size, uint8_t numa_domain, int fib)
{
	struct inpcblbgroup *grp;
	size_t bytes;

	/* Header plus 'size' trailing il_inp[] slots. */
	bytes = __offsetof(struct inpcblbgroup, il_inp[size]);
	grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT);
	if (grp == NULL)
		return (NULL);
	LIST_INIT(&grp->il_pending);
	grp->il_cred = crhold(cred);
	grp->il_vflag = vflag;
	grp->il_lport = port;
	grp->il_numa_domain = numa_domain;
	grp->il_fibnum = fib;
	grp->il_dependladdr = *addr;
	grp->il_inpsiz = size;
	return (grp);
}

/*
 * Epoch callback scheduled by in_pcblbgroup_free(): drop the group's
 * credential reference and release its memory.
 */
static void
in_pcblbgroup_free_deferred(epoch_context_t ctx)
{
	struct inpcblbgroup *grp;

	grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx);
	crfree(grp->il_cred);
	free(grp, M_PCB);
}

/*
 * Unlink a group from its hash chain and defer the actual free through
 * NET_EPOCH_CALL().  The group's pending list must already be empty.
 */
static void
in_pcblbgroup_free(struct inpcblbgroup *grp)
{
	KASSERT(LIST_EMPTY(&grp->il_pending),
	    ("local group %p still has pending inps", grp));

	CK_LIST_REMOVE(grp, il_list);
	NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx);
}

/*
 * Find the load balance group that holds 'inp', either in its il_inp[]
 * array or on its pending list.  Only the chain selected by the inpcb's
 * local port is searched.  When nothing matches, the value left in 'grp'
 * by CK_LIST_FOREACH() is returned; the caller in this file compares the
 * result against NULL.
 */
static struct inpcblbgroup *
in_pcblbgroup_find(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;
	struct inpcblbgroup *grp;
	struct inpcblbgrouphead *hdr;

	INP_LOCK_ASSERT(inp);

	pcbinfo = inp->inp_pcbinfo;
	INP_HASH_LOCK_ASSERT(pcbinfo);

	hdr = &pcbinfo->ipi_lbgrouphashbase[
	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		struct inpcb *inp1;

		for (unsigned int i = 0; i < grp->il_inpcnt; i++) {
			if (inp == grp->il_inp[i])
				goto found;
		}
		LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) {
			if (inp == inp1)
				goto found;
		}
	}
found:
	return (grp);
}

/*
 * Add 'inp' to 'grp' and mark it with INP_INLBGROUP.  The group must have
 * a free il_inp[] slot.  Depending on the socket state the inpcb goes on
 * the pending list or into the il_inp[] array (see below).
 */
static void
in_pcblbgroup_insert(struct inpcblbgroup *grp, struct inpcb *inp)
{
	KASSERT(grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid local group size %d and count %d", grp->il_inpsiz,
	    grp->il_inpcnt));
	INP_WLOCK_ASSERT(inp);

	if (inp->inp_socket->so_proto->pr_listen != pr_listen_notsupp &&
	    !SOLISTENING(inp->inp_socket)) {
		/*
		 * If this is a TCP socket, it should not be visible to lbgroup
		 * lookups until listen() has been called.
		 */
		LIST_INSERT_HEAD(&grp->il_pending, inp, inp_lbgroup_list);
		grp->il_pendcnt++;
	} else {
		grp->il_inp[grp->il_inpcnt] = inp;

		/*
		 * Synchronize with in_pcblookup_lbgroup(): make sure that we
		 * don't expose a null slot to the lookup path.
		 */
		atomic_store_rel_int(&grp->il_inpcnt, grp->il_inpcnt + 1);
	}

	inp->inp_flags |= INP_INLBGROUP;
}

/*
 * Replace 'old_grp' with a newly allocated group of 'size' slots.  The new
 * group gets the old group's identity, its il_inp[] members and its
 * pending list, and is linked at the head of 'hdr'; the old group is then
 * freed.  Returns the new group, or NULL if the allocation failed, in
 * which case 'old_grp' is left untouched.
 */
static struct inpcblbgroup *
in_pcblbgroup_resize(struct inpcblbgrouphead *hdr,
    struct inpcblbgroup *old_grp, int size)
{
	struct inpcblbgroup *grp;
	int i;

	grp = in_pcblbgroup_alloc(old_grp->il_cred, old_grp->il_vflag,
	    old_grp->il_lport, &old_grp->il_dependladdr, size,
	    old_grp->il_numa_domain, old_grp->il_fibnum);
	if (grp == NULL)
		return (NULL);

	KASSERT(old_grp->il_inpcnt < grp->il_inpsiz,
	    ("invalid new local group size %d and old local group count %d",
	     grp->il_inpsiz, old_grp->il_inpcnt));

	for (i = 0; i < old_grp->il_inpcnt; ++i)
		grp->il_inp[i] = old_grp->il_inp[i];
	grp->il_inpcnt = old_grp->il_inpcnt;
	/* The new group is fully populated before it is linked in. */
	CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	LIST_SWAP(&old_grp->il_pending, &grp->il_pending, inpcb,
	    inp_lbgroup_list);
	grp->il_pendcnt = old_grp->il_pendcnt;
	old_grp->il_pendcnt = 0;
	in_pcblbgroup_free(old_grp);
	return (grp);
}

/*
 * Add PCB to load balance group for SO_REUSEPORT_LB option.
 *
 * A group is matched on prison, vflag, local port, NUMA domain, FIB and
 * local address; a new group of INPCBLBGROUP_SIZMIN slots is created when
 * none matches, and a full group is doubled in size until it has reached
 * INPCBLBGROUP_SIZMAX.
 *
 * Returns 0 on success or ENOMEM if a group could not be allocated.  Note
 * that 0 is also returned, without inserting the inpcb, for an IPv4-mapped
 * wildcard AF_INET6 socket and when the group has hit its size limit.
 */
static int
in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
	const static struct timeval interval = { 60, 0 };
	static struct timeval lastprint;
	struct inpcbinfo *pcbinfo;
	struct inpcblbgrouphead *hdr;
	struct inpcblbgroup *grp;
	uint32_t idx;
	int fib;

	pcbinfo = inp->inp_pcbinfo;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(pcbinfo);

	fib = (inp->inp_flags & INP_BOUNDFIB) != 0 ?
	    inp->inp_inc.inc_fibnum : RT_ALL_FIBS;

#ifdef INET6
	/*
	 * Don't allow IPv4 mapped INET6 wild socket.
	 */
	if ((inp->inp_vflag & INP_IPV4) &&
	    inp->inp_laddr.s_addr == INADDR_ANY &&
	    INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) {
		return (0);
	}
#endif

	idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask);
	hdr = &pcbinfo->ipi_lbgrouphashbase[idx];
	CK_LIST_FOREACH(grp, hdr, il_list) {
		if (grp->il_cred->cr_prison == inp->inp_cred->cr_prison &&
		    grp->il_vflag == inp->inp_vflag &&
		    grp->il_lport == inp->inp_lport &&
		    grp->il_numa_domain == numa_domain &&
		    grp->il_fibnum == fib &&
		    memcmp(&grp->il_dependladdr,
		    &inp->inp_inc.inc_ie.ie_dependladdr,
		    sizeof(grp->il_dependladdr)) == 0) {
			break;
		}
	}
	if (grp == NULL) {
		/* Create new load balance group. */
		grp = in_pcblbgroup_alloc(inp->inp_cred, inp->inp_vflag,
		    inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
		    INPCBLBGROUP_SIZMIN, numa_domain, fib);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
		CK_LIST_INSERT_HEAD(hdr, grp, il_list);
	} else if (grp->il_inpcnt + grp->il_pendcnt == grp->il_inpsiz) {
		/* Active and pending members together fill every slot. */
		if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) {
			/* Log at most once per 'interval' (60 seconds). */
			if (ratecheck(&lastprint, &interval))
				printf("lb group port %d, limit reached\n",
				    ntohs(grp->il_lport));
			return (0);
		}

		/* Expand this local group. */
		grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2);
		if (grp == NULL)
			return (ENOMEM);
		in_pcblbgroup_insert(grp, inp);
	} else {
		in_pcblbgroup_insert(grp, inp);
	}
	return (0);
}

/*
 * Remove PCB from load balance group.
461 */ 462 static void 463 in_pcbremlbgrouphash(struct inpcb *inp) 464 { 465 struct inpcbinfo *pcbinfo; 466 struct inpcblbgrouphead *hdr; 467 struct inpcblbgroup *grp; 468 struct inpcb *inp1; 469 int i; 470 471 pcbinfo = inp->inp_pcbinfo; 472 473 INP_WLOCK_ASSERT(inp); 474 MPASS(inp->inp_flags & INP_INLBGROUP); 475 INP_HASH_WLOCK_ASSERT(pcbinfo); 476 477 hdr = &pcbinfo->ipi_lbgrouphashbase[ 478 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 479 CK_LIST_FOREACH(grp, hdr, il_list) { 480 for (i = 0; i < grp->il_inpcnt; ++i) { 481 if (grp->il_inp[i] != inp) 482 continue; 483 484 if (grp->il_inpcnt == 1 && 485 LIST_EMPTY(&grp->il_pending)) { 486 /* We are the last, free this local group. */ 487 in_pcblbgroup_free(grp); 488 } else { 489 grp->il_inp[i] = 490 grp->il_inp[grp->il_inpcnt - 1]; 491 492 /* 493 * Synchronize with in_pcblookup_lbgroup(). 494 */ 495 atomic_store_rel_int(&grp->il_inpcnt, 496 grp->il_inpcnt - 1); 497 } 498 inp->inp_flags &= ~INP_INLBGROUP; 499 return; 500 } 501 LIST_FOREACH(inp1, &grp->il_pending, inp_lbgroup_list) { 502 if (inp == inp1) { 503 LIST_REMOVE(inp, inp_lbgroup_list); 504 grp->il_pendcnt--; 505 inp->inp_flags &= ~INP_INLBGROUP; 506 return; 507 } 508 } 509 } 510 __assert_unreachable(); 511 } 512 513 int 514 in_pcblbgroup_numa(struct inpcb *inp, int arg) 515 { 516 struct inpcbinfo *pcbinfo; 517 int error; 518 uint8_t numa_domain; 519 520 switch (arg) { 521 case TCP_REUSPORT_LB_NUMA_NODOM: 522 numa_domain = M_NODOM; 523 break; 524 case TCP_REUSPORT_LB_NUMA_CURDOM: 525 numa_domain = PCPU_GET(domain); 526 break; 527 default: 528 if (arg < 0 || arg >= vm_ndomains) 529 return (EINVAL); 530 numa_domain = arg; 531 } 532 533 pcbinfo = inp->inp_pcbinfo; 534 INP_WLOCK_ASSERT(inp); 535 INP_HASH_WLOCK(pcbinfo); 536 if (in_pcblbgroup_find(inp) != NULL) { 537 /* Remove it from the old group. */ 538 in_pcbremlbgrouphash(inp); 539 /* Add it to the new group based on numa domain. 
*/ 540 in_pcbinslbgrouphash(inp, numa_domain); 541 error = 0; 542 } else { 543 error = ENOENT; 544 } 545 INP_HASH_WUNLOCK(pcbinfo); 546 return (error); 547 } 548 549 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 550 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 551 552 /* 553 * Initialize an inpcbinfo - a per-VNET instance of connections db. 554 */ 555 void 556 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 557 u_int hash_nelements, u_int porthash_nelements) 558 { 559 560 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 561 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 562 NULL, MTX_DEF); 563 #ifdef VIMAGE 564 pcbinfo->ipi_vnet = curvnet; 565 #endif 566 CK_LIST_INIT(&pcbinfo->ipi_listhead); 567 pcbinfo->ipi_count = 0; 568 pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, 569 &pcbinfo->ipi_hashmask); 570 pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, 571 &pcbinfo->ipi_hashmask); 572 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 573 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 574 &pcbinfo->ipi_porthashmask); 575 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 576 &pcbinfo->ipi_lbgrouphashmask); 577 pcbinfo->ipi_zone = pcbstor->ips_zone; 578 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 579 } 580 581 /* 582 * Destroy an inpcbinfo. 
583 */ 584 void 585 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 586 { 587 588 KASSERT(pcbinfo->ipi_count == 0, 589 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 590 591 hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); 592 hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); 593 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 594 pcbinfo->ipi_porthashmask); 595 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 596 pcbinfo->ipi_lbgrouphashmask); 597 mtx_destroy(&pcbinfo->ipi_hash_lock); 598 mtx_destroy(&pcbinfo->ipi_lock); 599 } 600 601 /* 602 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 603 */ 604 static void inpcb_fini(void *, int); 605 void 606 in_pcbstorage_init(void *arg) 607 { 608 struct inpcbstorage *pcbstor = arg; 609 610 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 611 pcbstor->ips_size, NULL, NULL, pcbstor->ips_pcbinit, 612 inpcb_fini, UMA_ALIGN_CACHE, UMA_ZONE_SMR); 613 } 614 615 /* 616 * Destroy a pcbstorage - used by unloadable protocols. 617 */ 618 void 619 in_pcbstorage_destroy(void *arg) 620 { 621 struct inpcbstorage *pcbstor = arg; 622 623 uma_zdestroy(pcbstor->ips_zone); 624 } 625 626 /* 627 * Allocate a PCB and associate it with the socket. 628 * On success return with the PCB locked. 
 *
 * Returns 0 on success, ENOBUFS if no inpcb could be allocated, or the
 * error from mac_inpcb_init() / ipsec_init_pcbpolicy() when those are
 * compiled in and fail; on failure the inpcb is released again and
 * so->so_pcb is not set.
 */
int
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo)
{
	struct inpcb *inp;
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
	int error;
#endif

	inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT);
	if (inp == NULL)
		return (ENOBUFS);
	/* Only the region starting at inp_start_zero is cleared. */
	bzero(&inp->inp_start_zero, inp_zero_size);
#ifdef NUMA
	inp->inp_numa_domain = M_NODOM;
#endif
	inp->inp_pcbinfo = pcbinfo;
	inp->inp_socket = so;
	inp->inp_cred = crhold(so->so_cred);
	inp->inp_inc.inc_fibnum = so->so_fibnum;
#ifdef MAC
	error = mac_inpcb_init(inp, M_NOWAIT);
	if (error != 0)
		goto out;
	mac_inpcb_create(so, inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	error = ipsec_init_pcbpolicy(inp);
	if (error != 0) {
#ifdef MAC
		/* Undo the MAC label set up above before bailing out. */
		mac_inpcb_destroy(inp);
#endif
		goto out;
	}
#endif /*IPSEC*/
	/*
	 * Address family flags.  With both INET and INET6 compiled in, the
	 * preprocessor blocks below form a single if/else: AF_INET6 sockets
	 * take the first branch, everything else just gets INP_IPV4.
	 */
#ifdef INET6
	if (INP_SOCKAF(so) == AF_INET6) {
		inp->inp_vflag |= INP_IPV6PROTO | INP_IPV6;
		if (V_ip6_v6only)
			inp->inp_flags |= IN6P_IPV6_V6ONLY;
#ifdef INET
		else
			inp->inp_vflag |= INP_IPV4;
#endif
		if (V_ip6_auto_flowlabel)
			inp->inp_flags |= IN6P_AUTOFLOWLABEL;
		inp->in6p_hops = -1;	/* use kernel default */
	}
#endif
#if defined(INET) && defined(INET6)
	else
#endif
#ifdef INET
		inp->inp_vflag |= INP_IPV4;
#endif
	inp->inp_smr = SMR_SEQ_INVALID;

	/*
	 * Routes in inpcb's can cache L2 as well; they are guaranteed
	 * to be cleaned up.
	 */
	inp->inp_route.ro_flags = RT_LLE_CACHE;
	refcount_init(&inp->inp_refcount, 1);	/* Reference from socket. */
	/* The inpcb write lock is taken here and still held on return. */
	INP_WLOCK(inp);
	INP_INFO_WLOCK(pcbinfo);
	pcbinfo->ipi_count++;
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);
	so->so_pcb = inp;

	return (0);

#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC)
out:
	/* Error path: drop the credential reference and free the inpcb. */
	crfree(inp->inp_cred);
#ifdef INVARIANTS
	inp->inp_cred = NULL;
#endif
	uma_zfree_smr(pcbinfo->ipi_zone, inp);
	return (error);
#endif
}

#ifdef INET
/*
 * Bind an unbound inpcb to the address/port in 'sin', or to an
 * automatically chosen port when 'sin' is NULL or its port is zero.
 *
 * The inpcb must not yet have a local port or local address, otherwise
 * EINVAL is returned.  The address and port are selected by
 * in_pcbbind_setup() and the inpcb is then entered into the hash with
 * in_pcbinshash(); if that fails, the local address, local port and
 * INP_BOUNDFIB are reset and the error is returned.  INP_ANONPORT is set
 * when the port was chosen automatically.
 */
int
in_pcbbind(struct inpcb *inp, struct sockaddr_in *sin, int flags,
    struct ucred *cred)
{
	int anonport, error;

	KASSERT(sin == NULL || sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin == NULL || sin->sin_len == sizeof(struct sockaddr_in),
	    ("%s: invalid address length for %p", __func__, sin));
	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	anonport = sin == NULL || sin->sin_port == 0;
	error = in_pcbbind_setup(inp, sin, &inp->inp_laddr.s_addr,
	    &inp->inp_lport, flags, cred);
	if (error)
		return (error);
	if (__predict_false((error = in_pcbinshash(inp)) != 0)) {
		/* Only expected for SO_REUSEPORT_LB sockets; undo the bind. */
		MPASS(inp->inp_socket->so_options & SO_REUSEPORT_LB);
		inp->inp_laddr.s_addr = INADDR_ANY;
		inp->inp_lport = 0;
		inp->inp_flags &= ~INP_BOUNDFIB;
		return (error);
	}
	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}
#endif

#if defined(INET) || defined(INET6)
/*
 * Assign a local port like in_pcb_lport(), but also used with connect()
 * and a foreign address and port.  If fsa is non-NULL, choose a local port
 * that is unused with those, otherwise one that is completely unused.
 * lsa can be NULL for IPv6.
753 */ 754 int 755 in_pcb_lport_dest(const struct inpcb *inp, struct sockaddr *lsa, 756 u_short *lportp, struct sockaddr *fsa, u_short fport, struct ucred *cred, 757 int lookupflags) 758 { 759 struct inpcbinfo *pcbinfo; 760 struct inpcb *tmpinp; 761 unsigned short *lastport; 762 int count, error; 763 u_short aux, first, last, lport; 764 #ifdef INET 765 struct in_addr laddr, faddr; 766 #endif 767 #ifdef INET6 768 struct in6_addr *laddr6, *faddr6; 769 #endif 770 771 pcbinfo = inp->inp_pcbinfo; 772 773 /* 774 * Because no actual state changes occur here, a global write lock on 775 * the pcbinfo isn't required. 776 */ 777 INP_LOCK_ASSERT(inp); 778 INP_HASH_LOCK_ASSERT(pcbinfo); 779 780 if (inp->inp_flags & INP_HIGHPORT) { 781 first = V_ipport_hifirstauto; /* sysctl */ 782 last = V_ipport_hilastauto; 783 lastport = &pcbinfo->ipi_lasthi; 784 } else if (inp->inp_flags & INP_LOWPORT) { 785 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 786 if (error) 787 return (error); 788 first = V_ipport_lowfirstauto; /* 1023 */ 789 last = V_ipport_lowlastauto; /* 600 */ 790 lastport = &pcbinfo->ipi_lastlow; 791 } else { 792 first = V_ipport_firstauto; /* sysctl */ 793 last = V_ipport_lastauto; 794 lastport = &pcbinfo->ipi_lastport; 795 } 796 797 /* 798 * Instead of having two loops further down counting up or down 799 * make sure that first is always <= last and go with only one 800 * code path implementing all logic. 
801 */ 802 if (first > last) { 803 aux = first; 804 first = last; 805 last = aux; 806 } 807 808 #ifdef INET 809 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 810 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 811 if (lsa != NULL) 812 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 813 if (fsa != NULL) 814 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 815 } 816 #endif 817 #ifdef INET6 818 laddr6 = NULL; 819 if ((inp->inp_vflag & INP_IPV6) != 0) { 820 if (lsa != NULL) 821 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 822 if (fsa != NULL) 823 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 824 } 825 #endif 826 827 tmpinp = NULL; 828 829 if (V_ipport_randomized) 830 *lastport = first + (arc4random() % (last - first)); 831 832 count = last - first; 833 834 do { 835 if (count-- < 0) /* completely used? */ 836 return (EADDRNOTAVAIL); 837 ++*lastport; 838 if (*lastport < first || *lastport > last) 839 *lastport = first; 840 lport = htons(*lastport); 841 842 if (fsa != NULL) { 843 #ifdef INET 844 if (lsa->sa_family == AF_INET) { 845 tmpinp = in_pcblookup_hash_locked(pcbinfo, 846 faddr, fport, laddr, lport, lookupflags, 847 M_NODOM, RT_ALL_FIBS); 848 } 849 #endif 850 #ifdef INET6 851 if (lsa->sa_family == AF_INET6) { 852 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 853 faddr6, fport, laddr6, lport, lookupflags, 854 M_NODOM, RT_ALL_FIBS); 855 } 856 #endif 857 } else { 858 #ifdef INET6 859 if ((inp->inp_vflag & INP_IPV6) != 0) { 860 tmpinp = in6_pcblookup_local(pcbinfo, 861 &inp->in6p_laddr, lport, RT_ALL_FIBS, 862 lookupflags, cred); 863 #ifdef INET 864 if (tmpinp == NULL && 865 (inp->inp_vflag & INP_IPV4)) 866 tmpinp = in_pcblookup_local(pcbinfo, 867 laddr, lport, RT_ALL_FIBS, 868 lookupflags, cred); 869 #endif 870 } 871 #endif 872 #if defined(INET) && defined(INET6) 873 else 874 #endif 875 #ifdef INET 876 tmpinp = in_pcblookup_local(pcbinfo, laddr, 877 lport, RT_ALL_FIBS, lookupflags, cred); 878 #endif 879 } 880 } while (tmpinp != NULL); 
881 882 *lportp = lport; 883 884 return (0); 885 } 886 887 /* 888 * Select a local port (number) to use. 889 */ 890 int 891 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 892 struct ucred *cred, int lookupflags) 893 { 894 struct sockaddr_in laddr; 895 896 if (laddrp) { 897 bzero(&laddr, sizeof(laddr)); 898 laddr.sin_family = AF_INET; 899 laddr.sin_addr = *laddrp; 900 } 901 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 902 NULL, lportp, NULL, 0, cred, lookupflags)); 903 } 904 #endif /* INET || INET6 */ 905 906 #ifdef INET 907 /* 908 * Determine whether the inpcb can be bound to the specified address/port tuple. 909 */ 910 static int 911 in_pcbbind_avail(struct inpcb *inp, const struct in_addr laddr, 912 const u_short lport, const int fib, int sooptions, int lookupflags, 913 struct ucred *cred) 914 { 915 int reuseport, reuseport_lb; 916 917 INP_LOCK_ASSERT(inp); 918 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 919 920 reuseport = (sooptions & SO_REUSEPORT); 921 reuseport_lb = (sooptions & SO_REUSEPORT_LB); 922 923 if (IN_MULTICAST(ntohl(laddr.s_addr))) { 924 /* 925 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 926 * allow complete duplication of binding if 927 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 928 * and a multicast address is bound on both 929 * new and duplicated sockets. 930 */ 931 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT)) != 0) 932 reuseport = SO_REUSEADDR | SO_REUSEPORT; 933 /* 934 * XXX: How to deal with SO_REUSEPORT_LB here? 935 * Treat same as SO_REUSEPORT for now. 936 */ 937 if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT_LB)) != 0) 938 reuseport_lb = SO_REUSEADDR | SO_REUSEPORT_LB; 939 } else if (!in_nullhost(laddr)) { 940 struct sockaddr_in sin; 941 942 memset(&sin, 0, sizeof(sin)); 943 sin.sin_family = AF_INET; 944 sin.sin_len = sizeof(sin); 945 sin.sin_addr = laddr; 946 947 /* 948 * Is the address a local IP address? 
949 * If INP_BINDANY is set, then the socket may be bound 950 * to any endpoint address, local or not. 951 */ 952 if ((inp->inp_flags & INP_BINDANY) == 0 && 953 ifa_ifwithaddr_check((const struct sockaddr *)&sin) == 0) 954 return (EADDRNOTAVAIL); 955 } 956 957 if (lport != 0) { 958 struct inpcb *t; 959 960 if (ntohs(lport) <= V_ipport_reservedhigh && 961 ntohs(lport) >= V_ipport_reservedlow && 962 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 963 return (EACCES); 964 965 if (!IN_MULTICAST(ntohl(laddr.s_addr)) && 966 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 967 /* 968 * If a socket owned by a different user is already 969 * bound to this port, fail. In particular, SO_REUSE* 970 * can only be used to share a port among sockets owned 971 * by the same user. 972 * 973 * However, we can share a port with a connected socket 974 * which has a unique 4-tuple. 975 */ 976 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, 977 RT_ALL_FIBS, INPLOOKUP_WILDCARD, cred); 978 if (t != NULL && 979 (inp->inp_socket->so_type != SOCK_STREAM || 980 in_nullhost(t->inp_faddr)) && 981 (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) 982 return (EADDRINUSE); 983 } 984 t = in_pcblookup_local(inp->inp_pcbinfo, laddr, lport, fib, 985 lookupflags, cred); 986 if (t != NULL && ((reuseport | reuseport_lb) & 987 t->inp_socket->so_options) == 0) { 988 #ifdef INET6 989 if (!in_nullhost(laddr) || 990 !in_nullhost(t->inp_laddr) || 991 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 992 (t->inp_vflag & INP_IPV6PROTO) == 0) 993 #endif 994 return (EADDRINUSE); 995 } 996 } 997 return (0); 998 } 999 1000 /* 1001 * Set up a bind operation on a PCB, performing port allocation 1002 * as required, but do not actually modify the PCB. Callers can 1003 * either complete the bind by setting inp_laddr/inp_lport and 1004 * calling in_pcbinshash(), or they can just use the resulting 1005 * port and address to authorise the sending of a once-off packet. 
 *
 * On error, the values of *laddrp and *lportp are not changed.
 *
 * laddrp/lportp are in-out: they carry the current local address and port
 * on entry and receive the selected ones on success.  sin, when non-NULL,
 * is the address requested by the caller; it is only accepted while
 * *laddrp is still INADDR_ANY and may not change a non-zero *lportp.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr_in *sin, in_addr_t *laddrp,
    u_short *lportp, int flags, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct in_addr laddr;
	u_short lport = 0;
	int error, fib, lookupflags, sooptions;

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	laddr.s_addr = *laddrp;
	if (sin != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);

	/* Without any SO_REUSE* option, wildcard bindings conflict too. */
	lookupflags = 0;
	sooptions = atomic_load_int(&so->so_options);
	if ((sooptions & (SO_REUSEADDR | SO_REUSEPORT | SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (sin == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		KASSERT(sin->sin_family == AF_INET,
		    ("%s: invalid family for address %p", __func__, sin));
		KASSERT(sin->sin_len == sizeof(*sin),
		    ("%s: invalid length for address %p", __func__, sin));

		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		laddr = sin->sin_addr;

		/* 'fib' is only assigned and used on this (sin != NULL) path. */
		fib = (flags & INPBIND_FIB) != 0 ? inp->inp_inc.inc_fibnum :
		    RT_ALL_FIBS;

		/* See if this address/port combo is available. */
		error = in_pcbbind_avail(inp, laddr, lport, fib, sooptions,
		    lookupflags, cred);
		if (error != 0)
			return (error);
	}
	if (*lportp != 0)
		lport = *lportp;
	if (lport == 0) {
		/* No port requested and none bound yet: pick one. */
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	/* This flag is the one piece of inpcb state written by this function. */
	if ((flags & INPBIND_FIB) != 0)
		inp->inp_flags |= INP_BOUNDFIB;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If we don't have a local address for this socket yet,
 * then pick one.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred)
{
	struct in_addr laddr, faddr;
	u_short lport;
	int error;
	bool anonport;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);
	KASSERT(in_nullhost(inp->inp_faddr),
	    ("%s: inp is already connected", __func__));
	KASSERT(sin->sin_family == AF_INET,
	    ("%s: invalid address family for %p", __func__, sin));
	KASSERT(sin->sin_len == sizeof(*sin),
	    ("%s: invalid address length for %p", __func__, sin));

	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);

	anonport = (inp->inp_lport == 0);

	if (__predict_false(in_broadcast(sin->sin_addr))) {
		if (!V_connect_inaddr_wild || CK_STAILQ_EMPTY(&V_in_ifaddrhead))
			return (ENETUNREACH);
		/*
		 * If the destination address is INADDR_ANY, use the primary
		 * local address.  If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast, choose the
		 * broadcast address for that interface.
1111 */ 1112 if (in_nullhost(sin->sin_addr)) { 1113 faddr = 1114 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1115 if ((error = prison_get_ip4(cred, &faddr)) != 0) 1116 return (error); 1117 } else if (sin->sin_addr.s_addr == INADDR_BROADCAST && 1118 CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags 1119 & IFF_BROADCAST) { 1120 faddr = satosin(&CK_STAILQ_FIRST( 1121 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1122 } else 1123 faddr = sin->sin_addr; 1124 } else 1125 faddr = sin->sin_addr; 1126 1127 if (in_nullhost(inp->inp_laddr)) { 1128 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1129 if (error) 1130 return (error); 1131 } else 1132 laddr = inp->inp_laddr; 1133 1134 if (anonport) { 1135 struct sockaddr_in lsin = { 1136 .sin_family = AF_INET, 1137 .sin_addr = laddr, 1138 }; 1139 struct sockaddr_in fsin = { 1140 .sin_family = AF_INET, 1141 .sin_addr = faddr, 1142 }; 1143 1144 error = in_pcb_lport_dest(inp, (struct sockaddr *)&lsin, 1145 &lport, (struct sockaddr *)&fsin, sin->sin_port, cred, 1146 INPLOOKUP_WILDCARD); 1147 if (error) 1148 return (error); 1149 } else if (in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1150 sin->sin_port, laddr, inp->inp_lport, 0, M_NODOM, RT_ALL_FIBS) != 1151 NULL) 1152 return (EADDRINUSE); 1153 else 1154 lport = inp->inp_lport; 1155 1156 MPASS(!in_nullhost(inp->inp_laddr) || inp->inp_lport != 0 || 1157 !(inp->inp_flags & INP_INHASHLIST)); 1158 1159 inp->inp_faddr = faddr; 1160 inp->inp_fport = sin->sin_port; 1161 inp->inp_laddr = laddr; 1162 inp->inp_lport = lport; 1163 1164 if ((inp->inp_flags & INP_INHASHLIST) == 0) { 1165 error = in_pcbinshash(inp); 1166 MPASS(error == 0); 1167 } else 1168 in_pcbrehash(inp); 1169 #ifdef ROUTE_MPATH 1170 if (CALC_FLOWID_OUTBOUND) { 1171 uint32_t hash_val, hash_type; 1172 1173 hash_val = fib4_calc_software_hash(inp->inp_laddr, 1174 inp->inp_faddr, 0, sin->sin_port, 1175 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1176 1177 inp->inp_flowid = hash_val; 1178 inp->inp_flowtype = 
hash_type; 1179 } 1180 #endif 1181 if (anonport) 1182 inp->inp_flags |= INP_ANONPORT; 1183 return (0); 1184 } 1185 1186 /* 1187 * Do proper source address selection on an unbound socket in case 1188 * of connect. Take jails into account as well. 1189 */ 1190 int 1191 in_pcbladdr(const struct inpcb *inp, struct in_addr *faddr, 1192 struct in_addr *laddr, struct ucred *cred) 1193 { 1194 struct ifaddr *ifa; 1195 struct sockaddr *sa; 1196 struct sockaddr_in *sin, dst; 1197 struct nhop_object *nh; 1198 int error; 1199 1200 NET_EPOCH_ASSERT(); 1201 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1202 1203 /* 1204 * Bypass source address selection and use the primary jail IP 1205 * if requested. 1206 */ 1207 if (!prison_saddrsel_ip4(cred, laddr)) 1208 return (0); 1209 1210 /* 1211 * If the destination address is multicast and an outgoing 1212 * interface has been set as a multicast option, prefer the 1213 * address of that interface as our source address. 1214 */ 1215 if (IN_MULTICAST(ntohl(faddr->s_addr)) && inp->inp_moptions != NULL && 1216 inp->inp_moptions->imo_multicast_ifp != NULL) { 1217 struct ifnet *ifp = inp->inp_moptions->imo_multicast_ifp; 1218 struct in_ifaddr *ia; 1219 1220 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1221 if (ia->ia_ifp == ifp && 1222 prison_check_ip4(cred, &ia->ia_addr.sin_addr) == 0) 1223 break; 1224 } 1225 if (ia == NULL) 1226 return (EADDRNOTAVAIL); 1227 *laddr = ia->ia_addr.sin_addr; 1228 return (0); 1229 } 1230 1231 error = 0; 1232 1233 nh = NULL; 1234 bzero(&dst, sizeof(dst)); 1235 sin = &dst; 1236 sin->sin_family = AF_INET; 1237 sin->sin_len = sizeof(struct sockaddr_in); 1238 sin->sin_addr.s_addr = faddr->s_addr; 1239 1240 /* 1241 * If route is known our src addr is taken from the i/f, 1242 * else punt. 1243 * 1244 * Find out route to destination. 
1245 */ 1246 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1247 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1248 0, NHR_NONE, 0); 1249 1250 /* 1251 * If we found a route, use the address corresponding to 1252 * the outgoing interface. 1253 * 1254 * Otherwise assume faddr is reachable on a directly connected 1255 * network and try to find a corresponding interface to take 1256 * the source address from. 1257 */ 1258 if (nh == NULL || nh->nh_ifp == NULL) { 1259 struct in_ifaddr *ia; 1260 struct ifnet *ifp; 1261 1262 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1263 inp->inp_socket->so_fibnum)); 1264 if (ia == NULL) { 1265 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1266 inp->inp_socket->so_fibnum)); 1267 } 1268 if (ia == NULL) { 1269 error = ENETUNREACH; 1270 goto done; 1271 } 1272 1273 if (!prison_flag(cred, PR_IP4)) { 1274 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1275 goto done; 1276 } 1277 1278 ifp = ia->ia_ifp; 1279 ia = NULL; 1280 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1281 sa = ifa->ifa_addr; 1282 if (sa->sa_family != AF_INET) 1283 continue; 1284 sin = (struct sockaddr_in *)sa; 1285 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1286 ia = (struct in_ifaddr *)ifa; 1287 break; 1288 } 1289 } 1290 if (ia != NULL) { 1291 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1292 goto done; 1293 } 1294 1295 /* 3. As a last resort return the 'default' jail address. */ 1296 error = prison_get_ip4(cred, laddr); 1297 goto done; 1298 } 1299 1300 /* 1301 * If the outgoing interface on the route found is not 1302 * a loopback interface, use the address from that interface. 1303 * In case of jails do those three steps: 1304 * 1. check if the interface address belongs to the jail. If so use it. 1305 * 2. check if we have any address on the outgoing interface 1306 * belonging to this jail. If so use it. 1307 * 3. as a last resort return the 'default' jail address. 
1308 */ 1309 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1310 struct in_ifaddr *ia; 1311 struct ifnet *ifp; 1312 1313 /* If not jailed, use the default returned. */ 1314 if (!prison_flag(cred, PR_IP4)) { 1315 ia = (struct in_ifaddr *)nh->nh_ifa; 1316 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1317 goto done; 1318 } 1319 1320 /* Jailed. */ 1321 /* 1. Check if the iface address belongs to the jail. */ 1322 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1323 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1324 ia = (struct in_ifaddr *)nh->nh_ifa; 1325 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1326 goto done; 1327 } 1328 1329 /* 1330 * 2. Check if we have any address on the outgoing interface 1331 * belonging to this jail. 1332 */ 1333 ia = NULL; 1334 ifp = nh->nh_ifp; 1335 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1336 sa = ifa->ifa_addr; 1337 if (sa->sa_family != AF_INET) 1338 continue; 1339 sin = (struct sockaddr_in *)sa; 1340 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1341 ia = (struct in_ifaddr *)ifa; 1342 break; 1343 } 1344 } 1345 if (ia != NULL) { 1346 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1347 goto done; 1348 } 1349 1350 /* 3. As a last resort return the 'default' jail address. */ 1351 error = prison_get_ip4(cred, laddr); 1352 goto done; 1353 } 1354 1355 /* 1356 * The outgoing interface is marked with 'loopback net', so a route 1357 * to ourselves is here. 1358 * Try to find the interface of the destination address and then 1359 * take the address from there. That interface is not necessarily 1360 * a loopback interface. 1361 * In case of jails, check that it is an address of the jail 1362 * and if we cannot find, fall back to the 'default' jail address. 
1363 */ 1364 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1365 struct in_ifaddr *ia; 1366 1367 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1368 inp->inp_socket->so_fibnum)); 1369 if (ia == NULL) 1370 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1371 inp->inp_socket->so_fibnum)); 1372 if (ia == NULL) 1373 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1374 1375 if (!prison_flag(cred, PR_IP4)) { 1376 if (ia == NULL) { 1377 error = ENETUNREACH; 1378 goto done; 1379 } 1380 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1381 goto done; 1382 } 1383 1384 /* Jailed. */ 1385 if (ia != NULL) { 1386 struct ifnet *ifp; 1387 1388 ifp = ia->ia_ifp; 1389 ia = NULL; 1390 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1391 sa = ifa->ifa_addr; 1392 if (sa->sa_family != AF_INET) 1393 continue; 1394 sin = (struct sockaddr_in *)sa; 1395 if (prison_check_ip4(cred, 1396 &sin->sin_addr) == 0) { 1397 ia = (struct in_ifaddr *)ifa; 1398 break; 1399 } 1400 } 1401 if (ia != NULL) { 1402 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1403 goto done; 1404 } 1405 } 1406 1407 /* 3. As a last resort return the 'default' jail address. */ 1408 error = prison_get_ip4(cred, laddr); 1409 goto done; 1410 } 1411 1412 done: 1413 if (error == 0 && laddr->s_addr == INADDR_ANY) 1414 return (EHOSTUNREACH); 1415 return (error); 1416 } 1417 1418 void 1419 in_pcbdisconnect(struct inpcb *inp) 1420 { 1421 1422 INP_WLOCK_ASSERT(inp); 1423 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1424 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 1425 ("%s: inp %p was already disconnected", __func__, inp)); 1426 1427 in_pcbremhash_locked(inp); 1428 1429 /* See the comment in in_pcbinshash(). 
*/ 1430 inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); 1431 inp->inp_laddr.s_addr = INADDR_ANY; 1432 inp->inp_faddr.s_addr = INADDR_ANY; 1433 inp->inp_fport = 0; 1434 } 1435 #endif /* INET */ 1436 1437 void 1438 in_pcblisten(struct inpcb *inp) 1439 { 1440 struct inpcblbgroup *grp; 1441 1442 INP_WLOCK_ASSERT(inp); 1443 1444 if ((inp->inp_flags & INP_INLBGROUP) != 0) { 1445 struct inpcbinfo *pcbinfo; 1446 1447 pcbinfo = inp->inp_pcbinfo; 1448 INP_HASH_WLOCK(pcbinfo); 1449 grp = in_pcblbgroup_find(inp); 1450 LIST_REMOVE(inp, inp_lbgroup_list); 1451 grp->il_pendcnt--; 1452 in_pcblbgroup_insert(grp, inp); 1453 INP_HASH_WUNLOCK(pcbinfo); 1454 } 1455 } 1456 1457 /* 1458 * inpcb hash lookups are protected by SMR section. 1459 * 1460 * Once desired pcb has been found, switching from SMR section to a pcb 1461 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1462 * here because SMR is a critical section. 1463 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1464 */ 1465 void 1466 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1467 { 1468 1469 lock == INPLOOKUP_RLOCKPCB ? 1470 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1471 } 1472 1473 void 1474 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1475 { 1476 1477 lock == INPLOOKUP_RLOCKPCB ? 1478 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1479 } 1480 1481 int 1482 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1483 { 1484 1485 return (lock == INPLOOKUP_RLOCKPCB ? 
/*
 * Switch from the SMR read section to the inpcb lock.
 *
 * Must be called inside the pcbinfo SMR section; that section is exited on
 * every return path.
 *
 * inp      - pcb found during a lockless lookup.
 * lock     - INPLOOKUP_RLOCKPCB or INPLOOKUP_WLOCKPCB.
 * ignflags - inp_flags bits that make the pcb unacceptable to the caller.
 *
 * Returns true with the pcb locked as requested, or false with no lock
 * held when the pcb has any of ignflags set or could not be referenced.
 */
static inline bool
_inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock, const int ignflags)
{

	MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB);
	SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr);

	/* Fast path: the lock is uncontended, no reference is needed. */
	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & ignflags)) {
			smr_exit(inp->inp_pcbinfo->ipi_smr);
			inp_unlock(inp, lock);
			return (false);
		}
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (true);
	}

	/*
	 * Slow path: we have to sleep on the lock, which is not allowed
	 * inside the SMR section.  Pin the pcb with a reference first so
	 * that it stays valid after smr_exit(), then block on the lock.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		inp_lock(inp, lock);
		/* Ours was the last reference: the pcb has just been freed. */
		if (__predict_false(in_pcbrele(inp, lock)))
			return (false);
		/*
		 * An inp acquired through refcount & lock for sure didn't go
		 * through uma_zfree().  However, it may have already gone
		 * through in_pcbfree() and have another reference that
		 * prevented its release by our in_pcbrele().
		 */
		if (__predict_false(inp->inp_flags & ignflags)) {
			inp_unlock(inp, lock);
			return (false);
		}
		return (true);
	} else {
		/* Reference count already dropped to zero. */
		smr_exit(inp->inp_pcbinfo->ipi_smr);
		return (false);
	}
}
/* First element of the list being walked: all-pcbs list or one hash slot. */
#define II_LIST_FIRST(ipi, hash)					\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_FIRST(&(ipi)->ipi_listhead) :		\
		    CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)]))
/* Successor of inp on the list being walked. */
#define II_LIST_NEXT(inp, hash)						\
		(((hash) == INP_ALL_LIST) ?				\
		    CK_LIST_NEXT((inp), inp_list) :			\
		    CK_LIST_NEXT((inp), inp_hash_exact))
/* Assert that inp is locked the way the iterator was configured. */
#define II_LOCK_ASSERT(inp, lock)					\
		rw_assert(&(inp)->inp_lock,				\
		    (lock) == INPLOOKUP_RLOCKPCB ?  RA_RLOCKED : RA_WLOCKED )
/*
 * Advance the iterator and return the next matching inpcb, locked according
 * to ii->lock, or NULL at the end of the list.  The previously returned
 * inpcb (ii->inp) is unlocked here before the new one is stored in ii->inp;
 * at the end of traversal ii->inp becomes NULL.
 */
struct inpcb *
inp_next(struct inpcb_iterator *ii)
{
	const struct inpcbinfo *ipi = ii->ipi;
	inp_match_t *match = ii->match;
	void *ctx = ii->ctx;
	inp_lookup_t lock = ii->lock;
	int hash = ii->hash;
	struct inpcb *inp;

	if (ii->inp == NULL) {		/* First call. */
		smr_enter(ipi->ipi_smr);
		/* This is unrolled CK_LIST_FOREACH(). */
		for (inp = II_LIST_FIRST(ipi, hash);
		    inp != NULL;
		    inp = II_LIST_NEXT(inp, hash)) {
			if (match != NULL && (match)(inp, ctx) == false)
				continue;
			/* On success the SMR section has already been exited. */
			if (__predict_true(_inp_smr_lock(inp, lock, INP_FREED)))
				break;
			else {
				/*
				 * _inp_smr_lock() left the SMR section, so
				 * the list position is lost: re-enter and
				 * start over from the head.
				 */
				smr_enter(ipi->ipi_smr);
				MPASS(inp != II_LIST_FIRST(ipi, hash));
				inp = II_LIST_FIRST(ipi, hash);
				if (inp == NULL)
					break;
			}
		}

		/* inp == NULL means we are still inside the SMR section. */
		if (inp == NULL)
			smr_exit(ipi->ipi_smr);
		else
			ii->inp = inp;

		return (inp);
	}

	/* Not a first call. */
	smr_enter(ipi->ipi_smr);
restart:
	/* ii->inp is still locked by us and anchors the traversal. */
	inp = ii->inp;
	II_LOCK_ASSERT(inp, lock);
next:
	inp = II_LIST_NEXT(inp, hash);
	if (inp == NULL) {
		/* End of list: 'found' unlocks ii->inp and stores NULL. */
		smr_exit(ipi->ipi_smr);
		goto found;
	}

	if (match != NULL && (match)(inp, ctx) == false)
		goto next;

	if (__predict_true(inp_trylock(inp, lock))) {
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			/*
			 * Entries are never inserted in the middle of a list,
			 * thus as long as we are in SMR, we can continue
			 * traversal.  A jump to 'restart' should yield the
			 * same result, but could produce unnecessary looping.
			 * Could this looping be unbounded?
			 */
			inp_unlock(inp, lock);
			goto next;
		} else {
			smr_exit(ipi->ipi_smr);
			goto found;
		}
	}

	/*
	 * Can't obtain lock immediately, thus going hard.  Once we exit the
	 * SMR section we can no longer jump to 'next', and our only stable
	 * anchoring point is ii->inp, which we keep locked for this case, so
	 * we jump to 'restart'.
	 */
	if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) {
		smr_exit(ipi->ipi_smr);
		inp_lock(inp, lock);
		/* in_pcbrele() returning true means the pcb was just freed. */
		if (__predict_false(in_pcbrele(inp, lock))) {
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
		/*
		 * The pcb was not released by our in_pcbrele(), but it may
		 * already have been through in_pcbfree() while another
		 * reference keeps it alive; skip it in that case.
		 */
		if (__predict_false(inp->inp_flags & INP_FREED)) {
			inp_unlock(inp, lock);
			smr_enter(ipi->ipi_smr);
			goto restart;
		}
	} else
		goto next;	/* Refcount is zero; still in SMR, move on. */

found:
	/* Release the previous element and publish the new one (or NULL). */
	inp_unlock(ii->inp, lock);
	ii->inp = inp;

	return (ii->inp);
}
/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count.  The inpcb must be write-locked, still attached to its
 * socket and not yet marked INP_FREED.
 *
 * The pcb is removed from the hash (if inserted) and from the pcbinfo list,
 * marked INP_FREED, detached from the socket, and its options are released.
 * If another thread holds a temporary reference (acquired using
 * in_pcbref()) the final free is deferred until that reference is released
 * using in_pcbrele_(r|w)locked().  In either case the inpcb is no longer
 * locked by the caller on return.
 */
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
#ifdef INET
	struct ip_moptions *imo;
#endif
#ifdef INET6
	struct ip6_moptions *im6o;
#endif

	INP_WLOCK_ASSERT(inp);
	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));
	KASSERT((inp->inp_flags & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));

	/*
	 * in_pcblookup_local() and in6_pcblookup_local() may return an inpcb
	 * from the hash without acquiring inpcb lock, they rely on the hash
	 * lock, thus in_pcbremhash() should be the first action.
	 */
	if (inp->inp_flags & INP_INHASHLIST)
		in_pcbremhash(inp);
	/* Unlink from the global list and bump the generation count. */
	INP_INFO_WLOCK(pcbinfo);
	inp->inp_gencnt = ++pcbinfo->ipi_gencnt;
	pcbinfo->ipi_count--;
	CK_LIST_REMOVE(inp, inp_list);
	INP_INFO_WUNLOCK(pcbinfo);

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Mark freed and sever the socket <-> pcb linkage in both directions. */
	inp->inp_flags |= INP_FREED;
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;

	RO_INVALIDATE_CACHE(&inp->inp_route);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	DEBUG_POISON_POINTER(inp->inp_options);
	/* Save the multicast options; they are freed after the pcb unlock. */
	imo = inp->inp_moptions;
	DEBUG_POISON_POINTER(inp->inp_moptions);
#endif
#ifdef INET6
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		DEBUG_POISON_POINTER(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		DEBUG_POISON_POINTER(inp->in6p_moptions);
	} else
		im6o = NULL;
#endif

	/*
	 * Drop our reference.  in_pcbrele_wlocked() unlocks and frees the
	 * pcb when this was the last one; otherwise somebody else still
	 * holds a reference and we only have to unlock.  The pcb must not
	 * be touched past this point.
	 */
	if (__predict_false(in_pcbrele_wlocked(inp) == false)) {
		INP_WUNLOCK(inp);
	}
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
}
1860 * 1861 * It is used by TCP to mark an inpcb as unused and avoid future packet 1862 * delivery or event notification when a socket remains open but TCP has 1863 * closed. This might occur as a result of a shutdown()-initiated TCP close 1864 * or a RST on the wire, and allows the port binding to be reused while still 1865 * maintaining the invariant that so_pcb always points to a valid inpcb until 1866 * in_pcbdetach(). 1867 * 1868 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1869 * in_pcbpurgeif0()? 1870 */ 1871 void 1872 in_pcbdrop(struct inpcb *inp) 1873 { 1874 1875 INP_WLOCK_ASSERT(inp); 1876 1877 inp->inp_flags |= INP_DROPPED; 1878 if (inp->inp_flags & INP_INHASHLIST) 1879 in_pcbremhash(inp); 1880 } 1881 1882 #ifdef INET 1883 /* 1884 * Common routines to return the socket addresses associated with inpcbs. 1885 */ 1886 int 1887 in_getsockaddr(struct socket *so, struct sockaddr *sa) 1888 { 1889 struct inpcb *inp; 1890 1891 inp = sotoinpcb(so); 1892 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1893 1894 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1895 .sin_len = sizeof(struct sockaddr_in), 1896 .sin_family = AF_INET, 1897 .sin_port = inp->inp_lport, 1898 .sin_addr = inp->inp_laddr, 1899 }; 1900 1901 return (0); 1902 } 1903 1904 int 1905 in_getpeeraddr(struct socket *so, struct sockaddr *sa) 1906 { 1907 struct inpcb *inp; 1908 1909 inp = sotoinpcb(so); 1910 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1911 1912 *(struct sockaddr_in *)sa = (struct sockaddr_in ){ 1913 .sin_len = sizeof(struct sockaddr_in), 1914 .sin_family = AF_INET, 1915 .sin_port = inp->inp_fport, 1916 .sin_addr = inp->inp_faddr, 1917 }; 1918 1919 return (0); 1920 } 1921 1922 static bool 1923 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 1924 { 1925 1926 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 1927 return (true); 1928 else 1929 return (false); 1930 } 1931 1932 void 1933 in_pcbpurgeif0(struct inpcbinfo 
*pcbinfo, struct ifnet *ifp) 1934 { 1935 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 1936 inp_v4_multi_match, NULL); 1937 struct inpcb *inp; 1938 struct in_multi *inm; 1939 struct in_mfilter *imf; 1940 struct ip_moptions *imo; 1941 1942 IN_MULTI_LOCK_ASSERT(); 1943 1944 while ((inp = inp_next(&inpi)) != NULL) { 1945 INP_WLOCK_ASSERT(inp); 1946 1947 imo = inp->inp_moptions; 1948 /* 1949 * Unselect the outgoing interface if it is being 1950 * detached. 1951 */ 1952 if (imo->imo_multicast_ifp == ifp) 1953 imo->imo_multicast_ifp = NULL; 1954 1955 /* 1956 * Drop multicast group membership if we joined 1957 * through the interface being detached. 1958 * 1959 * XXX This can all be deferred to an epoch_call 1960 */ 1961 restart: 1962 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 1963 if ((inm = imf->imf_inm) == NULL) 1964 continue; 1965 if (inm->inm_ifp != ifp) 1966 continue; 1967 ip_mfilter_remove(&imo->imo_head, imf); 1968 in_leavegroup_locked(inm, NULL); 1969 ip_mfilter_free(imf); 1970 goto restart; 1971 } 1972 } 1973 } 1974 1975 /* 1976 * Lookup a PCB based on the local address and port. Caller must hold the 1977 * hash lock. No inpcb locks or references are acquired. 
1978 */ 1979 #define INP_LOOKUP_MAPPED_PCB_COST 3 1980 struct inpcb * 1981 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 1982 u_short lport, int fib, int lookupflags, struct ucred *cred) 1983 { 1984 struct inpcb *inp; 1985 #ifdef INET6 1986 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 1987 #else 1988 int matchwild = 3; 1989 #endif 1990 int wildcard; 1991 1992 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 1993 ("%s: invalid lookup flags %d", __func__, lookupflags)); 1994 KASSERT(fib == RT_ALL_FIBS || (fib >= 0 && fib < V_rt_numfibs), 1995 ("%s: invalid fib %d", __func__, fib)); 1996 1997 INP_HASH_LOCK_ASSERT(pcbinfo); 1998 1999 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2000 struct inpcbhead *head; 2001 /* 2002 * Look for an unconnected (wildcard foreign addr) PCB that 2003 * matches the local address and port we're looking for. 2004 */ 2005 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2006 pcbinfo->ipi_hashmask)]; 2007 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2008 #ifdef INET6 2009 /* XXX inp locking */ 2010 if ((inp->inp_vflag & INP_IPV4) == 0) 2011 continue; 2012 #endif 2013 if (inp->inp_faddr.s_addr == INADDR_ANY && 2014 inp->inp_laddr.s_addr == laddr.s_addr && 2015 inp->inp_lport == lport && (fib == RT_ALL_FIBS || 2016 inp->inp_inc.inc_fibnum == fib)) { 2017 /* 2018 * Found? 2019 */ 2020 if (prison_equal_ip4(cred->cr_prison, 2021 inp->inp_cred->cr_prison)) 2022 return (inp); 2023 } 2024 } 2025 /* 2026 * Not found. 2027 */ 2028 return (NULL); 2029 } else { 2030 struct inpcbhead *porthash; 2031 struct inpcb *match = NULL; 2032 2033 /* 2034 * Port is in use by one or more PCBs. Look for best 2035 * fit. 
2036 */ 2037 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2038 pcbinfo->ipi_porthashmask)]; 2039 CK_LIST_FOREACH(inp, porthash, inp_portlist) { 2040 if (inp->inp_lport != lport) 2041 continue; 2042 if (!prison_equal_ip4(inp->inp_cred->cr_prison, 2043 cred->cr_prison)) 2044 continue; 2045 if (fib != RT_ALL_FIBS && 2046 inp->inp_inc.inc_fibnum != fib) 2047 continue; 2048 wildcard = 0; 2049 #ifdef INET6 2050 /* XXX inp locking */ 2051 if ((inp->inp_vflag & INP_IPV4) == 0) 2052 continue; 2053 /* 2054 * We never select the PCB that has INP_IPV6 flag and 2055 * is bound to :: if we have another PCB which is bound 2056 * to 0.0.0.0. If a PCB has the INP_IPV6 flag, then we 2057 * set its cost higher than IPv4 only PCBs. 2058 * 2059 * Note that the case only happens when a socket is 2060 * bound to ::, under the condition that the use of the 2061 * mapped address is allowed. 2062 */ 2063 if ((inp->inp_vflag & INP_IPV6) != 0) 2064 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2065 #endif 2066 if (inp->inp_faddr.s_addr != INADDR_ANY) 2067 wildcard++; 2068 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2069 if (laddr.s_addr == INADDR_ANY) 2070 wildcard++; 2071 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2072 continue; 2073 } else { 2074 if (laddr.s_addr != INADDR_ANY) 2075 wildcard++; 2076 } 2077 if (wildcard < matchwild) { 2078 match = inp; 2079 matchwild = wildcard; 2080 if (matchwild == 0) 2081 break; 2082 } 2083 } 2084 return (match); 2085 } 2086 } 2087 #undef INP_LOOKUP_MAPPED_PCB_COST 2088 2089 static bool 2090 in_pcblookup_lb_match(const struct inpcblbgroup *grp, int domain, int fib) 2091 { 2092 return ((domain == M_NODOM || domain == grp->il_numa_domain) && 2093 (fib == RT_ALL_FIBS || fib == grp->il_fibnum)); 2094 } 2095 2096 static struct inpcb * 2097 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2098 const struct in_addr *faddr, uint16_t fport, const struct in_addr *laddr, 2099 uint16_t lport, int domain, int fib) 2100 { 2101 const struct 
inpcblbgrouphead *hdr; 2102 struct inpcblbgroup *grp; 2103 struct inpcblbgroup *jail_exact, *jail_wild, *local_exact, *local_wild; 2104 struct inpcb *inp; 2105 u_int count; 2106 2107 INP_HASH_LOCK_ASSERT(pcbinfo); 2108 NET_EPOCH_ASSERT(); 2109 2110 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2111 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 2112 2113 /* 2114 * Search for an LB group match based on the following criteria: 2115 * - prefer jailed groups to non-jailed groups 2116 * - prefer exact source address matches to wildcard matches 2117 * - prefer groups bound to the specified NUMA domain 2118 */ 2119 jail_exact = jail_wild = local_exact = local_wild = NULL; 2120 CK_LIST_FOREACH(grp, hdr, il_list) { 2121 bool injail; 2122 2123 #ifdef INET6 2124 if (!(grp->il_vflag & INP_IPV4)) 2125 continue; 2126 #endif 2127 if (grp->il_lport != lport) 2128 continue; 2129 2130 injail = prison_flag(grp->il_cred, PR_IP4) != 0; 2131 if (injail && prison_check_ip4_locked(grp->il_cred->cr_prison, 2132 laddr) != 0) 2133 continue; 2134 2135 if (grp->il_laddr.s_addr == laddr->s_addr) { 2136 if (injail) { 2137 jail_exact = grp; 2138 if (in_pcblookup_lb_match(grp, domain, fib)) 2139 /* This is a perfect match. */ 2140 goto out; 2141 } else if (local_exact == NULL || 2142 in_pcblookup_lb_match(grp, domain, fib)) { 2143 local_exact = grp; 2144 } 2145 } else if (grp->il_laddr.s_addr == INADDR_ANY) { 2146 if (injail) { 2147 if (jail_wild == NULL || 2148 in_pcblookup_lb_match(grp, domain, fib)) 2149 jail_wild = grp; 2150 } else if (local_wild == NULL || 2151 in_pcblookup_lb_match(grp, domain, fib)) { 2152 local_wild = grp; 2153 } 2154 } 2155 } 2156 2157 if (jail_exact != NULL) 2158 grp = jail_exact; 2159 else if (jail_wild != NULL) 2160 grp = jail_wild; 2161 else if (local_exact != NULL) 2162 grp = local_exact; 2163 else 2164 grp = local_wild; 2165 if (grp == NULL) 2166 return (NULL); 2167 2168 out: 2169 /* 2170 * Synchronize with in_pcblbgroup_insert(). 
2171 */ 2172 count = atomic_load_acq_int(&grp->il_inpcnt); 2173 if (count == 0) 2174 return (NULL); 2175 inp = grp->il_inp[INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % count]; 2176 KASSERT(inp != NULL, ("%s: inp == NULL", __func__)); 2177 return (inp); 2178 } 2179 2180 static bool 2181 in_pcblookup_exact_match(const struct inpcb *inp, struct in_addr faddr, 2182 u_short fport, struct in_addr laddr, u_short lport) 2183 { 2184 #ifdef INET6 2185 /* XXX inp locking */ 2186 if ((inp->inp_vflag & INP_IPV4) == 0) 2187 return (false); 2188 #endif 2189 if (inp->inp_faddr.s_addr == faddr.s_addr && 2190 inp->inp_laddr.s_addr == laddr.s_addr && 2191 inp->inp_fport == fport && 2192 inp->inp_lport == lport) 2193 return (true); 2194 return (false); 2195 } 2196 2197 static struct inpcb * 2198 in_pcblookup_hash_exact(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2199 u_short fport, struct in_addr laddr, u_short lport) 2200 { 2201 struct inpcbhead *head; 2202 struct inpcb *inp; 2203 2204 INP_HASH_LOCK_ASSERT(pcbinfo); 2205 2206 head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, 2207 pcbinfo->ipi_hashmask)]; 2208 CK_LIST_FOREACH(inp, head, inp_hash_exact) { 2209 if (in_pcblookup_exact_match(inp, faddr, fport, laddr, lport)) 2210 return (inp); 2211 } 2212 return (NULL); 2213 } 2214 2215 typedef enum { 2216 INPLOOKUP_MATCH_NONE = 0, 2217 INPLOOKUP_MATCH_WILD = 1, 2218 INPLOOKUP_MATCH_LADDR = 2, 2219 } inp_lookup_match_t; 2220 2221 static inp_lookup_match_t 2222 in_pcblookup_wild_match(const struct inpcb *inp, struct in_addr laddr, 2223 u_short lport, int fib) 2224 { 2225 #ifdef INET6 2226 /* XXX inp locking */ 2227 if ((inp->inp_vflag & INP_IPV4) == 0) 2228 return (INPLOOKUP_MATCH_NONE); 2229 #endif 2230 if (inp->inp_faddr.s_addr != INADDR_ANY || inp->inp_lport != lport) 2231 return (INPLOOKUP_MATCH_NONE); 2232 if (fib != RT_ALL_FIBS && inp->inp_inc.inc_fibnum != fib) 2233 return (INPLOOKUP_MATCH_NONE); 2234 if (inp->inp_laddr.s_addr == INADDR_ANY) 2235 return 
(INPLOOKUP_MATCH_WILD); 2236 if (inp->inp_laddr.s_addr == laddr.s_addr) 2237 return (INPLOOKUP_MATCH_LADDR); 2238 return (INPLOOKUP_MATCH_NONE); 2239 } 2240 2241 #define INP_LOOKUP_AGAIN ((struct inpcb *)(uintptr_t)-1) 2242 2243 static struct inpcb * 2244 in_pcblookup_hash_wild_smr(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2245 u_short lport, int fib, const inp_lookup_t lockflags) 2246 { 2247 struct inpcbhead *head; 2248 struct inpcb *inp; 2249 2250 KASSERT(SMR_ENTERED(pcbinfo->ipi_smr), 2251 ("%s: not in SMR read section", __func__)); 2252 2253 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2254 pcbinfo->ipi_hashmask)]; 2255 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2256 inp_lookup_match_t match; 2257 2258 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2259 if (match == INPLOOKUP_MATCH_NONE) 2260 continue; 2261 2262 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2263 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2264 if (match != INPLOOKUP_MATCH_NONE && 2265 prison_check_ip4_locked(inp->inp_cred->cr_prison, 2266 &laddr) == 0) 2267 return (inp); 2268 inp_unlock(inp, lockflags); 2269 } 2270 2271 /* 2272 * The matching socket disappeared out from under us. Fall back 2273 * to a serialized lookup. 2274 */ 2275 return (INP_LOOKUP_AGAIN); 2276 } 2277 return (NULL); 2278 } 2279 2280 static struct inpcb * 2281 in_pcblookup_hash_wild_locked(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2282 u_short lport, int fib) 2283 { 2284 struct inpcbhead *head; 2285 struct inpcb *inp, *local_wild, *local_exact, *jail_wild; 2286 #ifdef INET6 2287 struct inpcb *local_wild_mapped; 2288 #endif 2289 2290 INP_HASH_LOCK_ASSERT(pcbinfo); 2291 2292 /* 2293 * Order of socket selection - we always prefer jails. 2294 * 1. jailed, non-wild. 2295 * 2. jailed, wild. 2296 * 3. non-jailed, non-wild. 2297 * 4. non-jailed, wild. 
2298 */ 2299 head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, 2300 pcbinfo->ipi_hashmask)]; 2301 local_wild = local_exact = jail_wild = NULL; 2302 #ifdef INET6 2303 local_wild_mapped = NULL; 2304 #endif 2305 CK_LIST_FOREACH(inp, head, inp_hash_wild) { 2306 inp_lookup_match_t match; 2307 bool injail; 2308 2309 match = in_pcblookup_wild_match(inp, laddr, lport, fib); 2310 if (match == INPLOOKUP_MATCH_NONE) 2311 continue; 2312 2313 injail = prison_flag(inp->inp_cred, PR_IP4) != 0; 2314 if (injail) { 2315 if (prison_check_ip4_locked(inp->inp_cred->cr_prison, 2316 &laddr) != 0) 2317 continue; 2318 } else { 2319 if (local_exact != NULL) 2320 continue; 2321 } 2322 2323 if (match == INPLOOKUP_MATCH_LADDR) { 2324 if (injail) 2325 return (inp); 2326 local_exact = inp; 2327 } else { 2328 #ifdef INET6 2329 /* XXX inp locking, NULL check */ 2330 if (inp->inp_vflag & INP_IPV6PROTO) 2331 local_wild_mapped = inp; 2332 else 2333 #endif 2334 if (injail) 2335 jail_wild = inp; 2336 else 2337 local_wild = inp; 2338 } 2339 } 2340 if (jail_wild != NULL) 2341 return (jail_wild); 2342 if (local_exact != NULL) 2343 return (local_exact); 2344 if (local_wild != NULL) 2345 return (local_wild); 2346 #ifdef INET6 2347 if (local_wild_mapped != NULL) 2348 return (local_wild_mapped); 2349 #endif 2350 return (NULL); 2351 } 2352 2353 /* 2354 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2355 * that the caller has either locked the hash list, which usually happens 2356 * for bind(2) operations, or is in SMR section, which happens when sorting 2357 * out incoming packets. 
2358 */ 2359 static struct inpcb * 2360 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2361 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2362 uint8_t numa_domain, int fib) 2363 { 2364 struct inpcb *inp; 2365 const u_short fport = fport_arg, lport = lport_arg; 2366 2367 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD | INPLOOKUP_FIB)) == 0, 2368 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2369 KASSERT(faddr.s_addr != INADDR_ANY, 2370 ("%s: invalid foreign address", __func__)); 2371 KASSERT(laddr.s_addr != INADDR_ANY, 2372 ("%s: invalid local address", __func__)); 2373 INP_HASH_WLOCK_ASSERT(pcbinfo); 2374 2375 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2376 if (inp != NULL) 2377 return (inp); 2378 2379 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2380 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2381 &laddr, lport, numa_domain, fib); 2382 if (inp == NULL) { 2383 inp = in_pcblookup_hash_wild_locked(pcbinfo, laddr, 2384 lport, fib); 2385 } 2386 } 2387 2388 return (inp); 2389 } 2390 2391 static struct inpcb * 2392 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2393 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2394 uint8_t numa_domain, int fib) 2395 { 2396 struct inpcb *inp; 2397 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2398 2399 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2400 ("%s: LOCKPCB not set", __func__)); 2401 2402 INP_HASH_WLOCK(pcbinfo); 2403 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2404 lookupflags & ~INPLOOKUP_LOCKMASK, numa_domain, fib); 2405 if (inp != NULL && !inp_trylock(inp, lockflags)) { 2406 in_pcbref(inp); 2407 INP_HASH_WUNLOCK(pcbinfo); 2408 inp_lock(inp, lockflags); 2409 if (in_pcbrele(inp, lockflags)) 2410 /* XXX-MJ or retry until we get a negative match? 
*/ 2411 inp = NULL; 2412 } else { 2413 INP_HASH_WUNLOCK(pcbinfo); 2414 } 2415 return (inp); 2416 } 2417 2418 static struct inpcb * 2419 in_pcblookup_hash_smr(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2420 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2421 uint8_t numa_domain, int fib) 2422 { 2423 struct inpcb *inp; 2424 const inp_lookup_t lockflags = lookupflags & INPLOOKUP_LOCKMASK; 2425 const u_short fport = fport_arg, lport = lport_arg; 2426 2427 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2428 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2429 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2430 ("%s: LOCKPCB not set", __func__)); 2431 2432 smr_enter(pcbinfo->ipi_smr); 2433 inp = in_pcblookup_hash_exact(pcbinfo, faddr, fport, laddr, lport); 2434 if (inp != NULL) { 2435 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2436 /* 2437 * Revalidate the 4-tuple, the socket could have been 2438 * disconnected. 2439 */ 2440 if (__predict_true(in_pcblookup_exact_match(inp, 2441 faddr, fport, laddr, lport))) 2442 return (inp); 2443 inp_unlock(inp, lockflags); 2444 } 2445 2446 /* 2447 * We failed to lock the inpcb, or its connection state changed 2448 * out from under us. Fall back to a precise search. 
2449 */ 2450 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2451 lookupflags, numa_domain, fib)); 2452 } 2453 2454 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2455 inp = in_pcblookup_lbgroup(pcbinfo, &faddr, fport, 2456 &laddr, lport, numa_domain, fib); 2457 if (inp != NULL) { 2458 if (__predict_true(inp_smr_lock(inp, lockflags))) { 2459 if (__predict_true(in_pcblookup_wild_match(inp, 2460 laddr, lport, fib) != INPLOOKUP_MATCH_NONE)) 2461 return (inp); 2462 inp_unlock(inp, lockflags); 2463 } 2464 inp = INP_LOOKUP_AGAIN; 2465 } else { 2466 inp = in_pcblookup_hash_wild_smr(pcbinfo, laddr, lport, 2467 fib, lockflags); 2468 } 2469 if (inp == INP_LOOKUP_AGAIN) { 2470 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, 2471 lport, lookupflags, numa_domain, fib)); 2472 } 2473 } 2474 2475 if (inp == NULL) 2476 smr_exit(pcbinfo->ipi_smr); 2477 2478 return (inp); 2479 } 2480 2481 /* 2482 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2483 * from which a pre-calculated hash value may be extracted. 2484 */ 2485 struct inpcb * 2486 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2487 struct in_addr laddr, u_int lport, int lookupflags, 2488 struct ifnet *ifp) 2489 { 2490 int fib; 2491 2492 fib = (lookupflags & INPLOOKUP_FIB) ? if_getfib(ifp) : RT_ALL_FIBS; 2493 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2494 lookupflags, M_NODOM, fib)); 2495 } 2496 2497 struct inpcb * 2498 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2499 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2500 struct ifnet *ifp __unused, struct mbuf *m) 2501 { 2502 int fib; 2503 2504 M_ASSERTPKTHDR(m); 2505 fib = (lookupflags & INPLOOKUP_FIB) ? 
M_GETFIB(m) : RT_ALL_FIBS; 2506 return (in_pcblookup_hash_smr(pcbinfo, faddr, fport, laddr, lport, 2507 lookupflags, m->m_pkthdr.numa_domain, fib)); 2508 } 2509 #endif /* INET */ 2510 2511 static bool 2512 in_pcbjailed(const struct inpcb *inp, unsigned int flag) 2513 { 2514 return (prison_flag(inp->inp_cred, flag) != 0); 2515 } 2516 2517 /* 2518 * Insert the PCB into a hash chain using ordering rules which ensure that 2519 * in_pcblookup_hash_wild_*() always encounter the highest-ranking PCB first. 2520 * 2521 * Specifically, keep jailed PCBs in front of non-jailed PCBs, and keep PCBs 2522 * with exact local addresses ahead of wildcard PCBs. Unbound v4-mapped v6 PCBs 2523 * always appear last no matter whether they are jailed. 2524 */ 2525 static void 2526 _in_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2527 { 2528 struct inpcb *last; 2529 bool bound, injail; 2530 2531 INP_LOCK_ASSERT(inp); 2532 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2533 2534 last = NULL; 2535 bound = inp->inp_laddr.s_addr != INADDR_ANY; 2536 if (!bound && (inp->inp_vflag & INP_IPV6PROTO) != 0) { 2537 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2538 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2539 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2540 return; 2541 } 2542 } 2543 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2544 return; 2545 } 2546 2547 injail = in_pcbjailed(inp, PR_IP4); 2548 if (!injail) { 2549 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2550 if (!in_pcbjailed(last, PR_IP4)) 2551 break; 2552 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2553 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2554 return; 2555 } 2556 } 2557 } else if (!CK_LIST_EMPTY(pcbhash) && 2558 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP4)) { 2559 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2560 return; 2561 } 2562 if (!bound) { 2563 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2564 if (last->inp_laddr.s_addr == INADDR_ANY) 2565 break; 2566 if 
(CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2567 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2568 return; 2569 } 2570 } 2571 } 2572 if (last == NULL) 2573 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2574 else 2575 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2576 } 2577 2578 #ifdef INET6 2579 /* 2580 * See the comment above _in_pcbinshash_wild(). 2581 */ 2582 static void 2583 _in6_pcbinshash_wild(struct inpcbhead *pcbhash, struct inpcb *inp) 2584 { 2585 struct inpcb *last; 2586 bool bound, injail; 2587 2588 INP_LOCK_ASSERT(inp); 2589 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2590 2591 last = NULL; 2592 bound = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr); 2593 injail = in_pcbjailed(inp, PR_IP6); 2594 if (!injail) { 2595 CK_LIST_FOREACH(last, pcbhash, inp_hash_wild) { 2596 if (!in_pcbjailed(last, PR_IP6)) 2597 break; 2598 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2599 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2600 return; 2601 } 2602 } 2603 } else if (!CK_LIST_EMPTY(pcbhash) && 2604 !in_pcbjailed(CK_LIST_FIRST(pcbhash), PR_IP6)) { 2605 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2606 return; 2607 } 2608 if (!bound) { 2609 CK_LIST_FOREACH_FROM(last, pcbhash, inp_hash_wild) { 2610 if (IN6_IS_ADDR_UNSPECIFIED(&last->in6p_laddr)) 2611 break; 2612 if (CK_LIST_NEXT(last, inp_hash_wild) == NULL) { 2613 CK_LIST_INSERT_AFTER(last, inp, inp_hash_wild); 2614 return; 2615 } 2616 } 2617 } 2618 if (last == NULL) 2619 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); 2620 else 2621 CK_LIST_INSERT_BEFORE(last, inp, inp_hash_wild); 2622 } 2623 #endif 2624 2625 /* 2626 * Insert PCB onto various hash lists. 2627 * 2628 * With normal sockets this function shall not fail, so it could return void. 2629 * But for SO_REUSEPORT_LB it may need to allocate memory with locks held, 2630 * that's the only condition when it can fail. 
2631 */ 2632 int 2633 in_pcbinshash(struct inpcb *inp) 2634 { 2635 struct inpcbhead *pcbhash, *pcbporthash; 2636 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2637 uint32_t hash; 2638 bool connected; 2639 2640 INP_WLOCK_ASSERT(inp); 2641 INP_HASH_WLOCK_ASSERT(pcbinfo); 2642 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2643 ("in_pcbinshash: INP_INHASHLIST")); 2644 2645 #ifdef INET6 2646 if (inp->inp_vflag & INP_IPV6) { 2647 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2648 inp->inp_fport, pcbinfo->ipi_hashmask); 2649 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2650 } else 2651 #endif 2652 { 2653 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2654 inp->inp_fport, pcbinfo->ipi_hashmask); 2655 connected = !in_nullhost(inp->inp_faddr); 2656 } 2657 2658 if (connected) 2659 pcbhash = &pcbinfo->ipi_hash_exact[hash]; 2660 else 2661 pcbhash = &pcbinfo->ipi_hash_wild[hash]; 2662 2663 pcbporthash = &pcbinfo->ipi_porthashbase[ 2664 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2665 2666 /* 2667 * Ignore SO_REUSEPORT_LB if the socket is connected. Really this case 2668 * should be an error, but for UDP sockets it is not, and some 2669 * applications erroneously set it on connected UDP sockets, so we can't 2670 * change this without breaking compatibility. 2671 */ 2672 if (!connected && 2673 (inp->inp_socket->so_options & SO_REUSEPORT_LB) != 0) { 2674 int error = in_pcbinslbgrouphash(inp, M_NODOM); 2675 if (error != 0) 2676 return (error); 2677 } 2678 2679 /* 2680 * The PCB may have been disconnected in the past. Before we can safely 2681 * make it visible in the hash table, we must wait for all readers which 2682 * may be traversing this PCB to finish. 
2683 */ 2684 if (inp->inp_smr != SMR_SEQ_INVALID) { 2685 smr_wait(pcbinfo->ipi_smr, inp->inp_smr); 2686 inp->inp_smr = SMR_SEQ_INVALID; 2687 } 2688 2689 if (connected) 2690 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); 2691 else { 2692 #ifdef INET6 2693 if ((inp->inp_vflag & INP_IPV6) != 0) 2694 _in6_pcbinshash_wild(pcbhash, inp); 2695 else 2696 #endif 2697 _in_pcbinshash_wild(pcbhash, inp); 2698 } 2699 CK_LIST_INSERT_HEAD(pcbporthash, inp, inp_portlist); 2700 inp->inp_flags |= INP_INHASHLIST; 2701 2702 return (0); 2703 } 2704 2705 void 2706 in_pcbremhash_locked(struct inpcb *inp) 2707 { 2708 2709 INP_WLOCK_ASSERT(inp); 2710 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 2711 MPASS(inp->inp_flags & INP_INHASHLIST); 2712 2713 if ((inp->inp_flags & INP_INLBGROUP) != 0) 2714 in_pcbremlbgrouphash(inp); 2715 #ifdef INET6 2716 if (inp->inp_vflag & INP_IPV6) { 2717 if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) 2718 CK_LIST_REMOVE(inp, inp_hash_wild); 2719 else 2720 CK_LIST_REMOVE(inp, inp_hash_exact); 2721 } else 2722 #endif 2723 { 2724 if (in_nullhost(inp->inp_faddr)) 2725 CK_LIST_REMOVE(inp, inp_hash_wild); 2726 else 2727 CK_LIST_REMOVE(inp, inp_hash_exact); 2728 } 2729 CK_LIST_REMOVE(inp, inp_portlist); 2730 inp->inp_flags &= ~INP_INHASHLIST; 2731 } 2732 2733 static void 2734 in_pcbremhash(struct inpcb *inp) 2735 { 2736 INP_HASH_WLOCK(inp->inp_pcbinfo); 2737 in_pcbremhash_locked(inp); 2738 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 2739 } 2740 2741 /* 2742 * Move PCB to the proper hash bucket when { faddr, fport } have been 2743 * changed. NOTE: This does not handle the case of the lport changing (the 2744 * hashed port list would have to be updated as well), so the lport must 2745 * not change after in_pcbinshash() has been called. 
2746 */ 2747 void 2748 in_pcbrehash(struct inpcb *inp) 2749 { 2750 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2751 struct inpcbhead *head; 2752 uint32_t hash; 2753 bool connected; 2754 2755 INP_WLOCK_ASSERT(inp); 2756 INP_HASH_WLOCK_ASSERT(pcbinfo); 2757 KASSERT(inp->inp_flags & INP_INHASHLIST, 2758 ("%s: !INP_INHASHLIST", __func__)); 2759 KASSERT(inp->inp_smr == SMR_SEQ_INVALID, 2760 ("%s: inp was disconnected", __func__)); 2761 2762 #ifdef INET6 2763 if (inp->inp_vflag & INP_IPV6) { 2764 hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, 2765 inp->inp_fport, pcbinfo->ipi_hashmask); 2766 connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); 2767 } else 2768 #endif 2769 { 2770 hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, 2771 inp->inp_fport, pcbinfo->ipi_hashmask); 2772 connected = !in_nullhost(inp->inp_faddr); 2773 } 2774 2775 /* See the comment in in_pcbinshash(). */ 2776 if (connected && (inp->inp_flags & INP_INLBGROUP) != 0) 2777 in_pcbremlbgrouphash(inp); 2778 2779 /* 2780 * When rehashing, the caller must ensure that either the new or the old 2781 * foreign address was unspecified. 2782 */ 2783 if (connected) 2784 CK_LIST_REMOVE(inp, inp_hash_wild); 2785 else 2786 CK_LIST_REMOVE(inp, inp_hash_exact); 2787 2788 if (connected) { 2789 head = &pcbinfo->ipi_hash_exact[hash]; 2790 CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); 2791 } else { 2792 head = &pcbinfo->ipi_hash_wild[hash]; 2793 CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); 2794 } 2795 } 2796 2797 /* 2798 * Check for alternatives when higher level complains 2799 * about service problems. For now, invalidate cached 2800 * routing information. If the route was created dynamically 2801 * (by a redirect), time to try a default gateway again. 
2802 */ 2803 void 2804 in_losing(struct inpcb *inp) 2805 { 2806 2807 RO_INVALIDATE_CACHE(&inp->inp_route); 2808 return; 2809 } 2810 2811 /* 2812 * A set label operation has occurred at the socket layer, propagate the 2813 * label change into the in_pcb for the socket. 2814 */ 2815 void 2816 in_pcbsosetlabel(struct socket *so) 2817 { 2818 #ifdef MAC 2819 struct inpcb *inp; 2820 2821 inp = sotoinpcb(so); 2822 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2823 2824 INP_WLOCK(inp); 2825 SOCK_LOCK(so); 2826 mac_inpcb_sosetlabel(so, inp); 2827 SOCK_UNLOCK(so); 2828 INP_WUNLOCK(inp); 2829 #endif 2830 } 2831 2832 void 2833 inp_wlock(struct inpcb *inp) 2834 { 2835 2836 INP_WLOCK(inp); 2837 } 2838 2839 void 2840 inp_wunlock(struct inpcb *inp) 2841 { 2842 2843 INP_WUNLOCK(inp); 2844 } 2845 2846 void 2847 inp_rlock(struct inpcb *inp) 2848 { 2849 2850 INP_RLOCK(inp); 2851 } 2852 2853 void 2854 inp_runlock(struct inpcb *inp) 2855 { 2856 2857 INP_RUNLOCK(inp); 2858 } 2859 2860 #ifdef INVARIANT_SUPPORT 2861 void 2862 inp_lock_assert(struct inpcb *inp) 2863 { 2864 2865 INP_WLOCK_ASSERT(inp); 2866 } 2867 2868 void 2869 inp_unlock_assert(struct inpcb *inp) 2870 { 2871 2872 INP_UNLOCK_ASSERT(inp); 2873 } 2874 #endif 2875 2876 void 2877 inp_apply_all(struct inpcbinfo *pcbinfo, 2878 void (*func)(struct inpcb *, void *), void *arg) 2879 { 2880 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2881 INPLOOKUP_WLOCKPCB); 2882 struct inpcb *inp; 2883 2884 while ((inp = inp_next(&inpi)) != NULL) 2885 func(inp, arg); 2886 } 2887 2888 struct socket * 2889 inp_inpcbtosocket(struct inpcb *inp) 2890 { 2891 2892 INP_WLOCK_ASSERT(inp); 2893 return (inp->inp_socket); 2894 } 2895 2896 void 2897 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2898 uint32_t *faddr, uint16_t *fp) 2899 { 2900 2901 INP_LOCK_ASSERT(inp); 2902 *laddr = inp->inp_laddr.s_addr; 2903 *faddr = inp->inp_faddr.s_addr; 2904 *lp = inp->inp_lport; 2905 *fp = inp->inp_fport; 2906 } 2907 2908 /* 
2909 * Create an external-format (``xinpcb'') structure using the information in 2910 * the kernel-format in_pcb structure pointed to by inp. This is done to 2911 * reduce the spew of irrelevant information over this interface, to isolate 2912 * user code from changes in the kernel structure, and potentially to provide 2913 * information-hiding if we decide that some of this information should be 2914 * hidden from users. 2915 */ 2916 void 2917 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2918 { 2919 2920 bzero(xi, sizeof(*xi)); 2921 xi->xi_len = sizeof(struct xinpcb); 2922 if (inp->inp_socket) 2923 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2924 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2925 xi->inp_gencnt = inp->inp_gencnt; 2926 xi->inp_flow = inp->inp_flow; 2927 xi->inp_flowid = inp->inp_flowid; 2928 xi->inp_flowtype = inp->inp_flowtype; 2929 xi->inp_flags = inp->inp_flags; 2930 xi->inp_flags2 = inp->inp_flags2; 2931 xi->in6p_cksum = inp->in6p_cksum; 2932 xi->in6p_hops = inp->in6p_hops; 2933 xi->inp_ip_tos = inp->inp_ip_tos; 2934 xi->inp_vflag = inp->inp_vflag; 2935 xi->inp_ip_ttl = inp->inp_ip_ttl; 2936 xi->inp_ip_p = inp->inp_ip_p; 2937 xi->inp_ip_minttl = inp->inp_ip_minttl; 2938 } 2939 2940 int 2941 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2942 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2943 { 2944 struct sockopt sopt; 2945 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2946 INPLOOKUP_WLOCKPCB); 2947 struct inpcb *inp; 2948 struct sockopt_parameters *params; 2949 struct socket *so; 2950 int error; 2951 char buf[1024]; 2952 2953 if (req->oldptr != NULL || req->oldlen != 0) 2954 return (EINVAL); 2955 if (req->newptr == NULL) 2956 return (EPERM); 2957 if (req->newlen > sizeof(buf)) 2958 return (ENOMEM); 2959 error = SYSCTL_IN(req, buf, req->newlen); 2960 if (error != 0) 2961 return (error); 2962 if (req->newlen < sizeof(struct sockopt_parameters)) 2963 return (EINVAL); 2964 
params = (struct sockopt_parameters *)buf; 2965 sopt.sopt_level = params->sop_level; 2966 sopt.sopt_name = params->sop_optname; 2967 sopt.sopt_dir = SOPT_SET; 2968 sopt.sopt_val = params->sop_optval; 2969 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2970 sopt.sopt_td = NULL; 2971 #ifdef INET6 2972 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2973 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2974 params->sop_inc.inc6_laddr.s6_addr16[1] = 2975 htons(params->sop_inc.inc6_zoneid & 0xffff); 2976 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2977 params->sop_inc.inc6_faddr.s6_addr16[1] = 2978 htons(params->sop_inc.inc6_zoneid & 0xffff); 2979 } 2980 #endif 2981 if (params->sop_inc.inc_lport != htons(0) && 2982 params->sop_inc.inc_fport != htons(0)) { 2983 #ifdef INET6 2984 if (params->sop_inc.inc_flags & INC_ISIPV6) 2985 inpi.hash = INP6_PCBHASH( 2986 ¶ms->sop_inc.inc6_faddr, 2987 params->sop_inc.inc_lport, 2988 params->sop_inc.inc_fport, 2989 pcbinfo->ipi_hashmask); 2990 else 2991 #endif 2992 inpi.hash = INP_PCBHASH( 2993 ¶ms->sop_inc.inc_faddr, 2994 params->sop_inc.inc_lport, 2995 params->sop_inc.inc_fport, 2996 pcbinfo->ipi_hashmask); 2997 } 2998 while ((inp = inp_next(&inpi)) != NULL) 2999 if (inp->inp_gencnt == params->sop_id) { 3000 if (inp->inp_flags & INP_DROPPED) { 3001 INP_WUNLOCK(inp); 3002 return (ECONNRESET); 3003 } 3004 so = inp->inp_socket; 3005 KASSERT(so != NULL, ("inp_socket == NULL")); 3006 soref(so); 3007 if (params->sop_level == SOL_SOCKET) { 3008 INP_WUNLOCK(inp); 3009 error = sosetopt(so, &sopt); 3010 } else 3011 error = (*ctloutput_set)(inp, &sopt); 3012 sorele(so); 3013 break; 3014 } 3015 if (inp == NULL) 3016 error = ESRCH; 3017 return (error); 3018 } 3019 3020 #ifdef DDB 3021 static void 3022 db_print_indent(int indent) 3023 { 3024 int i; 3025 3026 for (i = 0; i < indent; i++) 3027 db_printf(" "); 3028 } 3029 3030 static void 3031 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 
3032 { 3033 char faddr_str[48], laddr_str[48]; 3034 3035 db_print_indent(indent); 3036 db_printf("%s at %p\n", name, inc); 3037 3038 indent += 2; 3039 3040 #ifdef INET6 3041 if (inc->inc_flags & INC_ISIPV6) { 3042 /* IPv6. */ 3043 ip6_sprintf(laddr_str, &inc->inc6_laddr); 3044 ip6_sprintf(faddr_str, &inc->inc6_faddr); 3045 } else 3046 #endif 3047 { 3048 /* IPv4. */ 3049 inet_ntoa_r(inc->inc_laddr, laddr_str); 3050 inet_ntoa_r(inc->inc_faddr, faddr_str); 3051 } 3052 db_print_indent(indent); 3053 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 3054 ntohs(inc->inc_lport)); 3055 db_print_indent(indent); 3056 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 3057 ntohs(inc->inc_fport)); 3058 } 3059 3060 void 3061 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3062 { 3063 3064 db_print_indent(indent); 3065 db_printf("%s at %p\n", name, inp); 3066 3067 indent += 2; 3068 3069 db_print_indent(indent); 3070 db_printf("inp_flow: 0x%x inp_label: %p\n", inp->inp_flow, 3071 inp->inp_label); 3072 3073 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3074 3075 db_print_indent(indent); 3076 db_printf("inp_flags: 0x%b\n", inp->inp_flags, INP_FLAGS_BITS); 3077 3078 db_print_indent(indent); 3079 db_printf("inp_flags2: 0x%b\n", inp->inp_flags2, INP_FLAGS2_BITS); 3080 3081 db_print_indent(indent); 3082 db_printf("inp_sp: %p inp_vflag: 0x%b\n", inp->inp_sp, 3083 inp->inp_vflag, INP_VFLAGS_BITS); 3084 3085 db_print_indent(indent); 3086 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3087 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3088 3089 #ifdef INET6 3090 if (inp->inp_vflag & INP_IPV6) { 3091 db_print_indent(indent); 3092 db_printf("in6p_options: %p in6p_outputopts: %p " 3093 "in6p_moptions: %p\n", inp->in6p_options, 3094 inp->in6p_outputopts, inp->in6p_moptions); 3095 db_print_indent(indent); 3096 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3097 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3098 inp->in6p_hops); 3099 
} else 3100 #endif 3101 { 3102 db_print_indent(indent); 3103 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3104 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3105 inp->inp_options, inp->inp_moptions); 3106 } 3107 3108 db_print_indent(indent); 3109 db_printf("inp_gencnt: %ju\n", (uintmax_t)inp->inp_gencnt); 3110 } 3111 3112 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3113 { 3114 struct inpcb *inp; 3115 3116 if (!have_addr) { 3117 db_printf("usage: show inpcb <addr>\n"); 3118 return; 3119 } 3120 inp = (struct inpcb *)addr; 3121 3122 db_print_inpcb(inp, "inpcb", 0); 3123 } 3124 #endif /* DDB */ 3125 3126 #ifdef RATELIMIT 3127 /* 3128 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3129 * if any. 3130 */ 3131 int 3132 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3133 { 3134 union if_snd_tag_modify_params params = { 3135 .rate_limit.max_rate = max_pacing_rate, 3136 .rate_limit.flags = M_NOWAIT, 3137 }; 3138 struct m_snd_tag *mst; 3139 int error; 3140 3141 mst = inp->inp_snd_tag; 3142 if (mst == NULL) 3143 return (EINVAL); 3144 3145 if (mst->sw->snd_tag_modify == NULL) { 3146 error = EOPNOTSUPP; 3147 } else { 3148 error = mst->sw->snd_tag_modify(mst, ¶ms); 3149 } 3150 return (error); 3151 } 3152 3153 /* 3154 * Query existing TX rate limit based on the existing 3155 * "inp->inp_snd_tag", if any. 3156 */ 3157 int 3158 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3159 { 3160 union if_snd_tag_query_params params = { }; 3161 struct m_snd_tag *mst; 3162 int error; 3163 3164 mst = inp->inp_snd_tag; 3165 if (mst == NULL) 3166 return (EINVAL); 3167 3168 if (mst->sw->snd_tag_query == NULL) { 3169 error = EOPNOTSUPP; 3170 } else { 3171 error = mst->sw->snd_tag_query(mst, ¶ms); 3172 if (error == 0 && p_max_pacing_rate != NULL) 3173 *p_max_pacing_rate = params.rate_limit.max_rate; 3174 } 3175 return (error); 3176 } 3177 3178 /* 3179 * Query existing TX queue level based on the existing 3180 * "inp->inp_snd_tag", if any. 
3181 */ 3182 int 3183 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3184 { 3185 union if_snd_tag_query_params params = { }; 3186 struct m_snd_tag *mst; 3187 int error; 3188 3189 mst = inp->inp_snd_tag; 3190 if (mst == NULL) 3191 return (EINVAL); 3192 3193 if (mst->sw->snd_tag_query == NULL) 3194 return (EOPNOTSUPP); 3195 3196 error = mst->sw->snd_tag_query(mst, ¶ms); 3197 if (error == 0 && p_txqueue_level != NULL) 3198 *p_txqueue_level = params.rate_limit.queue_level; 3199 return (error); 3200 } 3201 3202 /* 3203 * Allocate a new TX rate limit send tag from the network interface 3204 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3205 */ 3206 int 3207 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3208 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3209 3210 { 3211 union if_snd_tag_alloc_params params = { 3212 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3213 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3214 .rate_limit.hdr.flowid = flowid, 3215 .rate_limit.hdr.flowtype = flowtype, 3216 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3217 .rate_limit.max_rate = max_pacing_rate, 3218 .rate_limit.flags = M_NOWAIT, 3219 }; 3220 int error; 3221 3222 INP_WLOCK_ASSERT(inp); 3223 3224 /* 3225 * If there is already a send tag, or the INP is being torn 3226 * down, allocating a new send tag is not allowed. Else send 3227 * tags may leak. 
3228 */ 3229 if (*st != NULL || (inp->inp_flags & INP_DROPPED) != 0) 3230 return (EINVAL); 3231 3232 error = m_snd_tag_alloc(ifp, ¶ms, st); 3233 #ifdef INET 3234 if (error == 0) { 3235 counter_u64_add(rate_limit_set_ok, 1); 3236 counter_u64_add(rate_limit_active, 1); 3237 } else if (error != EOPNOTSUPP) 3238 counter_u64_add(rate_limit_alloc_fail, 1); 3239 #endif 3240 return (error); 3241 } 3242 3243 void 3244 in_pcbdetach_tag(struct m_snd_tag *mst) 3245 { 3246 3247 m_snd_tag_rele(mst); 3248 #ifdef INET 3249 counter_u64_add(rate_limit_active, -1); 3250 #endif 3251 } 3252 3253 /* 3254 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3255 * if any: 3256 */ 3257 void 3258 in_pcbdetach_txrtlmt(struct inpcb *inp) 3259 { 3260 struct m_snd_tag *mst; 3261 3262 INP_WLOCK_ASSERT(inp); 3263 3264 mst = inp->inp_snd_tag; 3265 inp->inp_snd_tag = NULL; 3266 3267 if (mst == NULL) 3268 return; 3269 3270 m_snd_tag_rele(mst); 3271 #ifdef INET 3272 counter_u64_add(rate_limit_active, -1); 3273 #endif 3274 } 3275 3276 int 3277 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3278 { 3279 int error; 3280 3281 /* 3282 * If the existing send tag is for the wrong interface due to 3283 * a route change, first drop the existing tag. Set the 3284 * CHANGED flag so that we will keep trying to allocate a new 3285 * tag if we fail to allocate one this time. 3286 */ 3287 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3288 in_pcbdetach_txrtlmt(inp); 3289 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3290 } 3291 3292 /* 3293 * NOTE: When attaching to a network interface a reference is 3294 * made to ensure the network interface doesn't go away until 3295 * all ratelimit connections are gone. The network interface 3296 * pointers compared below represent valid network interfaces, 3297 * except when comparing towards NULL. 
3298 */ 3299 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3300 error = 0; 3301 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3302 if (inp->inp_snd_tag != NULL) 3303 in_pcbdetach_txrtlmt(inp); 3304 error = 0; 3305 } else if (inp->inp_snd_tag == NULL) { 3306 /* 3307 * In order to utilize packet pacing with RSS, we need 3308 * to wait until there is a valid RSS hash before we 3309 * can proceed: 3310 */ 3311 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3312 error = EAGAIN; 3313 } else { 3314 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3315 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3316 } 3317 } else { 3318 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3319 } 3320 if (error == 0 || error == EOPNOTSUPP) 3321 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3322 3323 return (error); 3324 } 3325 3326 /* 3327 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3328 * is set in the fast path and will attach/detach/modify the TX rate 3329 * limit send tag based on the socket's so_max_pacing_rate value. 3330 */ 3331 void 3332 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3333 { 3334 struct socket *socket; 3335 uint32_t max_pacing_rate; 3336 bool did_upgrade; 3337 3338 if (inp == NULL) 3339 return; 3340 3341 socket = inp->inp_socket; 3342 if (socket == NULL) 3343 return; 3344 3345 if (!INP_WLOCKED(inp)) { 3346 /* 3347 * NOTE: If the write locking fails, we need to bail 3348 * out and use the non-ratelimited ring for the 3349 * transmit until there is a new chance to get the 3350 * write lock. 3351 */ 3352 if (!INP_TRY_UPGRADE(inp)) 3353 return; 3354 did_upgrade = 1; 3355 } else { 3356 did_upgrade = 0; 3357 } 3358 3359 /* 3360 * NOTE: The so_max_pacing_rate value is read unlocked, 3361 * because atomic updates are not required since the variable 3362 * is checked at every mbuf we send. It is assumed that the 3363 * variable read itself will be atomic. 
3364 */ 3365 max_pacing_rate = socket->so_max_pacing_rate; 3366 3367 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3368 3369 if (did_upgrade) 3370 INP_DOWNGRADE(inp); 3371 } 3372 3373 /* 3374 * Track route changes for TX rate limiting. 3375 */ 3376 void 3377 in_pcboutput_eagain(struct inpcb *inp) 3378 { 3379 bool did_upgrade; 3380 3381 if (inp == NULL) 3382 return; 3383 3384 if (inp->inp_snd_tag == NULL) 3385 return; 3386 3387 if (!INP_WLOCKED(inp)) { 3388 /* 3389 * NOTE: If the write locking fails, we need to bail 3390 * out and use the non-ratelimited ring for the 3391 * transmit until there is a new chance to get the 3392 * write lock. 3393 */ 3394 if (!INP_TRY_UPGRADE(inp)) 3395 return; 3396 did_upgrade = 1; 3397 } else { 3398 did_upgrade = 0; 3399 } 3400 3401 /* detach rate limiting */ 3402 in_pcbdetach_txrtlmt(inp); 3403 3404 /* make sure new mbuf send tag allocation is made */ 3405 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3406 3407 if (did_upgrade) 3408 INP_DOWNGRADE(inp); 3409 } 3410 3411 #ifdef INET 3412 static void 3413 rl_init(void *st) 3414 { 3415 rate_limit_new = counter_u64_alloc(M_WAITOK); 3416 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3417 rate_limit_active = counter_u64_alloc(M_WAITOK); 3418 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3419 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3420 } 3421 3422 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3423 #endif 3424 #endif /* RATELIMIT */ 3425