1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_route.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/hash.h> 53 #include <sys/systm.h> 54 #include <sys/libkern.h> 55 #include <sys/lock.h> 56 #include <sys/malloc.h> 57 #include <sys/mbuf.h> 58 #include <sys/callout.h> 59 #include <sys/eventhandler.h> 60 #include <sys/domain.h> 61 #include <sys/protosw.h> 62 #include <sys/smp.h> 63 #include <sys/socket.h> 64 #include <sys/socketvar.h> 65 #include <sys/sockio.h> 66 #include <sys/priv.h> 67 #include <sys/proc.h> 68 #include <sys/refcount.h> 69 #include <sys/jail.h> 70 #include <sys/kernel.h> 71 #include <sys/sysctl.h> 72 73 #ifdef DDB 74 #include <ddb/ddb.h> 75 #endif 76 77 #include <vm/uma.h> 78 #include <vm/vm.h> 79 80 #include <net/if.h> 81 #include <net/if_var.h> 82 #include <net/if_types.h> 83 #include <net/if_llatbl.h> 84 #include <net/route.h> 85 #include <net/rss_config.h> 86 #include <net/vnet.h> 87 88 #if defined(INET) || defined(INET6) 89 #include <netinet/in.h> 90 #include <netinet/in_pcb.h> 91 #include <netinet/in_pcb_var.h> 92 #ifdef INET 93 #include <netinet/in_var.h> 94 #include <netinet/in_fib.h> 95 #endif 96 #include <netinet/ip_var.h> 97 #include <netinet/tcp_var.h> 98 #ifdef TCPHPTS 99 #include <netinet/tcp_hpts.h> 100 #endif 101 #include <netinet/udp.h> 102 #include <netinet/udp_var.h> 103 #ifdef INET6 104 #include <netinet/ip6.h> 105 #include <netinet6/in6_pcb.h> 106 #include <netinet6/in6_var.h> 107 #include <netinet6/ip6_var.h> 108 #endif /* INET6 */ 109 #include <net/route/nhop.h> 110 #endif 111 112 #include <netipsec/ipsec_support.h> 113 114 #include <security/mac/mac_framework.h> 115 116 #define INPCBLBGROUP_SIZMIN 8 117 #define INPCBLBGROUP_SIZMAX 256 118 #define INP_FREED 0x00000200 /* See in_pcb.h. */ 119 120 static struct callout ipport_tick_callout; 121 122 /* 123 * These configure the range of local port addresses assigned to 124 * "unspecified" outgoing connections/packets/whatever. 125 */ 126 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 127 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 128 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 129 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 130 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 131 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 132 133 /* 134 * Reserved ports accessible only to root. There are significant 135 * security considerations that must be accounted for when changing these, 136 * but the security benefits can be great. Please be careful. 137 */ 138 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 139 VNET_DEFINE(int, ipport_reservedlow); 140 141 /* Variables dealing with random ephemeral port allocation. */ 142 VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ 143 VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ 144 VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ 145 VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ 146 VNET_DEFINE(int, ipport_tcpallocs); 147 VNET_DEFINE_STATIC(int, ipport_tcplastcount); 148 149 #define V_ipport_tcplastcount VNET(ipport_tcplastcount) 150 151 #ifdef INET 152 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 153 struct in_addr faddr, u_int fport_arg, 154 struct in_addr laddr, u_int lport_arg, 155 int lookupflags, struct ifnet *ifp, 156 uint8_t numa_domain); 157 158 #define RANGECHK(var, min, max) \ 159 if ((var) < (min)) { (var) = (min); } \ 160 else if ((var) > (max)) { (var) = (max); } 161 162 static int 163 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 164 { 165 int error; 166 167 error = sysctl_handle_int(oidp, arg1, arg2, req); 168 if (error == 0) { 169 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 170 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 171 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 172 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 173 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 174 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 175 } 176 return (error); 177 } 178 179 #undef RANGECHK 180 181 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 182 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 183 "IP Ports"); 184 185 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 186 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 187 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 188 ""); 189 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 190 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 191 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 192 ""); 193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 194 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 195 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 196 ""); 197 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 198 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 199 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 200 ""); 201 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 202 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 203 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 204 ""); 205 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 206 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 207 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 208 ""); 209 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 210 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 211 &VNET_NAME(ipport_reservedhigh), 0, ""); 212 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 213 CTLFLAG_RW|CTLFLAG_SECURE, &VNET_NAME(ipport_reservedlow), 0, ""); 214 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 215 CTLFLAG_VNET | CTLFLAG_RW, 216 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 217 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, 218 CTLFLAG_VNET | CTLFLAG_RW, 219 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " 220 "allocations before switching to a sequential one"); 221 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, 222 CTLFLAG_VNET | CTLFLAG_RW, 223 &VNET_NAME(ipport_randomtime), 0, 224 "Minimum time to keep sequential port " 225 "allocation before switching to a random one"); 226 227 #ifdef RATELIMIT 228 counter_u64_t rate_limit_new; 229 counter_u64_t rate_limit_chg; 230 counter_u64_t rate_limit_active; 231 counter_u64_t rate_limit_alloc_fail; 232 counter_u64_t rate_limit_set_ok; 233 234 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 235 "IP Rate Limiting"); 236 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 237 &rate_limit_active, "Active rate limited connections"); 238 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 239 &rate_limit_alloc_fail, "Rate limited connection failures"); 240 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 241 &rate_limit_set_ok, "Rate limited setting succeeded"); 242 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, newrl, CTLFLAG_RD, 243 &rate_limit_new, "Total Rate limit new attempts"); 244 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, chgrl, CTLFLAG_RD, 245 &rate_limit_chg, "Total Rate limited change attempts"); 246 247 #endif /* RATELIMIT */ 248 249 #endif /* INET */ 250 251 VNET_DEFINE(uint32_t, in_pcbhashseed); 252 static void 253 in_pcbhashseed_init(void) 254 { 255 256 V_in_pcbhashseed = arc4random(); 257 } 258 VNET_SYSINIT(in_pcbhashseed_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, 259 in_pcbhashseed_init, 0); 260 261 /* 262 * in_pcb.c: manage the Protocol Control Blocks. 263 * 264 * NOTE: It is assumed that most of these functions will be called with 265 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 266 * functions often modify hash chains or addresses in pcbs. 267 */ 268 269 static struct inpcblbgroup * 270 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, 271 uint16_t port, const union in_dependaddr *addr, int size, 272 uint8_t numa_domain) 273 { 274 struct inpcblbgroup *grp; 275 size_t bytes; 276 277 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 278 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 279 if (!grp) 280 return (NULL); 281 grp->il_vflag = vflag; 282 grp->il_lport = port; 283 grp->il_numa_domain = numa_domain; 284 grp->il_dependladdr = *addr; 285 grp->il_inpsiz = size; 286 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 287 return (grp); 288 } 289 290 static void 291 in_pcblbgroup_free_deferred(epoch_context_t ctx) 292 { 293 struct inpcblbgroup *grp; 294 295 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 296 free(grp, M_PCB); 297 } 298 299 static void 300 in_pcblbgroup_free(struct inpcblbgroup *grp) 301 { 302 303 CK_LIST_REMOVE(grp, il_list); 304 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 305 } 306 307 static struct inpcblbgroup * 308 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 309 struct inpcblbgroup *old_grp, int size) 310 { 311 struct inpcblbgroup *grp; 312 int i; 313 314 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, 315 old_grp->il_lport, &old_grp->il_dependladdr, size, 316 old_grp->il_numa_domain); 317 if (grp == NULL) 318 return (NULL); 319 320 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 321 ("invalid new local group size %d and old local group count %d", 322 grp->il_inpsiz, old_grp->il_inpcnt)); 323 324 for (i = 0; i < old_grp->il_inpcnt; ++i) 325 grp->il_inp[i] = old_grp->il_inp[i]; 326 grp->il_inpcnt = old_grp->il_inpcnt; 327 in_pcblbgroup_free(old_grp); 328 return (grp); 329 } 330 331 /* 332 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 333 * and shrink group if possible. 334 */ 335 static void 336 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 337 int i) 338 { 339 struct inpcblbgroup *grp, *new_grp; 340 341 grp = *grpp; 342 for (; i + 1 < grp->il_inpcnt; ++i) 343 grp->il_inp[i] = grp->il_inp[i + 1]; 344 grp->il_inpcnt--; 345 346 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 347 grp->il_inpcnt <= grp->il_inpsiz / 4) { 348 /* Shrink this group. */ 349 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 350 if (new_grp != NULL) 351 *grpp = new_grp; 352 } 353 } 354 355 /* 356 * Add PCB to load balance group for SO_REUSEPORT_LB option. 357 */ 358 static int 359 in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) 360 { 361 const static struct timeval interval = { 60, 0 }; 362 static struct timeval lastprint; 363 struct inpcbinfo *pcbinfo; 364 struct inpcblbgrouphead *hdr; 365 struct inpcblbgroup *grp; 366 uint32_t idx; 367 368 pcbinfo = inp->inp_pcbinfo; 369 370 INP_WLOCK_ASSERT(inp); 371 INP_HASH_WLOCK_ASSERT(pcbinfo); 372 373 /* 374 * Don't allow jailed socket to join local group. 375 */ 376 if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) 377 return (0); 378 379 #ifdef INET6 380 /* 381 * Don't allow IPv4 mapped INET6 wild socket. 382 */ 383 if ((inp->inp_vflag & INP_IPV4) && 384 inp->inp_laddr.s_addr == INADDR_ANY && 385 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 386 return (0); 387 } 388 #endif 389 390 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 391 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 392 CK_LIST_FOREACH(grp, hdr, il_list) { 393 if (grp->il_vflag == inp->inp_vflag && 394 grp->il_lport == inp->inp_lport && 395 grp->il_numa_domain == numa_domain && 396 memcmp(&grp->il_dependladdr, 397 &inp->inp_inc.inc_ie.ie_dependladdr, 398 sizeof(grp->il_dependladdr)) == 0) 399 break; 400 } 401 if (grp == NULL) { 402 /* Create new load balance group. */ 403 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, 404 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 405 INPCBLBGROUP_SIZMIN, numa_domain); 406 if (grp == NULL) 407 return (ENOBUFS); 408 } else if (grp->il_inpcnt == grp->il_inpsiz) { 409 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 410 if (ratecheck(&lastprint, &interval)) 411 printf("lb group port %d, limit reached\n", 412 ntohs(grp->il_lport)); 413 return (0); 414 } 415 416 /* Expand this local group. */ 417 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 418 if (grp == NULL) 419 return (ENOBUFS); 420 } 421 422 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 423 ("invalid local group size %d and count %d", grp->il_inpsiz, 424 grp->il_inpcnt)); 425 426 grp->il_inp[grp->il_inpcnt] = inp; 427 grp->il_inpcnt++; 428 return (0); 429 } 430 431 /* 432 * Remove PCB from load balance group. 433 */ 434 static void 435 in_pcbremlbgrouphash(struct inpcb *inp) 436 { 437 struct inpcbinfo *pcbinfo; 438 struct inpcblbgrouphead *hdr; 439 struct inpcblbgroup *grp; 440 int i; 441 442 pcbinfo = inp->inp_pcbinfo; 443 444 INP_WLOCK_ASSERT(inp); 445 INP_HASH_WLOCK_ASSERT(pcbinfo); 446 447 hdr = &pcbinfo->ipi_lbgrouphashbase[ 448 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 449 CK_LIST_FOREACH(grp, hdr, il_list) { 450 for (i = 0; i < grp->il_inpcnt; ++i) { 451 if (grp->il_inp[i] != inp) 452 continue; 453 454 if (grp->il_inpcnt == 1) { 455 /* We are the last, free this local group. */ 456 in_pcblbgroup_free(grp); 457 } else { 458 /* Pull up inpcbs, shrink group if possible. */ 459 in_pcblbgroup_reorder(hdr, &grp, i); 460 } 461 return; 462 } 463 } 464 } 465 466 int 467 in_pcblbgroup_numa(struct inpcb *inp, int arg) 468 { 469 struct inpcbinfo *pcbinfo; 470 struct inpcblbgrouphead *hdr; 471 struct inpcblbgroup *grp; 472 int err, i; 473 uint8_t numa_domain; 474 475 switch (arg) { 476 case TCP_REUSPORT_LB_NUMA_NODOM: 477 numa_domain = M_NODOM; 478 break; 479 case TCP_REUSPORT_LB_NUMA_CURDOM: 480 numa_domain = PCPU_GET(domain); 481 break; 482 default: 483 if (arg < 0 || arg >= vm_ndomains) 484 return (EINVAL); 485 numa_domain = arg; 486 } 487 488 err = 0; 489 pcbinfo = inp->inp_pcbinfo; 490 INP_WLOCK_ASSERT(inp); 491 INP_HASH_WLOCK(pcbinfo); 492 hdr = &pcbinfo->ipi_lbgrouphashbase[ 493 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 494 CK_LIST_FOREACH(grp, hdr, il_list) { 495 for (i = 0; i < grp->il_inpcnt; ++i) { 496 if (grp->il_inp[i] != inp) 497 continue; 498 499 if (grp->il_numa_domain == numa_domain) { 500 goto abort_with_hash_wlock; 501 } 502 503 /* Remove it from the old group. */ 504 in_pcbremlbgrouphash(inp); 505 506 /* Add it to the new group based on numa domain. */ 507 in_pcbinslbgrouphash(inp, numa_domain); 508 goto abort_with_hash_wlock; 509 } 510 } 511 err = ENOENT; 512 abort_with_hash_wlock: 513 INP_HASH_WUNLOCK(pcbinfo); 514 return (err); 515 } 516 517 /* Make sure it is safe to use hashinit(9) on CK_LIST. */ 518 CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); 519 520 /* 521 * Initialize an inpcbinfo - a per-VNET instance of connections db. 522 */ 523 void 524 in_pcbinfo_init(struct inpcbinfo *pcbinfo, struct inpcbstorage *pcbstor, 525 u_int hash_nelements, u_int porthash_nelements) 526 { 527 528 mtx_init(&pcbinfo->ipi_lock, pcbstor->ips_infolock_name, NULL, MTX_DEF); 529 mtx_init(&pcbinfo->ipi_hash_lock, pcbstor->ips_hashlock_name, 530 NULL, MTX_DEF); 531 #ifdef VIMAGE 532 pcbinfo->ipi_vnet = curvnet; 533 #endif 534 CK_LIST_INIT(&pcbinfo->ipi_listhead); 535 pcbinfo->ipi_count = 0; 536 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 537 &pcbinfo->ipi_hashmask); 538 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 539 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 540 &pcbinfo->ipi_porthashmask); 541 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 542 &pcbinfo->ipi_lbgrouphashmask); 543 pcbinfo->ipi_zone = pcbstor->ips_zone; 544 pcbinfo->ipi_portzone = pcbstor->ips_portzone; 545 pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); 546 } 547 548 /* 549 * Destroy an inpcbinfo. 550 */ 551 void 552 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 553 { 554 555 KASSERT(pcbinfo->ipi_count == 0, 556 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 557 558 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 559 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 560 pcbinfo->ipi_porthashmask); 561 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 562 pcbinfo->ipi_lbgrouphashmask); 563 mtx_destroy(&pcbinfo->ipi_hash_lock); 564 mtx_destroy(&pcbinfo->ipi_lock); 565 } 566 567 /* 568 * Initialize a pcbstorage - per protocol zones to allocate inpcbs. 569 */ 570 static void inpcb_dtor(void *, int, void *); 571 static void inpcb_fini(void *, int); 572 void 573 in_pcbstorage_init(void *arg) 574 { 575 struct inpcbstorage *pcbstor = arg; 576 577 pcbstor->ips_zone = uma_zcreate(pcbstor->ips_zone_name, 578 sizeof(struct inpcb), NULL, inpcb_dtor, pcbstor->ips_pcbinit, 579 inpcb_fini, UMA_ALIGN_PTR, UMA_ZONE_SMR); 580 pcbstor->ips_portzone = uma_zcreate(pcbstor->ips_portzone_name, 581 sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); 582 uma_zone_set_smr(pcbstor->ips_portzone, 583 uma_zone_get_smr(pcbstor->ips_zone)); 584 } 585 586 /* 587 * Destroy a pcbstorage - used by unloadable protocols. 588 */ 589 void 590 in_pcbstorage_destroy(void *arg) 591 { 592 struct inpcbstorage *pcbstor = arg; 593 594 uma_zdestroy(pcbstor->ips_zone); 595 uma_zdestroy(pcbstor->ips_portzone); 596 } 597 598 /* 599 * Allocate a PCB and associate it with the socket. 600 * On success return with the PCB locked. 601 */ 602 int 603 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 604 { 605 struct inpcb *inp; 606 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 607 int error; 608 #endif 609 610 inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); 611 if (inp == NULL) 612 return (ENOBUFS); 613 bzero(&inp->inp_start_zero, inp_zero_size); 614 #ifdef NUMA 615 inp->inp_numa_domain = M_NODOM; 616 #endif 617 inp->inp_pcbinfo = pcbinfo; 618 inp->inp_socket = so; 619 inp->inp_cred = crhold(so->so_cred); 620 inp->inp_inc.inc_fibnum = so->so_fibnum; 621 #ifdef MAC 622 error = mac_inpcb_init(inp, M_NOWAIT); 623 if (error != 0) 624 goto out; 625 mac_inpcb_create(so, inp); 626 #endif 627 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 628 error = ipsec_init_pcbpolicy(inp); 629 if (error != 0) { 630 #ifdef MAC 631 mac_inpcb_destroy(inp); 632 #endif 633 goto out; 634 } 635 #endif /*IPSEC*/ 636 #ifdef INET6 637 if (INP_SOCKAF(so) == AF_INET6) { 638 inp->inp_vflag |= INP_IPV6PROTO; 639 if (V_ip6_v6only) 640 inp->inp_flags |= IN6P_IPV6_V6ONLY; 641 } 642 if (V_ip6_auto_flowlabel) 643 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 644 #endif 645 /* 646 * Routes in inpcb's can cache L2 as well; they are guaranteed 647 * to be cleaned up. 648 */ 649 inp->inp_route.ro_flags = RT_LLE_CACHE; 650 #ifdef TCPHPTS 651 /* 652 * If using hpts lets drop a random number in so 653 * not all new connections fall on the same CPU. 654 */ 655 inp->inp_hpts_cpu = hpts_random_cpu(inp); 656 #endif 657 refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ 658 INP_WLOCK(inp); 659 INP_INFO_WLOCK(pcbinfo); 660 pcbinfo->ipi_count++; 661 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 662 CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); 663 INP_INFO_WUNLOCK(pcbinfo); 664 so->so_pcb = inp; 665 666 return (0); 667 668 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 669 out: 670 uma_zfree_smr(pcbinfo->ipi_zone, inp); 671 return (error); 672 #endif 673 } 674 675 #ifdef INET 676 int 677 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 678 { 679 int anonport, error; 680 681 KASSERT(nam == NULL || nam->sa_family == AF_INET, 682 ("%s: invalid address family for %p", __func__, nam)); 683 KASSERT(nam == NULL || nam->sa_len == sizeof(struct sockaddr_in), 684 ("%s: invalid address length for %p", __func__, nam)); 685 INP_WLOCK_ASSERT(inp); 686 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 687 688 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 689 return (EINVAL); 690 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 691 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 692 &inp->inp_lport, cred); 693 if (error) 694 return (error); 695 if (in_pcbinshash(inp) != 0) { 696 inp->inp_laddr.s_addr = INADDR_ANY; 697 inp->inp_lport = 0; 698 return (EAGAIN); 699 } 700 if (anonport) 701 inp->inp_flags |= INP_ANONPORT; 702 return (0); 703 } 704 #endif 705 706 #if defined(INET) || defined(INET6) 707 /* 708 * Assign a local port like in_pcb_lport(), but also used with connect() 709 * and a foreign address and port. If fsa is non-NULL, choose a local port 710 * that is unused with those, otherwise one that is completely unused. 711 * lsa can be NULL for IPv6. 712 */ 713 int 714 in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, u_short *lportp, 715 struct sockaddr *fsa, u_short fport, struct ucred *cred, int lookupflags) 716 { 717 struct inpcbinfo *pcbinfo; 718 struct inpcb *tmpinp; 719 unsigned short *lastport; 720 int count, dorandom, error; 721 u_short aux, first, last, lport; 722 #ifdef INET 723 struct in_addr laddr, faddr; 724 #endif 725 #ifdef INET6 726 struct in6_addr *laddr6, *faddr6; 727 #endif 728 729 pcbinfo = inp->inp_pcbinfo; 730 731 /* 732 * Because no actual state changes occur here, a global write lock on 733 * the pcbinfo isn't required. 734 */ 735 INP_LOCK_ASSERT(inp); 736 INP_HASH_LOCK_ASSERT(pcbinfo); 737 738 if (inp->inp_flags & INP_HIGHPORT) { 739 first = V_ipport_hifirstauto; /* sysctl */ 740 last = V_ipport_hilastauto; 741 lastport = &pcbinfo->ipi_lasthi; 742 } else if (inp->inp_flags & INP_LOWPORT) { 743 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 744 if (error) 745 return (error); 746 first = V_ipport_lowfirstauto; /* 1023 */ 747 last = V_ipport_lowlastauto; /* 600 */ 748 lastport = &pcbinfo->ipi_lastlow; 749 } else { 750 first = V_ipport_firstauto; /* sysctl */ 751 last = V_ipport_lastauto; 752 lastport = &pcbinfo->ipi_lastport; 753 } 754 /* 755 * For UDP(-Lite), use random port allocation as long as the user 756 * allows it. For TCP (and as of yet unknown) connections, 757 * use random port allocation only if the user allows it AND 758 * ipport_tick() allows it. 759 */ 760 if (V_ipport_randomized && 761 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || 762 pcbinfo == &V_ulitecbinfo)) 763 dorandom = 1; 764 else 765 dorandom = 0; 766 /* 767 * It makes no sense to do random port allocation if 768 * we have the only port available. 769 */ 770 if (first == last) 771 dorandom = 0; 772 /* Make sure to not include UDP(-Lite) packets in the count. */ 773 if (pcbinfo != &V_udbinfo && pcbinfo != &V_ulitecbinfo) 774 V_ipport_tcpallocs++; 775 /* 776 * Instead of having two loops further down counting up or down 777 * make sure that first is always <= last and go with only one 778 * code path implementing all logic. 779 */ 780 if (first > last) { 781 aux = first; 782 first = last; 783 last = aux; 784 } 785 786 #ifdef INET 787 laddr.s_addr = INADDR_ANY; /* used by INET6+INET below too */ 788 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 789 if (lsa != NULL) 790 laddr = ((struct sockaddr_in *)lsa)->sin_addr; 791 if (fsa != NULL) 792 faddr = ((struct sockaddr_in *)fsa)->sin_addr; 793 } 794 #endif 795 #ifdef INET6 796 laddr6 = NULL; 797 if ((inp->inp_vflag & INP_IPV6) != 0) { 798 if (lsa != NULL) 799 laddr6 = &((struct sockaddr_in6 *)lsa)->sin6_addr; 800 if (fsa != NULL) 801 faddr6 = &((struct sockaddr_in6 *)fsa)->sin6_addr; 802 } 803 #endif 804 805 tmpinp = NULL; 806 lport = *lportp; 807 808 if (dorandom) 809 *lastport = first + (arc4random() % (last - first)); 810 811 count = last - first; 812 813 do { 814 if (count-- < 0) /* completely used? */ 815 return (EADDRNOTAVAIL); 816 ++*lastport; 817 if (*lastport < first || *lastport > last) 818 *lastport = first; 819 lport = htons(*lastport); 820 821 if (fsa != NULL) { 822 #ifdef INET 823 if (lsa->sa_family == AF_INET) { 824 tmpinp = in_pcblookup_hash_locked(pcbinfo, 825 faddr, fport, laddr, lport, lookupflags, 826 NULL, M_NODOM); 827 } 828 #endif 829 #ifdef INET6 830 if (lsa->sa_family == AF_INET6) { 831 tmpinp = in6_pcblookup_hash_locked(pcbinfo, 832 faddr6, fport, laddr6, lport, lookupflags, 833 NULL, M_NODOM); 834 } 835 #endif 836 } else { 837 #ifdef INET6 838 if ((inp->inp_vflag & INP_IPV6) != 0) { 839 tmpinp = in6_pcblookup_local(pcbinfo, 840 &inp->in6p_laddr, lport, lookupflags, cred); 841 #ifdef INET 842 if (tmpinp == NULL && 843 (inp->inp_vflag & INP_IPV4)) 844 tmpinp = in_pcblookup_local(pcbinfo, 845 laddr, lport, lookupflags, cred); 846 #endif 847 } 848 #endif 849 #if defined(INET) && defined(INET6) 850 else 851 #endif 852 #ifdef INET 853 tmpinp = in_pcblookup_local(pcbinfo, laddr, 854 lport, lookupflags, cred); 855 #endif 856 } 857 } while (tmpinp != NULL); 858 859 *lportp = lport; 860 861 return (0); 862 } 863 864 /* 865 * Select a local port (number) to use. 866 */ 867 int 868 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 869 struct ucred *cred, int lookupflags) 870 { 871 struct sockaddr_in laddr; 872 873 if (laddrp) { 874 bzero(&laddr, sizeof(laddr)); 875 laddr.sin_family = AF_INET; 876 laddr.sin_addr = *laddrp; 877 } 878 return (in_pcb_lport_dest(inp, laddrp ? (struct sockaddr *) &laddr : 879 NULL, lportp, NULL, 0, cred, lookupflags)); 880 } 881 882 /* 883 * Return cached socket options. 884 */ 885 int 886 inp_so_options(const struct inpcb *inp) 887 { 888 int so_options; 889 890 so_options = 0; 891 892 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 893 so_options |= SO_REUSEPORT_LB; 894 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 895 so_options |= SO_REUSEPORT; 896 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 897 so_options |= SO_REUSEADDR; 898 return (so_options); 899 } 900 #endif /* INET || INET6 */ 901 902 /* 903 * Check if a new BINDMULTI socket is allowed to be created. 904 * 905 * ni points to the new inp. 906 * oi points to the existing inp. 907 * 908 * This checks whether the existing inp also has BINDMULTI and 909 * whether the credentials match. 910 */ 911 int 912 in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi) 913 { 914 /* Check permissions match */ 915 if ((ni->inp_flags2 & INP_BINDMULTI) && 916 (ni->inp_cred->cr_uid != 917 oi->inp_cred->cr_uid)) 918 return (0); 919 920 /* Check the existing inp has BINDMULTI set */ 921 if ((ni->inp_flags2 & INP_BINDMULTI) && 922 ((oi->inp_flags2 & INP_BINDMULTI) == 0)) 923 return (0); 924 925 /* 926 * We're okay - either INP_BINDMULTI isn't set on ni, or 927 * it is and it matches the checks. 928 */ 929 return (1); 930 } 931 932 #ifdef INET 933 /* 934 * Set up a bind operation on a PCB, performing port allocation 935 * as required, but do not actually modify the PCB. Callers can 936 * either complete the bind by setting inp_laddr/inp_lport and 937 * calling in_pcbinshash(), or they can just use the resulting 938 * port and address to authorise the sending of a once-off packet. 939 * 940 * On error, the values of *laddrp and *lportp are not changed. 941 */ 942 int 943 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 944 u_short *lportp, struct ucred *cred) 945 { 946 struct socket *so = inp->inp_socket; 947 struct sockaddr_in *sin; 948 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 949 struct in_addr laddr; 950 u_short lport = 0; 951 int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT); 952 int error; 953 954 /* 955 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here 956 * so that we don't have to add to the (already messy) code below. 957 */ 958 int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); 959 960 /* 961 * No state changes, so read locks are sufficient here. 962 */ 963 INP_LOCK_ASSERT(inp); 964 INP_HASH_LOCK_ASSERT(pcbinfo); 965 966 laddr.s_addr = *laddrp; 967 if (nam != NULL && laddr.s_addr != INADDR_ANY) 968 return (EINVAL); 969 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) 970 lookupflags = INPLOOKUP_WILDCARD; 971 if (nam == NULL) { 972 if ((error = prison_local_ip4(cred, &laddr)) != 0) 973 return (error); 974 } else { 975 sin = (struct sockaddr_in *)nam; 976 KASSERT(sin->sin_family == AF_INET, 977 ("%s: invalid family for address %p", __func__, sin)); 978 KASSERT(sin->sin_len == sizeof(*sin), 979 ("%s: invalid length for address %p", __func__, sin)); 980 981 error = prison_local_ip4(cred, &sin->sin_addr); 982 if (error) 983 return (error); 984 if (sin->sin_port != *lportp) { 985 /* Don't allow the port to change. */ 986 if (*lportp != 0) 987 return (EINVAL); 988 lport = sin->sin_port; 989 } 990 /* NB: lport is left as 0 if the port isn't being changed. */ 991 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 992 /* 993 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 994 * allow complete duplication of binding if 995 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 996 * and a multicast address is bound on both 997 * new and duplicated sockets. 998 */ 999 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) 1000 reuseport = SO_REUSEADDR|SO_REUSEPORT; 1001 /* 1002 * XXX: How to deal with SO_REUSEPORT_LB here? 1003 * Treat same as SO_REUSEPORT for now. 1004 */ 1005 if ((so->so_options & 1006 (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) 1007 reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; 1008 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 1009 sin->sin_port = 0; /* yech... */ 1010 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 1011 /* 1012 * Is the address a local IP address? 1013 * If INP_BINDANY is set, then the socket may be bound 1014 * to any endpoint address, local or not. 1015 */ 1016 if ((inp->inp_flags & INP_BINDANY) == 0 && 1017 ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) 1018 return (EADDRNOTAVAIL); 1019 } 1020 laddr = sin->sin_addr; 1021 if (lport) { 1022 struct inpcb *t; 1023 struct tcptw *tw; 1024 1025 /* GROSS */ 1026 if (ntohs(lport) <= V_ipport_reservedhigh && 1027 ntohs(lport) >= V_ipport_reservedlow && 1028 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT)) 1029 return (EACCES); 1030 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 1031 priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) { 1032 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1033 lport, INPLOOKUP_WILDCARD, cred); 1034 /* 1035 * XXX 1036 * This entire block sorely needs a rewrite. 1037 */ 1038 if (t && 1039 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1040 ((t->inp_flags & INP_TIMEWAIT) == 0) && 1041 (so->so_type != SOCK_STREAM || 1042 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 1043 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 1044 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 1045 (t->inp_flags2 & INP_REUSEPORT) || 1046 (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && 1047 (inp->inp_cred->cr_uid != 1048 t->inp_cred->cr_uid)) 1049 return (EADDRINUSE); 1050 1051 /* 1052 * If the socket is a BINDMULTI socket, then 1053 * the credentials need to match and the 1054 * original socket also has to have been bound 1055 * with BINDMULTI. 1056 */ 1057 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1058 return (EADDRINUSE); 1059 } 1060 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 1061 lport, lookupflags, cred); 1062 if (t && (t->inp_flags & INP_TIMEWAIT)) { 1063 /* 1064 * XXXRW: If an incpb has had its timewait 1065 * state recycled, we treat the address as 1066 * being in use (for now). This is better 1067 * than a panic, but not desirable. 1068 */ 1069 tw = intotw(t); 1070 if (tw == NULL || 1071 ((reuseport & tw->tw_so_options) == 0 && 1072 (reuseport_lb & 1073 tw->tw_so_options) == 0)) { 1074 return (EADDRINUSE); 1075 } 1076 } else if (t && 1077 ((inp->inp_flags2 & INP_BINDMULTI) == 0) && 1078 (reuseport & inp_so_options(t)) == 0 && 1079 (reuseport_lb & inp_so_options(t)) == 0) { 1080 #ifdef INET6 1081 if (ntohl(sin->sin_addr.s_addr) != 1082 INADDR_ANY || 1083 ntohl(t->inp_laddr.s_addr) != 1084 INADDR_ANY || 1085 (inp->inp_vflag & INP_IPV6PROTO) == 0 || 1086 (t->inp_vflag & INP_IPV6PROTO) == 0) 1087 #endif 1088 return (EADDRINUSE); 1089 if (t && (! in_pcbbind_check_bindmulti(inp, t))) 1090 return (EADDRINUSE); 1091 } 1092 } 1093 } 1094 if (*lportp != 0) 1095 lport = *lportp; 1096 if (lport == 0) { 1097 error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags); 1098 if (error != 0) 1099 return (error); 1100 } 1101 *laddrp = laddr.s_addr; 1102 *lportp = lport; 1103 return (0); 1104 } 1105 1106 /* 1107 * Connect from a socket to a specified address. 1108 * Both address and port must be specified in argument sin. 1109 * If don't have a local address for this socket yet, 1110 * then pick one. 1111 */ 1112 int 1113 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred, 1114 bool rehash) 1115 { 1116 u_short lport, fport; 1117 in_addr_t laddr, faddr; 1118 int anonport, error; 1119 1120 INP_WLOCK_ASSERT(inp); 1121 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1122 1123 lport = inp->inp_lport; 1124 laddr = inp->inp_laddr.s_addr; 1125 anonport = (lport == 0); 1126 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 1127 NULL, cred); 1128 if (error) 1129 return (error); 1130 1131 /* Do the initial binding of the local address if required. */ 1132 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 1133 KASSERT(rehash == true, 1134 ("Rehashing required for unbound inps")); 1135 inp->inp_lport = lport; 1136 inp->inp_laddr.s_addr = laddr; 1137 if (in_pcbinshash(inp) != 0) { 1138 inp->inp_laddr.s_addr = INADDR_ANY; 1139 inp->inp_lport = 0; 1140 return (EAGAIN); 1141 } 1142 } 1143 1144 /* Commit the remaining changes. */ 1145 inp->inp_lport = lport; 1146 inp->inp_laddr.s_addr = laddr; 1147 inp->inp_faddr.s_addr = faddr; 1148 inp->inp_fport = fport; 1149 if (rehash) { 1150 in_pcbrehash(inp); 1151 } else { 1152 in_pcbinshash(inp); 1153 } 1154 1155 if (anonport) 1156 inp->inp_flags |= INP_ANONPORT; 1157 return (0); 1158 } 1159 1160 /* 1161 * Do proper source address selection on an unbound socket in case 1162 * of connect. Take jails into account as well. 1163 */ 1164 int 1165 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 1166 struct ucred *cred) 1167 { 1168 struct ifaddr *ifa; 1169 struct sockaddr *sa; 1170 struct sockaddr_in *sin, dst; 1171 struct nhop_object *nh; 1172 int error; 1173 1174 NET_EPOCH_ASSERT(); 1175 KASSERT(laddr != NULL, ("%s: laddr NULL", __func__)); 1176 /* 1177 * Bypass source address selection and use the primary jail IP 1178 * if requested. 1179 */ 1180 if (cred != NULL && !prison_saddrsel_ip4(cred, laddr)) 1181 return (0); 1182 1183 error = 0; 1184 1185 nh = NULL; 1186 bzero(&dst, sizeof(dst)); 1187 sin = &dst; 1188 sin->sin_family = AF_INET; 1189 sin->sin_len = sizeof(struct sockaddr_in); 1190 sin->sin_addr.s_addr = faddr->s_addr; 1191 1192 /* 1193 * If route is known our src addr is taken from the i/f, 1194 * else punt. 1195 * 1196 * Find out route to destination. 1197 */ 1198 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1199 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1200 0, NHR_NONE, 0); 1201 1202 /* 1203 * If we found a route, use the address corresponding to 1204 * the outgoing interface. 1205 * 1206 * Otherwise assume faddr is reachable on a directly connected 1207 * network and try to find a corresponding interface to take 1208 * the source address from. 1209 */ 1210 if (nh == NULL || nh->nh_ifp == NULL) { 1211 struct in_ifaddr *ia; 1212 struct ifnet *ifp; 1213 1214 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1215 inp->inp_socket->so_fibnum)); 1216 if (ia == NULL) { 1217 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1218 inp->inp_socket->so_fibnum)); 1219 } 1220 if (ia == NULL) { 1221 error = ENETUNREACH; 1222 goto done; 1223 } 1224 1225 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1226 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1227 goto done; 1228 } 1229 1230 ifp = ia->ia_ifp; 1231 ia = NULL; 1232 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1233 sa = ifa->ifa_addr; 1234 if (sa->sa_family != AF_INET) 1235 continue; 1236 sin = (struct sockaddr_in *)sa; 1237 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1238 ia = (struct in_ifaddr *)ifa; 1239 break; 1240 } 1241 } 1242 if (ia != NULL) { 1243 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1244 goto done; 1245 } 1246 1247 /* 3. As a last resort return the 'default' jail address. */ 1248 error = prison_get_ip4(cred, laddr); 1249 goto done; 1250 } 1251 1252 /* 1253 * If the outgoing interface on the route found is not 1254 * a loopback interface, use the address from that interface. 1255 * In case of jails do those three steps: 1256 * 1. check if the interface address belongs to the jail. If so use it. 1257 * 2. check if we have any address on the outgoing interface 1258 * belonging to this jail. If so use it. 1259 * 3. as a last resort return the 'default' jail address. 1260 */ 1261 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1262 struct in_ifaddr *ia; 1263 struct ifnet *ifp; 1264 1265 /* If not jailed, use the default returned. */ 1266 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1267 ia = (struct in_ifaddr *)nh->nh_ifa; 1268 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1269 goto done; 1270 } 1271 1272 /* Jailed. */ 1273 /* 1. Check if the iface address belongs to the jail. */ 1274 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1275 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1276 ia = (struct in_ifaddr *)nh->nh_ifa; 1277 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1278 goto done; 1279 } 1280 1281 /* 1282 * 2. Check if we have any address on the outgoing interface 1283 * belonging to this jail. 1284 */ 1285 ia = NULL; 1286 ifp = nh->nh_ifp; 1287 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1288 sa = ifa->ifa_addr; 1289 if (sa->sa_family != AF_INET) 1290 continue; 1291 sin = (struct sockaddr_in *)sa; 1292 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1293 ia = (struct in_ifaddr *)ifa; 1294 break; 1295 } 1296 } 1297 if (ia != NULL) { 1298 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1299 goto done; 1300 } 1301 1302 /* 3. As a last resort return the 'default' jail address. */ 1303 error = prison_get_ip4(cred, laddr); 1304 goto done; 1305 } 1306 1307 /* 1308 * The outgoing interface is marked with 'loopback net', so a route 1309 * to ourselves is here. 1310 * Try to find the interface of the destination address and then 1311 * take the address from there. That interface is not necessarily 1312 * a loopback interface. 1313 * In case of jails, check that it is an address of the jail 1314 * and if we cannot find, fall back to the 'default' jail address. 1315 */ 1316 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1317 struct in_ifaddr *ia; 1318 1319 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1320 inp->inp_socket->so_fibnum)); 1321 if (ia == NULL) 1322 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1323 inp->inp_socket->so_fibnum)); 1324 if (ia == NULL) 1325 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1326 1327 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1328 if (ia == NULL) { 1329 error = ENETUNREACH; 1330 goto done; 1331 } 1332 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1333 goto done; 1334 } 1335 1336 /* Jailed. */ 1337 if (ia != NULL) { 1338 struct ifnet *ifp; 1339 1340 ifp = ia->ia_ifp; 1341 ia = NULL; 1342 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1343 sa = ifa->ifa_addr; 1344 if (sa->sa_family != AF_INET) 1345 continue; 1346 sin = (struct sockaddr_in *)sa; 1347 if (prison_check_ip4(cred, 1348 &sin->sin_addr) == 0) { 1349 ia = (struct in_ifaddr *)ifa; 1350 break; 1351 } 1352 } 1353 if (ia != NULL) { 1354 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1355 goto done; 1356 } 1357 } 1358 1359 /* 3. As a last resort return the 'default' jail address. */ 1360 error = prison_get_ip4(cred, laddr); 1361 goto done; 1362 } 1363 1364 done: 1365 return (error); 1366 } 1367 1368 /* 1369 * Set up for a connect from a socket to the specified address. 1370 * On entry, *laddrp and *lportp should contain the current local 1371 * address and port for the PCB; these are updated to the values 1372 * that should be placed in inp_laddr and inp_lport to complete 1373 * the connect. 1374 * 1375 * On success, *faddrp and *fportp will be set to the remote address 1376 * and port. These are not updated in the error case. 1377 * 1378 * If the operation fails because the connection already exists, 1379 * *oinpp will be set to the PCB of that connection so that the 1380 * caller can decide to override it. In all other cases, *oinpp 1381 * is set to NULL. 1382 */ 1383 int 1384 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, 1385 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, 1386 struct inpcb **oinpp, struct ucred *cred) 1387 { 1388 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 1389 struct in_ifaddr *ia; 1390 struct inpcb *oinp; 1391 struct in_addr laddr, faddr; 1392 u_short lport, fport; 1393 int error; 1394 1395 KASSERT(sin->sin_family == AF_INET, 1396 ("%s: invalid address family for %p", __func__, sin)); 1397 KASSERT(sin->sin_len == sizeof(*sin), 1398 ("%s: invalid address length for %p", __func__, sin)); 1399 1400 /* 1401 * Because a global state change doesn't actually occur here, a read 1402 * lock is sufficient. 1403 */ 1404 NET_EPOCH_ASSERT(); 1405 INP_LOCK_ASSERT(inp); 1406 INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo); 1407 1408 if (oinpp != NULL) 1409 *oinpp = NULL; 1410 if (sin->sin_port == 0) 1411 return (EADDRNOTAVAIL); 1412 laddr.s_addr = *laddrp; 1413 lport = *lportp; 1414 faddr = sin->sin_addr; 1415 fport = sin->sin_port; 1416 #ifdef ROUTE_MPATH 1417 if (CALC_FLOWID_OUTBOUND) { 1418 uint32_t hash_val, hash_type; 1419 1420 hash_val = fib4_calc_software_hash(laddr, faddr, 0, fport, 1421 inp->inp_socket->so_proto->pr_protocol, &hash_type); 1422 1423 inp->inp_flowid = hash_val; 1424 inp->inp_flowtype = hash_type; 1425 } 1426 #endif 1427 if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { 1428 /* 1429 * If the destination address is INADDR_ANY, 1430 * use the primary local address. 1431 * If the supplied address is INADDR_BROADCAST, 1432 * and the primary interface supports broadcast, 1433 * choose the broadcast address for that interface. 1434 */ 1435 if (faddr.s_addr == INADDR_ANY) { 1436 faddr = 1437 IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 1438 if (cred != NULL && 1439 (error = prison_get_ip4(cred, &faddr)) != 0) 1440 return (error); 1441 } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { 1442 if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & 1443 IFF_BROADCAST) 1444 faddr = satosin(&CK_STAILQ_FIRST( 1445 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 1446 } 1447 } 1448 if (laddr.s_addr == INADDR_ANY) { 1449 error = in_pcbladdr(inp, &faddr, &laddr, cred); 1450 /* 1451 * If the destination address is multicast and an outgoing 1452 * interface has been set as a multicast option, prefer the 1453 * address of that interface as our source address. 1454 */ 1455 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 1456 inp->inp_moptions != NULL) { 1457 struct ip_moptions *imo; 1458 struct ifnet *ifp; 1459 1460 imo = inp->inp_moptions; 1461 if (imo->imo_multicast_ifp != NULL) { 1462 ifp = imo->imo_multicast_ifp; 1463 CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { 1464 if ((ia->ia_ifp == ifp) && 1465 (cred == NULL || 1466 prison_check_ip4(cred, 1467 &ia->ia_addr.sin_addr) == 0)) 1468 break; 1469 } 1470 if (ia == NULL) 1471 error = EADDRNOTAVAIL; 1472 else { 1473 laddr = ia->ia_addr.sin_addr; 1474 error = 0; 1475 } 1476 } 1477 } 1478 if (error) 1479 return (error); 1480 } 1481 1482 if (lport != 0) { 1483 oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, 1484 fport, laddr, lport, 0, NULL, M_NODOM); 1485 if (oinp != NULL) { 1486 if (oinpp != NULL) 1487 *oinpp = oinp; 1488 return (EADDRINUSE); 1489 } 1490 } else { 1491 struct sockaddr_in lsin, fsin; 1492 1493 bzero(&lsin, sizeof(lsin)); 1494 bzero(&fsin, sizeof(fsin)); 1495 lsin.sin_family = AF_INET; 1496 lsin.sin_addr = laddr; 1497 fsin.sin_family = AF_INET; 1498 fsin.sin_addr = faddr; 1499 error = in_pcb_lport_dest(inp, (struct sockaddr *) &lsin, 1500 &lport, (struct sockaddr *)& fsin, fport, cred, 1501 INPLOOKUP_WILDCARD); 1502 if (error) 1503 return (error); 1504 } 1505 *laddrp = laddr.s_addr; 1506 *lportp = lport; 1507 *faddrp = faddr.s_addr; 1508 *fportp = fport; 1509 return (0); 1510 } 1511 1512 void 1513 in_pcbdisconnect(struct inpcb *inp) 1514 { 1515 1516 INP_WLOCK_ASSERT(inp); 1517 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 1518 1519 inp->inp_faddr.s_addr = INADDR_ANY; 1520 inp->inp_fport = 0; 1521 in_pcbrehash(inp); 1522 } 1523 #endif /* INET */ 1524 1525 /* 1526 * in_pcbdetach() is responsibe for disassociating a socket from an inpcb. 1527 * For most protocols, this will be invoked immediately prior to calling 1528 * in_pcbfree(). However, with TCP the inpcb may significantly outlive the 1529 * socket, in which case in_pcbfree() is deferred. 1530 */ 1531 void 1532 in_pcbdetach(struct inpcb *inp) 1533 { 1534 1535 KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__)); 1536 1537 #ifdef RATELIMIT 1538 if (inp->inp_snd_tag != NULL) 1539 in_pcbdetach_txrtlmt(inp); 1540 #endif 1541 inp->inp_socket->so_pcb = NULL; 1542 inp->inp_socket = NULL; 1543 } 1544 1545 /* 1546 * inpcb hash lookups are protected by SMR section. 1547 * 1548 * Once desired pcb has been found, switching from SMR section to a pcb 1549 * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK 1550 * here because SMR is a critical section. 1551 * In 99%+ cases inp_smr_lock() would obtain the lock immediately. 1552 */ 1553 static inline void 1554 inp_lock(struct inpcb *inp, const inp_lookup_t lock) 1555 { 1556 1557 lock == INPLOOKUP_RLOCKPCB ? 1558 rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); 1559 } 1560 1561 static inline void 1562 inp_unlock(struct inpcb *inp, const inp_lookup_t lock) 1563 { 1564 1565 lock == INPLOOKUP_RLOCKPCB ? 1566 rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); 1567 } 1568 1569 static inline int 1570 inp_trylock(struct inpcb *inp, const inp_lookup_t lock) 1571 { 1572 1573 return (lock == INPLOOKUP_RLOCKPCB ? 1574 rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); 1575 } 1576 1577 static inline bool 1578 in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) 1579 { 1580 1581 return (lock == INPLOOKUP_RLOCKPCB ? 1582 in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); 1583 } 1584 1585 bool 1586 inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) 1587 { 1588 1589 MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); 1590 SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); 1591 1592 if (__predict_true(inp_trylock(inp, lock))) { 1593 if (__predict_false(inp->inp_flags & INP_FREED)) { 1594 smr_exit(inp->inp_pcbinfo->ipi_smr); 1595 inp_unlock(inp, lock); 1596 return (false); 1597 } 1598 smr_exit(inp->inp_pcbinfo->ipi_smr); 1599 return (true); 1600 } 1601 1602 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1603 smr_exit(inp->inp_pcbinfo->ipi_smr); 1604 inp_lock(inp, lock); 1605 if (__predict_false(in_pcbrele(inp, lock))) 1606 return (false); 1607 /* 1608 * inp acquired through refcount & lock for sure didn't went 1609 * through uma_zfree(). However, it may have already went 1610 * through in_pcbfree() and has another reference, that 1611 * prevented its release by our in_pcbrele(). 1612 */ 1613 if (__predict_false(inp->inp_flags & INP_FREED)) { 1614 inp_unlock(inp, lock); 1615 return (false); 1616 } 1617 return (true); 1618 } else { 1619 smr_exit(inp->inp_pcbinfo->ipi_smr); 1620 return (false); 1621 } 1622 } 1623 1624 /* 1625 * inp_next() - inpcb hash/list traversal iterator 1626 * 1627 * Requires initialized struct inpcb_iterator for context. 1628 * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). 1629 * 1630 * - Iterator can have either write-lock or read-lock semantics, that can not 1631 * be changed later. 1632 * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through 1633 * a single hash slot. Note: only rip_input() does the latter. 1634 * - Iterator may have optional bool matching function. The matching function 1635 * will be executed for each inpcb in the SMR context, so it can not acquire 1636 * locks and can safely access only immutable fields of inpcb. 1637 * 1638 * A fresh initialized iterator has NULL inpcb in its context and that 1639 * means that inp_next() call would return the very first inpcb on the list 1640 * locked with desired semantic. In all following calls the context pointer 1641 * shall hold the current inpcb pointer. The KPI user is not supposed to 1642 * unlock the current inpcb! Upon end of traversal inp_next() will return NULL 1643 * and write NULL to its context. After end of traversal an iterator can be 1644 * reused. 1645 * 1646 * List traversals have the following features/constraints: 1647 * - New entries won't be seen, as they are always added to the head of a list. 1648 * - Removed entries won't stop traversal as long as they are not added to 1649 * a different list. This is violated by in_pcbrehash(). 1650 */ 1651 #define II_LIST_FIRST(ipi, hash) \ 1652 (((hash) == INP_ALL_LIST) ? \ 1653 CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ 1654 CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) 1655 #define II_LIST_NEXT(inp, hash) \ 1656 (((hash) == INP_ALL_LIST) ? \ 1657 CK_LIST_NEXT((inp), inp_list) : \ 1658 CK_LIST_NEXT((inp), inp_hash)) 1659 #define II_LOCK_ASSERT(inp, lock) \ 1660 rw_assert(&(inp)->inp_lock, \ 1661 (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) 1662 struct inpcb * 1663 inp_next(struct inpcb_iterator *ii) 1664 { 1665 const struct inpcbinfo *ipi = ii->ipi; 1666 inp_match_t *match = ii->match; 1667 void *ctx = ii->ctx; 1668 inp_lookup_t lock = ii->lock; 1669 int hash = ii->hash; 1670 struct inpcb *inp; 1671 1672 if (ii->inp == NULL) { /* First call. */ 1673 smr_enter(ipi->ipi_smr); 1674 /* This is unrolled CK_LIST_FOREACH(). */ 1675 for (inp = II_LIST_FIRST(ipi, hash); 1676 inp != NULL; 1677 inp = II_LIST_NEXT(inp, hash)) { 1678 if (match != NULL && (match)(inp, ctx) == false) 1679 continue; 1680 if (__predict_true(inp_smr_lock(inp, lock))) 1681 break; 1682 else { 1683 smr_enter(ipi->ipi_smr); 1684 MPASS(inp != II_LIST_FIRST(ipi, hash)); 1685 inp = II_LIST_FIRST(ipi, hash); 1686 if (inp == NULL) 1687 break; 1688 } 1689 } 1690 1691 if (inp == NULL) 1692 smr_exit(ipi->ipi_smr); 1693 else 1694 ii->inp = inp; 1695 1696 return (inp); 1697 } 1698 1699 /* Not a first call. */ 1700 smr_enter(ipi->ipi_smr); 1701 restart: 1702 inp = ii->inp; 1703 II_LOCK_ASSERT(inp, lock); 1704 next: 1705 inp = II_LIST_NEXT(inp, hash); 1706 if (inp == NULL) { 1707 smr_exit(ipi->ipi_smr); 1708 goto found; 1709 } 1710 1711 if (match != NULL && (match)(inp, ctx) == false) 1712 goto next; 1713 1714 if (__predict_true(inp_trylock(inp, lock))) { 1715 if (__predict_false(inp->inp_flags & INP_FREED)) { 1716 /* 1717 * Entries are never inserted in middle of a list, thus 1718 * as long as we are in SMR, we can continue traversal. 1719 * Jump to 'restart' should yield in the same result, 1720 * but could produce unnecessary looping. Could this 1721 * looping be unbound? 1722 */ 1723 inp_unlock(inp, lock); 1724 goto next; 1725 } else { 1726 smr_exit(ipi->ipi_smr); 1727 goto found; 1728 } 1729 } 1730 1731 /* 1732 * Can't obtain lock immediately, thus going hard. Once we exit the 1733 * SMR section we can no longer jump to 'next', and our only stable 1734 * anchoring point is ii->inp, which we keep locked for this case, so 1735 * we jump to 'restart'. 1736 */ 1737 if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { 1738 smr_exit(ipi->ipi_smr); 1739 inp_lock(inp, lock); 1740 if (__predict_false(in_pcbrele(inp, lock))) { 1741 smr_enter(ipi->ipi_smr); 1742 goto restart; 1743 } 1744 /* 1745 * See comment in inp_smr_lock(). 1746 */ 1747 if (__predict_false(inp->inp_flags & INP_FREED)) { 1748 inp_unlock(inp, lock); 1749 smr_enter(ipi->ipi_smr); 1750 goto restart; 1751 } 1752 } else 1753 goto next; 1754 1755 found: 1756 inp_unlock(ii->inp, lock); 1757 ii->inp = inp; 1758 1759 return (ii->inp); 1760 } 1761 1762 /* 1763 * in_pcbref() bumps the reference count on an inpcb in order to maintain 1764 * stability of an inpcb pointer despite the inpcb lock being released or 1765 * SMR section exited. 1766 * 1767 * To free a reference later in_pcbrele_(r|w)locked() must be performed. 1768 */ 1769 void 1770 in_pcbref(struct inpcb *inp) 1771 { 1772 u_int old __diagused; 1773 1774 old = refcount_acquire(&inp->inp_refcount); 1775 KASSERT(old > 0, ("%s: refcount 0", __func__)); 1776 } 1777 1778 /* 1779 * Drop a refcount on an inpcb elevated using in_pcbref(), potentially 1780 * freeing the pcb, if the reference was very last. 1781 */ 1782 bool 1783 in_pcbrele_rlocked(struct inpcb *inp) 1784 { 1785 1786 INP_RLOCK_ASSERT(inp); 1787 1788 if (refcount_release(&inp->inp_refcount) == 0) 1789 return (false); 1790 1791 MPASS(inp->inp_flags & INP_FREED); 1792 MPASS(inp->inp_socket == NULL); 1793 MPASS(inp->inp_in_hpts == 0); 1794 INP_RUNLOCK(inp); 1795 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1796 return (true); 1797 } 1798 1799 bool 1800 in_pcbrele_wlocked(struct inpcb *inp) 1801 { 1802 1803 INP_WLOCK_ASSERT(inp); 1804 1805 if (refcount_release(&inp->inp_refcount) == 0) 1806 return (false); 1807 1808 MPASS(inp->inp_flags & INP_FREED); 1809 MPASS(inp->inp_socket == NULL); 1810 MPASS(inp->inp_in_hpts == 0); 1811 INP_WUNLOCK(inp); 1812 uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); 1813 return (true); 1814 } 1815 1816 /* 1817 * Unconditionally schedule an inpcb to be freed by decrementing its 1818 * reference count, which should occur only after the inpcb has been detached 1819 * from its socket. If another thread holds a temporary reference (acquired 1820 * using in_pcbref()) then the free is deferred until that reference is 1821 * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. 1822 * Almost all work, including removal from global lists, is done in this 1823 * context, where the pcbinfo lock is held. 1824 */ 1825 void 1826 in_pcbfree(struct inpcb *inp) 1827 { 1828 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1829 #ifdef INET 1830 struct ip_moptions *imo; 1831 #endif 1832 #ifdef INET6 1833 struct ip6_moptions *im6o; 1834 #endif 1835 1836 INP_WLOCK_ASSERT(inp); 1837 KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); 1838 KASSERT((inp->inp_flags & INP_FREED) == 0, 1839 ("%s: called twice for pcb %p", __func__, inp)); 1840 1841 inp->inp_flags |= INP_FREED; 1842 INP_INFO_WLOCK(pcbinfo); 1843 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 1844 pcbinfo->ipi_count--; 1845 CK_LIST_REMOVE(inp, inp_list); 1846 INP_INFO_WUNLOCK(pcbinfo); 1847 1848 if (inp->inp_flags & INP_INHASHLIST) { 1849 struct inpcbport *phd = inp->inp_phd; 1850 1851 INP_HASH_WLOCK(pcbinfo); 1852 /* XXX: Only do if SO_REUSEPORT_LB set? */ 1853 in_pcbremlbgrouphash(inp); 1854 1855 CK_LIST_REMOVE(inp, inp_hash); 1856 CK_LIST_REMOVE(inp, inp_portlist); 1857 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 1858 CK_LIST_REMOVE(phd, phd_hash); 1859 uma_zfree_smr(pcbinfo->ipi_portzone, phd); 1860 } 1861 INP_HASH_WUNLOCK(pcbinfo); 1862 inp->inp_flags &= ~INP_INHASHLIST; 1863 } 1864 1865 RO_INVALIDATE_CACHE(&inp->inp_route); 1866 #ifdef MAC 1867 mac_inpcb_destroy(inp); 1868 #endif 1869 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 1870 if (inp->inp_sp != NULL) 1871 ipsec_delete_pcbpolicy(inp); 1872 #endif 1873 #ifdef INET 1874 if (inp->inp_options) 1875 (void)m_free(inp->inp_options); 1876 imo = inp->inp_moptions; 1877 #endif 1878 #ifdef INET6 1879 if (inp->inp_vflag & INP_IPV6PROTO) { 1880 ip6_freepcbopts(inp->in6p_outputopts); 1881 im6o = inp->in6p_moptions; 1882 } else 1883 im6o = NULL; 1884 #endif 1885 1886 if (__predict_false(in_pcbrele_wlocked(inp) == false)) { 1887 INP_WUNLOCK(inp); 1888 } 1889 #ifdef INET6 1890 ip6_freemoptions(im6o); 1891 #endif 1892 #ifdef INET 1893 inp_freemoptions(imo); 1894 #endif 1895 /* Destruction is finalized in inpcb_dtor(). */ 1896 } 1897 1898 static void 1899 inpcb_dtor(void *mem, int size, void *arg) 1900 { 1901 struct inpcb *inp = mem; 1902 1903 crfree(inp->inp_cred); 1904 #ifdef INVARIANTS 1905 inp->inp_cred = NULL; 1906 #endif 1907 } 1908 1909 /* 1910 * Different protocols initialize their inpcbs differently - giving 1911 * different name to the lock. But they all are disposed the same. 1912 */ 1913 static void 1914 inpcb_fini(void *mem, int size) 1915 { 1916 struct inpcb *inp = mem; 1917 1918 INP_LOCK_DESTROY(inp); 1919 } 1920 1921 /* 1922 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 1923 * port reservation, and preventing it from being returned by inpcb lookups. 1924 * 1925 * It is used by TCP to mark an inpcb as unused and avoid future packet 1926 * delivery or event notification when a socket remains open but TCP has 1927 * closed. This might occur as a result of a shutdown()-initiated TCP close 1928 * or a RST on the wire, and allows the port binding to be reused while still 1929 * maintaining the invariant that so_pcb always points to a valid inpcb until 1930 * in_pcbdetach(). 1931 * 1932 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 1933 * in_pcbnotifyall() and in_pcbpurgeif0()? 1934 */ 1935 void 1936 in_pcbdrop(struct inpcb *inp) 1937 { 1938 1939 INP_WLOCK_ASSERT(inp); 1940 #ifdef INVARIANTS 1941 if (inp->inp_socket != NULL && inp->inp_ppcb != NULL) 1942 MPASS(inp->inp_refcount > 1); 1943 #endif 1944 1945 /* 1946 * XXXRW: Possibly we should protect the setting of INP_DROPPED with 1947 * the hash lock...? 1948 */ 1949 inp->inp_flags |= INP_DROPPED; 1950 if (inp->inp_flags & INP_INHASHLIST) { 1951 struct inpcbport *phd = inp->inp_phd; 1952 1953 INP_HASH_WLOCK(inp->inp_pcbinfo); 1954 in_pcbremlbgrouphash(inp); 1955 CK_LIST_REMOVE(inp, inp_hash); 1956 CK_LIST_REMOVE(inp, inp_portlist); 1957 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 1958 CK_LIST_REMOVE(phd, phd_hash); 1959 uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); 1960 } 1961 INP_HASH_WUNLOCK(inp->inp_pcbinfo); 1962 inp->inp_flags &= ~INP_INHASHLIST; 1963 } 1964 } 1965 1966 #ifdef INET 1967 /* 1968 * Common routines to return the socket addresses associated with inpcbs. 1969 */ 1970 struct sockaddr * 1971 in_sockaddr(in_port_t port, struct in_addr *addr_p) 1972 { 1973 struct sockaddr_in *sin; 1974 1975 sin = malloc(sizeof *sin, M_SONAME, 1976 M_WAITOK | M_ZERO); 1977 sin->sin_family = AF_INET; 1978 sin->sin_len = sizeof(*sin); 1979 sin->sin_addr = *addr_p; 1980 sin->sin_port = port; 1981 1982 return (struct sockaddr *)sin; 1983 } 1984 1985 int 1986 in_getsockaddr(struct socket *so, struct sockaddr **nam) 1987 { 1988 struct inpcb *inp; 1989 struct in_addr addr; 1990 in_port_t port; 1991 1992 inp = sotoinpcb(so); 1993 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 1994 1995 INP_RLOCK(inp); 1996 port = inp->inp_lport; 1997 addr = inp->inp_laddr; 1998 INP_RUNLOCK(inp); 1999 2000 *nam = in_sockaddr(port, &addr); 2001 return 0; 2002 } 2003 2004 int 2005 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 2006 { 2007 struct inpcb *inp; 2008 struct in_addr addr; 2009 in_port_t port; 2010 2011 inp = sotoinpcb(so); 2012 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 2013 2014 INP_RLOCK(inp); 2015 port = inp->inp_fport; 2016 addr = inp->inp_faddr; 2017 INP_RUNLOCK(inp); 2018 2019 *nam = in_sockaddr(port, &addr); 2020 return 0; 2021 } 2022 2023 void 2024 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 2025 struct inpcb *(*notify)(struct inpcb *, int)) 2026 { 2027 struct inpcb *inp, *inp_temp; 2028 2029 INP_INFO_WLOCK(pcbinfo); 2030 CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { 2031 INP_WLOCK(inp); 2032 #ifdef INET6 2033 if ((inp->inp_vflag & INP_IPV4) == 0) { 2034 INP_WUNLOCK(inp); 2035 continue; 2036 } 2037 #endif 2038 if (inp->inp_faddr.s_addr != faddr.s_addr || 2039 inp->inp_socket == NULL) { 2040 INP_WUNLOCK(inp); 2041 continue; 2042 } 2043 if ((*notify)(inp, errno)) 2044 INP_WUNLOCK(inp); 2045 } 2046 INP_INFO_WUNLOCK(pcbinfo); 2047 } 2048 2049 static bool 2050 inp_v4_multi_match(const struct inpcb *inp, void *v __unused) 2051 { 2052 2053 if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) 2054 return (true); 2055 else 2056 return (false); 2057 } 2058 2059 void 2060 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 2061 { 2062 struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, 2063 inp_v4_multi_match, NULL); 2064 struct inpcb *inp; 2065 struct in_multi *inm; 2066 struct in_mfilter *imf; 2067 struct ip_moptions *imo; 2068 2069 IN_MULTI_LOCK_ASSERT(); 2070 2071 while ((inp = inp_next(&inpi)) != NULL) { 2072 INP_WLOCK_ASSERT(inp); 2073 2074 imo = inp->inp_moptions; 2075 /* 2076 * Unselect the outgoing interface if it is being 2077 * detached. 2078 */ 2079 if (imo->imo_multicast_ifp == ifp) 2080 imo->imo_multicast_ifp = NULL; 2081 2082 /* 2083 * Drop multicast group membership if we joined 2084 * through the interface being detached. 2085 * 2086 * XXX This can all be deferred to an epoch_call 2087 */ 2088 restart: 2089 IP_MFILTER_FOREACH(imf, &imo->imo_head) { 2090 if ((inm = imf->imf_inm) == NULL) 2091 continue; 2092 if (inm->inm_ifp != ifp) 2093 continue; 2094 ip_mfilter_remove(&imo->imo_head, imf); 2095 in_leavegroup_locked(inm, NULL); 2096 ip_mfilter_free(imf); 2097 goto restart; 2098 } 2099 } 2100 } 2101 2102 /* 2103 * Lookup a PCB based on the local address and port. Caller must hold the 2104 * hash lock. No inpcb locks or references are acquired. 2105 */ 2106 #define INP_LOOKUP_MAPPED_PCB_COST 3 2107 struct inpcb * 2108 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 2109 u_short lport, int lookupflags, struct ucred *cred) 2110 { 2111 struct inpcb *inp; 2112 #ifdef INET6 2113 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 2114 #else 2115 int matchwild = 3; 2116 #endif 2117 int wildcard; 2118 2119 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2120 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2121 INP_HASH_LOCK_ASSERT(pcbinfo); 2122 2123 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 2124 struct inpcbhead *head; 2125 /* 2126 * Look for an unconnected (wildcard foreign addr) PCB that 2127 * matches the local address and port we're looking for. 2128 */ 2129 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2130 pcbinfo->ipi_hashmask)]; 2131 CK_LIST_FOREACH(inp, head, inp_hash) { 2132 #ifdef INET6 2133 /* XXX inp locking */ 2134 if ((inp->inp_vflag & INP_IPV4) == 0) 2135 continue; 2136 #endif 2137 if (inp->inp_faddr.s_addr == INADDR_ANY && 2138 inp->inp_laddr.s_addr == laddr.s_addr && 2139 inp->inp_lport == lport) { 2140 /* 2141 * Found? 2142 */ 2143 if (cred == NULL || 2144 prison_equal_ip4(cred->cr_prison, 2145 inp->inp_cred->cr_prison)) 2146 return (inp); 2147 } 2148 } 2149 /* 2150 * Not found. 2151 */ 2152 return (NULL); 2153 } else { 2154 struct inpcbporthead *porthash; 2155 struct inpcbport *phd; 2156 struct inpcb *match = NULL; 2157 /* 2158 * Best fit PCB lookup. 2159 * 2160 * First see if this local port is in use by looking on the 2161 * port hash list. 2162 */ 2163 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 2164 pcbinfo->ipi_porthashmask)]; 2165 CK_LIST_FOREACH(phd, porthash, phd_hash) { 2166 if (phd->phd_port == lport) 2167 break; 2168 } 2169 if (phd != NULL) { 2170 /* 2171 * Port is in use by one or more PCBs. Look for best 2172 * fit. 2173 */ 2174 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 2175 wildcard = 0; 2176 if (cred != NULL && 2177 !prison_equal_ip4(inp->inp_cred->cr_prison, 2178 cred->cr_prison)) 2179 continue; 2180 #ifdef INET6 2181 /* XXX inp locking */ 2182 if ((inp->inp_vflag & INP_IPV4) == 0) 2183 continue; 2184 /* 2185 * We never select the PCB that has 2186 * INP_IPV6 flag and is bound to :: if 2187 * we have another PCB which is bound 2188 * to 0.0.0.0. If a PCB has the 2189 * INP_IPV6 flag, then we set its cost 2190 * higher than IPv4 only PCBs. 2191 * 2192 * Note that the case only happens 2193 * when a socket is bound to ::, under 2194 * the condition that the use of the 2195 * mapped address is allowed. 2196 */ 2197 if ((inp->inp_vflag & INP_IPV6) != 0) 2198 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 2199 #endif 2200 if (inp->inp_faddr.s_addr != INADDR_ANY) 2201 wildcard++; 2202 if (inp->inp_laddr.s_addr != INADDR_ANY) { 2203 if (laddr.s_addr == INADDR_ANY) 2204 wildcard++; 2205 else if (inp->inp_laddr.s_addr != laddr.s_addr) 2206 continue; 2207 } else { 2208 if (laddr.s_addr != INADDR_ANY) 2209 wildcard++; 2210 } 2211 if (wildcard < matchwild) { 2212 match = inp; 2213 matchwild = wildcard; 2214 if (matchwild == 0) 2215 break; 2216 } 2217 } 2218 } 2219 return (match); 2220 } 2221 } 2222 #undef INP_LOOKUP_MAPPED_PCB_COST 2223 2224 static struct inpcb * 2225 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 2226 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 2227 uint16_t fport, int lookupflags, int numa_domain) 2228 { 2229 struct inpcb *local_wild, *numa_wild; 2230 const struct inpcblbgrouphead *hdr; 2231 struct inpcblbgroup *grp; 2232 uint32_t idx; 2233 2234 INP_HASH_LOCK_ASSERT(pcbinfo); 2235 2236 hdr = &pcbinfo->ipi_lbgrouphashbase[ 2237 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 2238 2239 /* 2240 * Order of socket selection: 2241 * 1. non-wild. 2242 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). 2243 * 2244 * NOTE: 2245 * - Load balanced group does not contain jailed sockets 2246 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets 2247 */ 2248 local_wild = NULL; 2249 numa_wild = NULL; 2250 CK_LIST_FOREACH(grp, hdr, il_list) { 2251 #ifdef INET6 2252 if (!(grp->il_vflag & INP_IPV4)) 2253 continue; 2254 #endif 2255 if (grp->il_lport != lport) 2256 continue; 2257 2258 idx = INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) % 2259 grp->il_inpcnt; 2260 if (grp->il_laddr.s_addr == laddr->s_addr) { 2261 if (numa_domain == M_NODOM || 2262 grp->il_numa_domain == numa_domain) { 2263 return (grp->il_inp[idx]); 2264 } else { 2265 numa_wild = grp->il_inp[idx]; 2266 } 2267 } 2268 if (grp->il_laddr.s_addr == INADDR_ANY && 2269 (lookupflags & INPLOOKUP_WILDCARD) != 0 && 2270 (local_wild == NULL || numa_domain == M_NODOM || 2271 grp->il_numa_domain == numa_domain)) { 2272 local_wild = grp->il_inp[idx]; 2273 } 2274 } 2275 if (numa_wild != NULL) 2276 return (numa_wild); 2277 2278 return (local_wild); 2279 } 2280 2281 /* 2282 * Lookup PCB in hash list, using pcbinfo tables. This variation assumes 2283 * that the caller has either locked the hash list, which usually happens 2284 * for bind(2) operations, or is in SMR section, which happens when sorting 2285 * out incoming packets. 2286 */ 2287 static struct inpcb * 2288 in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2289 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, 2290 struct ifnet *ifp, uint8_t numa_domain) 2291 { 2292 struct inpcbhead *head; 2293 struct inpcb *inp, *tmpinp; 2294 u_short fport = fport_arg, lport = lport_arg; 2295 2296 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 2297 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2298 INP_HASH_LOCK_ASSERT(pcbinfo); 2299 2300 /* 2301 * First look for an exact match. 2302 */ 2303 tmpinp = NULL; 2304 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport, 2305 pcbinfo->ipi_hashmask)]; 2306 CK_LIST_FOREACH(inp, head, inp_hash) { 2307 #ifdef INET6 2308 /* XXX inp locking */ 2309 if ((inp->inp_vflag & INP_IPV4) == 0) 2310 continue; 2311 #endif 2312 if (inp->inp_faddr.s_addr == faddr.s_addr && 2313 inp->inp_laddr.s_addr == laddr.s_addr && 2314 inp->inp_fport == fport && 2315 inp->inp_lport == lport) { 2316 /* 2317 * XXX We should be able to directly return 2318 * the inp here, without any checks. 2319 * Well unless both bound with SO_REUSEPORT? 2320 */ 2321 if (prison_flag(inp->inp_cred, PR_IP4)) 2322 return (inp); 2323 if (tmpinp == NULL) 2324 tmpinp = inp; 2325 } 2326 } 2327 if (tmpinp != NULL) 2328 return (tmpinp); 2329 2330 /* 2331 * Then look in lb group (for wildcard match). 2332 */ 2333 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2334 inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, 2335 fport, lookupflags, numa_domain); 2336 if (inp != NULL) 2337 return (inp); 2338 } 2339 2340 /* 2341 * Then look for a wildcard match, if requested. 2342 */ 2343 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2344 struct inpcb *local_wild = NULL, *local_exact = NULL; 2345 #ifdef INET6 2346 struct inpcb *local_wild_mapped = NULL; 2347 #endif 2348 struct inpcb *jail_wild = NULL; 2349 int injail; 2350 2351 /* 2352 * Order of socket selection - we always prefer jails. 2353 * 1. jailed, non-wild. 2354 * 2. jailed, wild. 2355 * 3. non-jailed, non-wild. 2356 * 4. non-jailed, wild. 2357 */ 2358 2359 head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, 2360 pcbinfo->ipi_hashmask)]; 2361 CK_LIST_FOREACH(inp, head, inp_hash) { 2362 #ifdef INET6 2363 /* XXX inp locking */ 2364 if ((inp->inp_vflag & INP_IPV4) == 0) 2365 continue; 2366 #endif 2367 if (inp->inp_faddr.s_addr != INADDR_ANY || 2368 inp->inp_lport != lport) 2369 continue; 2370 2371 injail = prison_flag(inp->inp_cred, PR_IP4); 2372 if (injail) { 2373 if (prison_check_ip4_locked( 2374 inp->inp_cred->cr_prison, &laddr) != 0) 2375 continue; 2376 } else { 2377 if (local_exact != NULL) 2378 continue; 2379 } 2380 2381 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2382 if (injail) 2383 return (inp); 2384 else 2385 local_exact = inp; 2386 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2387 #ifdef INET6 2388 /* XXX inp locking, NULL check */ 2389 if (inp->inp_vflag & INP_IPV6PROTO) 2390 local_wild_mapped = inp; 2391 else 2392 #endif 2393 if (injail) 2394 jail_wild = inp; 2395 else 2396 local_wild = inp; 2397 } 2398 } /* LIST_FOREACH */ 2399 if (jail_wild != NULL) 2400 return (jail_wild); 2401 if (local_exact != NULL) 2402 return (local_exact); 2403 if (local_wild != NULL) 2404 return (local_wild); 2405 #ifdef INET6 2406 if (local_wild_mapped != NULL) 2407 return (local_wild_mapped); 2408 #endif 2409 } /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */ 2410 2411 return (NULL); 2412 } 2413 2414 /* 2415 * Lookup PCB in hash list, using pcbinfo tables. This variation locks the 2416 * hash list lock, and will return the inpcb locked (i.e., requires 2417 * INPLOOKUP_LOCKPCB). 2418 */ 2419 static struct inpcb * 2420 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2421 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2422 struct ifnet *ifp, uint8_t numa_domain) 2423 { 2424 struct inpcb *inp; 2425 2426 smr_enter(pcbinfo->ipi_smr); 2427 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2428 lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); 2429 if (inp != NULL) { 2430 if (__predict_false(inp_smr_lock(inp, 2431 (lookupflags & INPLOOKUP_LOCKMASK)) == false)) 2432 inp = NULL; 2433 } else 2434 smr_exit(pcbinfo->ipi_smr); 2435 2436 return (inp); 2437 } 2438 2439 /* 2440 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2441 * from which a pre-calculated hash value may be extracted. 2442 */ 2443 struct inpcb * 2444 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2445 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2446 { 2447 2448 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2449 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2450 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2451 ("%s: LOCKPCB not set", __func__)); 2452 2453 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2454 lookupflags, ifp, M_NODOM)); 2455 } 2456 2457 struct inpcb * 2458 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2459 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2460 struct ifnet *ifp, struct mbuf *m) 2461 { 2462 2463 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2464 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2465 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2466 ("%s: LOCKPCB not set", __func__)); 2467 2468 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2469 lookupflags, ifp, m->m_pkthdr.numa_domain)); 2470 } 2471 #endif /* INET */ 2472 2473 /* 2474 * Insert PCB onto various hash lists. 2475 */ 2476 int 2477 in_pcbinshash(struct inpcb *inp) 2478 { 2479 struct inpcbhead *pcbhash; 2480 struct inpcbporthead *pcbporthash; 2481 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2482 struct inpcbport *phd; 2483 int so_options; 2484 2485 INP_WLOCK_ASSERT(inp); 2486 INP_HASH_WLOCK_ASSERT(pcbinfo); 2487 2488 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2489 ("in_pcbinshash: INP_INHASHLIST")); 2490 2491 #ifdef INET6 2492 if (inp->inp_vflag & INP_IPV6) 2493 pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2494 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2495 else 2496 #endif 2497 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2498 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2499 2500 pcbporthash = &pcbinfo->ipi_porthashbase[ 2501 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2502 2503 /* 2504 * Add entry to load balance group. 2505 * Only do this if SO_REUSEPORT_LB is set. 2506 */ 2507 so_options = inp_so_options(inp); 2508 if (so_options & SO_REUSEPORT_LB) { 2509 int ret = in_pcbinslbgrouphash(inp, M_NODOM); 2510 if (ret) { 2511 /* pcb lb group malloc fail (ret=ENOBUFS). */ 2512 return (ret); 2513 } 2514 } 2515 2516 /* 2517 * Go through port list and look for a head for this lport. 2518 */ 2519 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2520 if (phd->phd_port == inp->inp_lport) 2521 break; 2522 } 2523 /* 2524 * If none exists, malloc one and tack it on. 2525 */ 2526 if (phd == NULL) { 2527 phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); 2528 if (phd == NULL) { 2529 return (ENOBUFS); /* XXX */ 2530 } 2531 phd->phd_port = inp->inp_lport; 2532 CK_LIST_INIT(&phd->phd_pcblist); 2533 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2534 } 2535 inp->inp_phd = phd; 2536 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2537 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2538 inp->inp_flags |= INP_INHASHLIST; 2539 2540 return (0); 2541 } 2542 2543 /* 2544 * Move PCB to the proper hash bucket when { faddr, fport } have been 2545 * changed. NOTE: This does not handle the case of the lport changing (the 2546 * hashed port list would have to be updated as well), so the lport must 2547 * not change after in_pcbinshash() has been called. 2548 * 2549 * XXXGL: a race between this function and SMR-protected hash iterator 2550 * will lead to iterator traversing a possibly wrong hash list. However, 2551 * this race should have been here since change from rwlock to epoch. 2552 */ 2553 void 2554 in_pcbrehash(struct inpcb *inp) 2555 { 2556 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2557 struct inpcbhead *head; 2558 2559 INP_WLOCK_ASSERT(inp); 2560 INP_HASH_WLOCK_ASSERT(pcbinfo); 2561 2562 KASSERT(inp->inp_flags & INP_INHASHLIST, 2563 ("in_pcbrehash: !INP_INHASHLIST")); 2564 2565 #ifdef INET6 2566 if (inp->inp_vflag & INP_IPV6) 2567 head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, 2568 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2569 else 2570 #endif 2571 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, 2572 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2573 2574 CK_LIST_REMOVE(inp, inp_hash); 2575 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2576 } 2577 2578 /* 2579 * Check for alternatives when higher level complains 2580 * about service problems. For now, invalidate cached 2581 * routing information. If the route was created dynamically 2582 * (by a redirect), time to try a default gateway again. 2583 */ 2584 void 2585 in_losing(struct inpcb *inp) 2586 { 2587 2588 RO_INVALIDATE_CACHE(&inp->inp_route); 2589 return; 2590 } 2591 2592 /* 2593 * A set label operation has occurred at the socket layer, propagate the 2594 * label change into the in_pcb for the socket. 2595 */ 2596 void 2597 in_pcbsosetlabel(struct socket *so) 2598 { 2599 #ifdef MAC 2600 struct inpcb *inp; 2601 2602 inp = sotoinpcb(so); 2603 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2604 2605 INP_WLOCK(inp); 2606 SOCK_LOCK(so); 2607 mac_inpcb_sosetlabel(so, inp); 2608 SOCK_UNLOCK(so); 2609 INP_WUNLOCK(inp); 2610 #endif 2611 } 2612 2613 /* 2614 * ipport_tick runs once per second, determining if random port allocation 2615 * should be continued. If more than ipport_randomcps ports have been 2616 * allocated in the last second, then we return to sequential port 2617 * allocation. We return to random allocation only once we drop below 2618 * ipport_randomcps for at least ipport_randomtime seconds. 2619 */ 2620 static void 2621 ipport_tick(void *xtp) 2622 { 2623 VNET_ITERATOR_DECL(vnet_iter); 2624 2625 VNET_LIST_RLOCK_NOSLEEP(); 2626 VNET_FOREACH(vnet_iter) { 2627 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ 2628 if (V_ipport_tcpallocs - V_ipport_tcplastcount <= 2629 V_ipport_randomcps) { 2630 if (V_ipport_stoprandom > 0) 2631 V_ipport_stoprandom--; 2632 } else 2633 V_ipport_stoprandom = V_ipport_randomtime; 2634 V_ipport_tcplastcount = V_ipport_tcpallocs; 2635 CURVNET_RESTORE(); 2636 } 2637 VNET_LIST_RUNLOCK_NOSLEEP(); 2638 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); 2639 } 2640 2641 static void 2642 ip_fini(void *xtp) 2643 { 2644 2645 callout_stop(&ipport_tick_callout); 2646 } 2647 2648 /* 2649 * The ipport_callout should start running at about the time we attach the 2650 * inet or inet6 domains. 2651 */ 2652 static void 2653 ipport_tick_init(const void *unused __unused) 2654 { 2655 2656 /* Start ipport_tick. */ 2657 callout_init(&ipport_tick_callout, 1); 2658 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); 2659 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 2660 SHUTDOWN_PRI_DEFAULT); 2661 } 2662 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 2663 ipport_tick_init, NULL); 2664 2665 void 2666 inp_wlock(struct inpcb *inp) 2667 { 2668 2669 INP_WLOCK(inp); 2670 } 2671 2672 void 2673 inp_wunlock(struct inpcb *inp) 2674 { 2675 2676 INP_WUNLOCK(inp); 2677 } 2678 2679 void 2680 inp_rlock(struct inpcb *inp) 2681 { 2682 2683 INP_RLOCK(inp); 2684 } 2685 2686 void 2687 inp_runlock(struct inpcb *inp) 2688 { 2689 2690 INP_RUNLOCK(inp); 2691 } 2692 2693 #ifdef INVARIANT_SUPPORT 2694 void 2695 inp_lock_assert(struct inpcb *inp) 2696 { 2697 2698 INP_WLOCK_ASSERT(inp); 2699 } 2700 2701 void 2702 inp_unlock_assert(struct inpcb *inp) 2703 { 2704 2705 INP_UNLOCK_ASSERT(inp); 2706 } 2707 #endif 2708 2709 void 2710 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) 2711 { 2712 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, 2713 INPLOOKUP_WLOCKPCB); 2714 struct inpcb *inp; 2715 2716 while ((inp = inp_next(&inpi)) != NULL) 2717 func(inp, arg); 2718 } 2719 2720 struct socket * 2721 inp_inpcbtosocket(struct inpcb *inp) 2722 { 2723 2724 INP_WLOCK_ASSERT(inp); 2725 return (inp->inp_socket); 2726 } 2727 2728 struct tcpcb * 2729 inp_inpcbtotcpcb(struct inpcb *inp) 2730 { 2731 2732 INP_WLOCK_ASSERT(inp); 2733 return ((struct tcpcb *)inp->inp_ppcb); 2734 } 2735 2736 int 2737 inp_ip_tos_get(const struct inpcb *inp) 2738 { 2739 2740 return (inp->inp_ip_tos); 2741 } 2742 2743 void 2744 inp_ip_tos_set(struct inpcb *inp, int val) 2745 { 2746 2747 inp->inp_ip_tos = val; 2748 } 2749 2750 void 2751 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2752 uint32_t *faddr, uint16_t *fp) 2753 { 2754 2755 INP_LOCK_ASSERT(inp); 2756 *laddr = inp->inp_laddr.s_addr; 2757 *faddr = inp->inp_faddr.s_addr; 2758 *lp = inp->inp_lport; 2759 *fp = inp->inp_fport; 2760 } 2761 2762 struct inpcb * 2763 so_sotoinpcb(struct socket *so) 2764 { 2765 2766 return (sotoinpcb(so)); 2767 } 2768 2769 struct tcpcb * 2770 so_sototcpcb(struct socket *so) 2771 { 2772 2773 return (sototcpcb(so)); 2774 } 2775 2776 /* 2777 * Create an external-format (``xinpcb'') structure using the information in 2778 * the kernel-format in_pcb structure pointed to by inp. This is done to 2779 * reduce the spew of irrelevant information over this interface, to isolate 2780 * user code from changes in the kernel structure, and potentially to provide 2781 * information-hiding if we decide that some of this information should be 2782 * hidden from users. 2783 */ 2784 void 2785 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2786 { 2787 2788 bzero(xi, sizeof(*xi)); 2789 xi->xi_len = sizeof(struct xinpcb); 2790 if (inp->inp_socket) 2791 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2792 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2793 xi->inp_gencnt = inp->inp_gencnt; 2794 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2795 xi->inp_flow = inp->inp_flow; 2796 xi->inp_flowid = inp->inp_flowid; 2797 xi->inp_flowtype = inp->inp_flowtype; 2798 xi->inp_flags = inp->inp_flags; 2799 xi->inp_flags2 = inp->inp_flags2; 2800 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2801 xi->in6p_cksum = inp->in6p_cksum; 2802 xi->in6p_hops = inp->in6p_hops; 2803 xi->inp_ip_tos = inp->inp_ip_tos; 2804 xi->inp_vflag = inp->inp_vflag; 2805 xi->inp_ip_ttl = inp->inp_ip_ttl; 2806 xi->inp_ip_p = inp->inp_ip_p; 2807 xi->inp_ip_minttl = inp->inp_ip_minttl; 2808 } 2809 2810 int 2811 sysctl_setsockopt(SYSCTL_HANDLER_ARGS, struct inpcbinfo *pcbinfo, 2812 int (*ctloutput_set)(struct inpcb *, struct sockopt *)) 2813 { 2814 struct sockopt sopt; 2815 struct inpcb_iterator inpi = INP_ALL_ITERATOR(pcbinfo, 2816 INPLOOKUP_WLOCKPCB); 2817 struct inpcb *inp; 2818 struct sockopt_parameters *params; 2819 struct socket *so; 2820 int error; 2821 char buf[1024]; 2822 2823 if (req->oldptr != NULL || req->oldlen != 0) 2824 return (EINVAL); 2825 if (req->newptr == NULL) 2826 return (EPERM); 2827 if (req->newlen > sizeof(buf)) 2828 return (ENOMEM); 2829 error = SYSCTL_IN(req, buf, req->newlen); 2830 if (error != 0) 2831 return (error); 2832 if (req->newlen < sizeof(struct sockopt_parameters)) 2833 return (EINVAL); 2834 params = (struct sockopt_parameters *)buf; 2835 sopt.sopt_level = params->sop_level; 2836 sopt.sopt_name = params->sop_optname; 2837 sopt.sopt_dir = SOPT_SET; 2838 sopt.sopt_val = params->sop_optval; 2839 sopt.sopt_valsize = req->newlen - sizeof(struct sockopt_parameters); 2840 sopt.sopt_td = NULL; 2841 #ifdef INET6 2842 if (params->sop_inc.inc_flags & INC_ISIPV6) { 2843 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_laddr)) 2844 params->sop_inc.inc6_laddr.s6_addr16[1] = 2845 htons(params->sop_inc.inc6_zoneid & 0xffff); 2846 if (IN6_IS_SCOPE_LINKLOCAL(¶ms->sop_inc.inc6_faddr)) 2847 params->sop_inc.inc6_faddr.s6_addr16[1] = 2848 htons(params->sop_inc.inc6_zoneid & 0xffff); 2849 } 2850 #endif 2851 if (params->sop_inc.inc_lport != htons(0)) { 2852 if (params->sop_inc.inc_fport == htons(0)) 2853 inpi.hash = INP_PCBHASH_WILD(params->sop_inc.inc_lport, 2854 pcbinfo->ipi_hashmask); 2855 else 2856 #ifdef INET6 2857 if (params->sop_inc.inc_flags & INC_ISIPV6) 2858 inpi.hash = INP6_PCBHASH( 2859 ¶ms->sop_inc.inc6_faddr, 2860 params->sop_inc.inc_lport, 2861 params->sop_inc.inc_fport, 2862 pcbinfo->ipi_hashmask); 2863 else 2864 #endif 2865 inpi.hash = INP_PCBHASH( 2866 ¶ms->sop_inc.inc_faddr, 2867 params->sop_inc.inc_lport, 2868 params->sop_inc.inc_fport, 2869 pcbinfo->ipi_hashmask); 2870 } 2871 while ((inp = inp_next(&inpi)) != NULL) 2872 if (inp->inp_gencnt == params->sop_id) { 2873 if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { 2874 INP_WUNLOCK(inp); 2875 return (ECONNRESET); 2876 } 2877 so = inp->inp_socket; 2878 KASSERT(so != NULL, ("inp_socket == NULL")); 2879 soref(so); 2880 error = (*ctloutput_set)(inp, &sopt); 2881 sorele(so); 2882 break; 2883 } 2884 if (inp == NULL) 2885 error = ESRCH; 2886 return (error); 2887 } 2888 2889 #ifdef DDB 2890 static void 2891 db_print_indent(int indent) 2892 { 2893 int i; 2894 2895 for (i = 0; i < indent; i++) 2896 db_printf(" "); 2897 } 2898 2899 static void 2900 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2901 { 2902 char faddr_str[48], laddr_str[48]; 2903 2904 db_print_indent(indent); 2905 db_printf("%s at %p\n", name, inc); 2906 2907 indent += 2; 2908 2909 #ifdef INET6 2910 if (inc->inc_flags & INC_ISIPV6) { 2911 /* IPv6. */ 2912 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2913 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2914 } else 2915 #endif 2916 { 2917 /* IPv4. */ 2918 inet_ntoa_r(inc->inc_laddr, laddr_str); 2919 inet_ntoa_r(inc->inc_faddr, faddr_str); 2920 } 2921 db_print_indent(indent); 2922 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2923 ntohs(inc->inc_lport)); 2924 db_print_indent(indent); 2925 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2926 ntohs(inc->inc_fport)); 2927 } 2928 2929 static void 2930 db_print_inpflags(int inp_flags) 2931 { 2932 int comma; 2933 2934 comma = 0; 2935 if (inp_flags & INP_RECVOPTS) { 2936 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2937 comma = 1; 2938 } 2939 if (inp_flags & INP_RECVRETOPTS) { 2940 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2941 comma = 1; 2942 } 2943 if (inp_flags & INP_RECVDSTADDR) { 2944 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2945 comma = 1; 2946 } 2947 if (inp_flags & INP_ORIGDSTADDR) { 2948 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2949 comma = 1; 2950 } 2951 if (inp_flags & INP_HDRINCL) { 2952 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2953 comma = 1; 2954 } 2955 if (inp_flags & INP_HIGHPORT) { 2956 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2957 comma = 1; 2958 } 2959 if (inp_flags & INP_LOWPORT) { 2960 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 2961 comma = 1; 2962 } 2963 if (inp_flags & INP_ANONPORT) { 2964 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2965 comma = 1; 2966 } 2967 if (inp_flags & INP_RECVIF) { 2968 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2969 comma = 1; 2970 } 2971 if (inp_flags & INP_MTUDISC) { 2972 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2973 comma = 1; 2974 } 2975 if (inp_flags & INP_RECVTTL) { 2976 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2977 comma = 1; 2978 } 2979 if (inp_flags & INP_DONTFRAG) { 2980 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2981 comma = 1; 2982 } 2983 if (inp_flags & INP_RECVTOS) { 2984 db_printf("%sINP_RECVTOS", comma ? ", " : ""); 2985 comma = 1; 2986 } 2987 if (inp_flags & IN6P_IPV6_V6ONLY) { 2988 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 2989 comma = 1; 2990 } 2991 if (inp_flags & IN6P_PKTINFO) { 2992 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 2993 comma = 1; 2994 } 2995 if (inp_flags & IN6P_HOPLIMIT) { 2996 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 2997 comma = 1; 2998 } 2999 if (inp_flags & IN6P_HOPOPTS) { 3000 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3001 comma = 1; 3002 } 3003 if (inp_flags & IN6P_DSTOPTS) { 3004 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3005 comma = 1; 3006 } 3007 if (inp_flags & IN6P_RTHDR) { 3008 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3009 comma = 1; 3010 } 3011 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3012 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3013 comma = 1; 3014 } 3015 if (inp_flags & IN6P_TCLASS) { 3016 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3017 comma = 1; 3018 } 3019 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3020 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3021 comma = 1; 3022 } 3023 if (inp_flags & INP_TIMEWAIT) { 3024 db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); 3025 comma = 1; 3026 } 3027 if (inp_flags & INP_ONESBCAST) { 3028 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3029 comma = 1; 3030 } 3031 if (inp_flags & INP_DROPPED) { 3032 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3033 comma = 1; 3034 } 3035 if (inp_flags & INP_SOCKREF) { 3036 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3037 comma = 1; 3038 } 3039 if (inp_flags & IN6P_RFC2292) { 3040 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3041 comma = 1; 3042 } 3043 if (inp_flags & IN6P_MTU) { 3044 db_printf("IN6P_MTU%s", comma ? ", " : ""); 3045 comma = 1; 3046 } 3047 } 3048 3049 static void 3050 db_print_inpvflag(u_char inp_vflag) 3051 { 3052 int comma; 3053 3054 comma = 0; 3055 if (inp_vflag & INP_IPV4) { 3056 db_printf("%sINP_IPV4", comma ? ", " : ""); 3057 comma = 1; 3058 } 3059 if (inp_vflag & INP_IPV6) { 3060 db_printf("%sINP_IPV6", comma ? ", " : ""); 3061 comma = 1; 3062 } 3063 if (inp_vflag & INP_IPV6PROTO) { 3064 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3065 comma = 1; 3066 } 3067 } 3068 3069 static void 3070 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3071 { 3072 3073 db_print_indent(indent); 3074 db_printf("%s at %p\n", name, inp); 3075 3076 indent += 2; 3077 3078 db_print_indent(indent); 3079 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3080 3081 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3082 3083 db_print_indent(indent); 3084 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 3085 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 3086 3087 db_print_indent(indent); 3088 db_printf("inp_label: %p inp_flags: 0x%x (", 3089 inp->inp_label, inp->inp_flags); 3090 db_print_inpflags(inp->inp_flags); 3091 db_printf(")\n"); 3092 3093 db_print_indent(indent); 3094 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3095 inp->inp_vflag); 3096 db_print_inpvflag(inp->inp_vflag); 3097 db_printf(")\n"); 3098 3099 db_print_indent(indent); 3100 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3101 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3102 3103 db_print_indent(indent); 3104 #ifdef INET6 3105 if (inp->inp_vflag & INP_IPV6) { 3106 db_printf("in6p_options: %p in6p_outputopts: %p " 3107 "in6p_moptions: %p\n", inp->in6p_options, 3108 inp->in6p_outputopts, inp->in6p_moptions); 3109 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3110 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3111 inp->in6p_hops); 3112 } else 3113 #endif 3114 { 3115 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3116 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3117 inp->inp_options, inp->inp_moptions); 3118 } 3119 3120 db_print_indent(indent); 3121 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3122 (uintmax_t)inp->inp_gencnt); 3123 } 3124 3125 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3126 { 3127 struct inpcb *inp; 3128 3129 if (!have_addr) { 3130 db_printf("usage: show inpcb <addr>\n"); 3131 return; 3132 } 3133 inp = (struct inpcb *)addr; 3134 3135 db_print_inpcb(inp, "inpcb", 0); 3136 } 3137 #endif /* DDB */ 3138 3139 #ifdef RATELIMIT 3140 /* 3141 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3142 * if any. 3143 */ 3144 int 3145 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3146 { 3147 union if_snd_tag_modify_params params = { 3148 .rate_limit.max_rate = max_pacing_rate, 3149 .rate_limit.flags = M_NOWAIT, 3150 }; 3151 struct m_snd_tag *mst; 3152 int error; 3153 3154 mst = inp->inp_snd_tag; 3155 if (mst == NULL) 3156 return (EINVAL); 3157 3158 if (mst->sw->snd_tag_modify == NULL) { 3159 error = EOPNOTSUPP; 3160 } else { 3161 error = mst->sw->snd_tag_modify(mst, ¶ms); 3162 } 3163 return (error); 3164 } 3165 3166 /* 3167 * Query existing TX rate limit based on the existing 3168 * "inp->inp_snd_tag", if any. 3169 */ 3170 int 3171 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3172 { 3173 union if_snd_tag_query_params params = { }; 3174 struct m_snd_tag *mst; 3175 int error; 3176 3177 mst = inp->inp_snd_tag; 3178 if (mst == NULL) 3179 return (EINVAL); 3180 3181 if (mst->sw->snd_tag_query == NULL) { 3182 error = EOPNOTSUPP; 3183 } else { 3184 error = mst->sw->snd_tag_query(mst, ¶ms); 3185 if (error == 0 && p_max_pacing_rate != NULL) 3186 *p_max_pacing_rate = params.rate_limit.max_rate; 3187 } 3188 return (error); 3189 } 3190 3191 /* 3192 * Query existing TX queue level based on the existing 3193 * "inp->inp_snd_tag", if any. 3194 */ 3195 int 3196 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3197 { 3198 union if_snd_tag_query_params params = { }; 3199 struct m_snd_tag *mst; 3200 int error; 3201 3202 mst = inp->inp_snd_tag; 3203 if (mst == NULL) 3204 return (EINVAL); 3205 3206 if (mst->sw->snd_tag_query == NULL) 3207 return (EOPNOTSUPP); 3208 3209 error = mst->sw->snd_tag_query(mst, ¶ms); 3210 if (error == 0 && p_txqueue_level != NULL) 3211 *p_txqueue_level = params.rate_limit.queue_level; 3212 return (error); 3213 } 3214 3215 /* 3216 * Allocate a new TX rate limit send tag from the network interface 3217 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3218 */ 3219 int 3220 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3221 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3222 3223 { 3224 union if_snd_tag_alloc_params params = { 3225 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 3226 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3227 .rate_limit.hdr.flowid = flowid, 3228 .rate_limit.hdr.flowtype = flowtype, 3229 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3230 .rate_limit.max_rate = max_pacing_rate, 3231 .rate_limit.flags = M_NOWAIT, 3232 }; 3233 int error; 3234 3235 INP_WLOCK_ASSERT(inp); 3236 3237 /* 3238 * If there is already a send tag, or the INP is being torn 3239 * down, allocating a new send tag is not allowed. Else send 3240 * tags may leak. 3241 */ 3242 if (*st != NULL || (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0) 3243 return (EINVAL); 3244 3245 error = m_snd_tag_alloc(ifp, ¶ms, st); 3246 #ifdef INET 3247 if (error == 0) { 3248 counter_u64_add(rate_limit_set_ok, 1); 3249 counter_u64_add(rate_limit_active, 1); 3250 } else if (error != EOPNOTSUPP) 3251 counter_u64_add(rate_limit_alloc_fail, 1); 3252 #endif 3253 return (error); 3254 } 3255 3256 void 3257 in_pcbdetach_tag(struct m_snd_tag *mst) 3258 { 3259 3260 m_snd_tag_rele(mst); 3261 #ifdef INET 3262 counter_u64_add(rate_limit_active, -1); 3263 #endif 3264 } 3265 3266 /* 3267 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3268 * if any: 3269 */ 3270 void 3271 in_pcbdetach_txrtlmt(struct inpcb *inp) 3272 { 3273 struct m_snd_tag *mst; 3274 3275 INP_WLOCK_ASSERT(inp); 3276 3277 mst = inp->inp_snd_tag; 3278 inp->inp_snd_tag = NULL; 3279 3280 if (mst == NULL) 3281 return; 3282 3283 m_snd_tag_rele(mst); 3284 #ifdef INET 3285 counter_u64_add(rate_limit_active, -1); 3286 #endif 3287 } 3288 3289 int 3290 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3291 { 3292 int error; 3293 3294 /* 3295 * If the existing send tag is for the wrong interface due to 3296 * a route change, first drop the existing tag. Set the 3297 * CHANGED flag so that we will keep trying to allocate a new 3298 * tag if we fail to allocate one this time. 3299 */ 3300 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3301 in_pcbdetach_txrtlmt(inp); 3302 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3303 } 3304 3305 /* 3306 * NOTE: When attaching to a network interface a reference is 3307 * made to ensure the network interface doesn't go away until 3308 * all ratelimit connections are gone. The network interface 3309 * pointers compared below represent valid network interfaces, 3310 * except when comparing towards NULL. 3311 */ 3312 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3313 error = 0; 3314 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3315 if (inp->inp_snd_tag != NULL) 3316 in_pcbdetach_txrtlmt(inp); 3317 error = 0; 3318 } else if (inp->inp_snd_tag == NULL) { 3319 /* 3320 * In order to utilize packet pacing with RSS, we need 3321 * to wait until there is a valid RSS hash before we 3322 * can proceed: 3323 */ 3324 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3325 error = EAGAIN; 3326 } else { 3327 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3328 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3329 } 3330 } else { 3331 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3332 } 3333 if (error == 0 || error == EOPNOTSUPP) 3334 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3335 3336 return (error); 3337 } 3338 3339 /* 3340 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3341 * is set in the fast path and will attach/detach/modify the TX rate 3342 * limit send tag based on the socket's so_max_pacing_rate value. 3343 */ 3344 void 3345 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3346 { 3347 struct socket *socket; 3348 uint32_t max_pacing_rate; 3349 bool did_upgrade; 3350 3351 if (inp == NULL) 3352 return; 3353 3354 socket = inp->inp_socket; 3355 if (socket == NULL) 3356 return; 3357 3358 if (!INP_WLOCKED(inp)) { 3359 /* 3360 * NOTE: If the write locking fails, we need to bail 3361 * out and use the non-ratelimited ring for the 3362 * transmit until there is a new chance to get the 3363 * write lock. 3364 */ 3365 if (!INP_TRY_UPGRADE(inp)) 3366 return; 3367 did_upgrade = 1; 3368 } else { 3369 did_upgrade = 0; 3370 } 3371 3372 /* 3373 * NOTE: The so_max_pacing_rate value is read unlocked, 3374 * because atomic updates are not required since the variable 3375 * is checked at every mbuf we send. It is assumed that the 3376 * variable read itself will be atomic. 3377 */ 3378 max_pacing_rate = socket->so_max_pacing_rate; 3379 3380 in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3381 3382 if (did_upgrade) 3383 INP_DOWNGRADE(inp); 3384 } 3385 3386 /* 3387 * Track route changes for TX rate limiting. 3388 */ 3389 void 3390 in_pcboutput_eagain(struct inpcb *inp) 3391 { 3392 bool did_upgrade; 3393 3394 if (inp == NULL) 3395 return; 3396 3397 if (inp->inp_snd_tag == NULL) 3398 return; 3399 3400 if (!INP_WLOCKED(inp)) { 3401 /* 3402 * NOTE: If the write locking fails, we need to bail 3403 * out and use the non-ratelimited ring for the 3404 * transmit until there is a new chance to get the 3405 * write lock. 3406 */ 3407 if (!INP_TRY_UPGRADE(inp)) 3408 return; 3409 did_upgrade = 1; 3410 } else { 3411 did_upgrade = 0; 3412 } 3413 3414 /* detach rate limiting */ 3415 in_pcbdetach_txrtlmt(inp); 3416 3417 /* make sure new mbuf send tag allocation is made */ 3418 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3419 3420 if (did_upgrade) 3421 INP_DOWNGRADE(inp); 3422 } 3423 3424 #ifdef INET 3425 static void 3426 rl_init(void *st) 3427 { 3428 rate_limit_new = counter_u64_alloc(M_WAITOK); 3429 rate_limit_chg = counter_u64_alloc(M_WAITOK); 3430 rate_limit_active = counter_u64_alloc(M_WAITOK); 3431 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3432 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3433 } 3434 3435 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3436 #endif 3437 #endif /* RATELIMIT */ 3438