1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1991, 1993, 1995 5 * The Regents of the University of California. 6 * Copyright (c) 2007-2009 Robert N. M. Watson 7 * Copyright (c) 2010-2011 Juniper Networks, Inc. 8 * All rights reserved. 9 * 10 * Portions of this software were developed by Robert N. M. Watson under 11 * contract to Juniper Networks, Inc. 12 * 13 * Redistribution and use in source and binary forms, with or without 14 * modification, are permitted provided that the following conditions 15 * are met: 16 * 1. Redistributions of source code must retain the above copyright 17 * notice, this list of conditions and the following disclaimer. 18 * 2. Redistributions in binary form must reproduce the above copyright 19 * notice, this list of conditions and the following disclaimer in the 20 * documentation and/or other materials provided with the distribution. 21 * 3. Neither the name of the University nor the names of its contributors 22 * may be used to endorse or promote products derived from this software 23 * without specific prior written permission. 24 * 25 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 26 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 27 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 28 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 29 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 30 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 31 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 32 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 33 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 34 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 35 * SUCH DAMAGE. 
36 * 37 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 38 */ 39 40 #include <sys/cdefs.h> 41 __FBSDID("$FreeBSD$"); 42 43 #include "opt_ddb.h" 44 #include "opt_ipsec.h" 45 #include "opt_inet.h" 46 #include "opt_inet6.h" 47 #include "opt_ratelimit.h" 48 #include "opt_pcbgroup.h" 49 #include "opt_rss.h" 50 51 #include <sys/param.h> 52 #include <sys/systm.h> 53 #include <sys/lock.h> 54 #include <sys/malloc.h> 55 #include <sys/mbuf.h> 56 #include <sys/callout.h> 57 #include <sys/eventhandler.h> 58 #include <sys/domain.h> 59 #include <sys/protosw.h> 60 #include <sys/rmlock.h> 61 #include <sys/smp.h> 62 #include <sys/socket.h> 63 #include <sys/socketvar.h> 64 #include <sys/sockio.h> 65 #include <sys/priv.h> 66 #include <sys/proc.h> 67 #include <sys/refcount.h> 68 #include <sys/jail.h> 69 #include <sys/kernel.h> 70 #include <sys/sysctl.h> 71 72 #ifdef DDB 73 #include <ddb/ddb.h> 74 #endif 75 76 #include <vm/uma.h> 77 78 #include <net/if.h> 79 #include <net/if_var.h> 80 #include <net/if_types.h> 81 #include <net/if_llatbl.h> 82 #include <net/route.h> 83 #include <net/rss_config.h> 84 #include <net/vnet.h> 85 86 #if defined(INET) || defined(INET6) 87 #include <netinet/in.h> 88 #include <netinet/in_pcb.h> 89 #ifdef INET 90 #include <netinet/in_var.h> 91 #include <netinet/in_fib.h> 92 #endif 93 #include <netinet/ip_var.h> 94 #include <netinet/tcp_var.h> 95 #ifdef TCPHPTS 96 #include <netinet/tcp_hpts.h> 97 #endif 98 #include <netinet/udp.h> 99 #include <netinet/udp_var.h> 100 #ifdef INET6 101 #include <netinet/ip6.h> 102 #include <netinet6/in6_pcb.h> 103 #include <netinet6/in6_var.h> 104 #include <netinet6/ip6_var.h> 105 #endif /* INET6 */ 106 #include <net/route/nhop.h> 107 #endif 108 109 #include <netipsec/ipsec_support.h> 110 111 #include <security/mac/mac_framework.h> 112 113 #define INPCBLBGROUP_SIZMIN 8 114 #define INPCBLBGROUP_SIZMAX 256 115 116 static struct callout ipport_tick_callout; 117 118 /* 119 * These configure the range of local port addresses assigned to 120 * 
"unspecified" outgoing connections/packets/whatever. 121 */ 122 VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ 123 VNET_DEFINE(int, ipport_lowlastauto) = IPPORT_RESERVEDSTART; /* 600 */ 124 VNET_DEFINE(int, ipport_firstauto) = IPPORT_EPHEMERALFIRST; /* 10000 */ 125 VNET_DEFINE(int, ipport_lastauto) = IPPORT_EPHEMERALLAST; /* 65535 */ 126 VNET_DEFINE(int, ipport_hifirstauto) = IPPORT_HIFIRSTAUTO; /* 49152 */ 127 VNET_DEFINE(int, ipport_hilastauto) = IPPORT_HILASTAUTO; /* 65535 */ 128 129 /* 130 * Reserved ports accessible only to root. There are significant 131 * security considerations that must be accounted for when changing these, 132 * but the security benefits can be great. Please be careful. 133 */ 134 VNET_DEFINE(int, ipport_reservedhigh) = IPPORT_RESERVED - 1; /* 1023 */ 135 VNET_DEFINE(int, ipport_reservedlow); 136 137 /* Variables dealing with random ephemeral port allocation. */ 138 VNET_DEFINE(int, ipport_randomized) = 1; /* user controlled via sysctl */ 139 VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ 140 VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ 141 VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ 142 VNET_DEFINE(int, ipport_tcpallocs); 143 VNET_DEFINE_STATIC(int, ipport_tcplastcount); 144 145 #define V_ipport_tcplastcount VNET(ipport_tcplastcount) 146 147 static void in_pcbremlists(struct inpcb *inp); 148 #ifdef INET 149 static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, 150 struct in_addr faddr, u_int fport_arg, 151 struct in_addr laddr, u_int lport_arg, 152 int lookupflags, struct ifnet *ifp); 153 154 #define RANGECHK(var, min, max) \ 155 if ((var) < (min)) { (var) = (min); } \ 156 else if ((var) > (max)) { (var) = (max); } 157 158 static int 159 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 160 { 161 int error; 162 163 error = sysctl_handle_int(oidp, arg1, arg2, req); 164 if (error == 0) { 165 RANGECHK(V_ipport_lowfirstauto, 
1, IPPORT_RESERVED - 1); 166 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 167 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 168 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 169 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 170 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 171 } 172 return (error); 173 } 174 175 #undef RANGECHK 176 177 static SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, 178 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 179 "IP Ports"); 180 181 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowfirst, 182 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 183 &VNET_NAME(ipport_lowfirstauto), 0, &sysctl_net_ipport_check, "I", 184 ""); 185 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, lowlast, 186 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 187 &VNET_NAME(ipport_lowlastauto), 0, &sysctl_net_ipport_check, "I", 188 ""); 189 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, first, 190 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 191 &VNET_NAME(ipport_firstauto), 0, &sysctl_net_ipport_check, "I", 192 ""); 193 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, last, 194 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 195 &VNET_NAME(ipport_lastauto), 0, &sysctl_net_ipport_check, "I", 196 ""); 197 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hifirst, 198 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 199 &VNET_NAME(ipport_hifirstauto), 0, &sysctl_net_ipport_check, "I", 200 ""); 201 SYSCTL_PROC(_net_inet_ip_portrange, OID_AUTO, hilast, 202 CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, 203 &VNET_NAME(ipport_hilastauto), 0, &sysctl_net_ipport_check, "I", 204 ""); 205 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedhigh, 206 CTLFLAG_VNET | CTLFLAG_RW | CTLFLAG_SECURE, 207 &VNET_NAME(ipport_reservedhigh), 0, ""); 208 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, reservedlow, 209 CTLFLAG_RW|CTLFLAG_SECURE, 
&VNET_NAME(ipport_reservedlow), 0, ""); 210 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomized, 211 CTLFLAG_VNET | CTLFLAG_RW, 212 &VNET_NAME(ipport_randomized), 0, "Enable random port allocation"); 213 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomcps, 214 CTLFLAG_VNET | CTLFLAG_RW, 215 &VNET_NAME(ipport_randomcps), 0, "Maximum number of random port " 216 "allocations before switching to a sequental one"); 217 SYSCTL_INT(_net_inet_ip_portrange, OID_AUTO, randomtime, 218 CTLFLAG_VNET | CTLFLAG_RW, 219 &VNET_NAME(ipport_randomtime), 0, 220 "Minimum time to keep sequental port " 221 "allocation before switching to a random one"); 222 223 #ifdef RATELIMIT 224 counter_u64_t rate_limit_active; 225 counter_u64_t rate_limit_alloc_fail; 226 counter_u64_t rate_limit_set_ok; 227 228 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, rl, CTLFLAG_RD | CTLFLAG_MPSAFE, 0, 229 "IP Rate Limiting"); 230 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, active, CTLFLAG_RD, 231 &rate_limit_active, "Active rate limited connections"); 232 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, alloc_fail, CTLFLAG_RD, 233 &rate_limit_alloc_fail, "Rate limited connection failures"); 234 SYSCTL_COUNTER_U64(_net_inet_ip_rl, OID_AUTO, set_ok, CTLFLAG_RD, 235 &rate_limit_set_ok, "Rate limited setting succeeded"); 236 #endif /* RATELIMIT */ 237 238 #endif /* INET */ 239 240 /* 241 * in_pcb.c: manage the Protocol Control Blocks. 242 * 243 * NOTE: It is assumed that most of these functions will be called with 244 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 245 * functions often modify hash chains or addresses in pcbs. 
246 */ 247 248 static struct inpcblbgroup * 249 in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, 250 uint16_t port, const union in_dependaddr *addr, int size) 251 { 252 struct inpcblbgroup *grp; 253 size_t bytes; 254 255 bytes = __offsetof(struct inpcblbgroup, il_inp[size]); 256 grp = malloc(bytes, M_PCB, M_ZERO | M_NOWAIT); 257 if (!grp) 258 return (NULL); 259 grp->il_vflag = vflag; 260 grp->il_lport = port; 261 grp->il_dependladdr = *addr; 262 grp->il_inpsiz = size; 263 CK_LIST_INSERT_HEAD(hdr, grp, il_list); 264 return (grp); 265 } 266 267 static void 268 in_pcblbgroup_free_deferred(epoch_context_t ctx) 269 { 270 struct inpcblbgroup *grp; 271 272 grp = __containerof(ctx, struct inpcblbgroup, il_epoch_ctx); 273 free(grp, M_PCB); 274 } 275 276 static void 277 in_pcblbgroup_free(struct inpcblbgroup *grp) 278 { 279 280 CK_LIST_REMOVE(grp, il_list); 281 NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); 282 } 283 284 static struct inpcblbgroup * 285 in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, 286 struct inpcblbgroup *old_grp, int size) 287 { 288 struct inpcblbgroup *grp; 289 int i; 290 291 grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, 292 old_grp->il_lport, &old_grp->il_dependladdr, size); 293 if (grp == NULL) 294 return (NULL); 295 296 KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, 297 ("invalid new local group size %d and old local group count %d", 298 grp->il_inpsiz, old_grp->il_inpcnt)); 299 300 for (i = 0; i < old_grp->il_inpcnt; ++i) 301 grp->il_inp[i] = old_grp->il_inp[i]; 302 grp->il_inpcnt = old_grp->il_inpcnt; 303 in_pcblbgroup_free(old_grp); 304 return (grp); 305 } 306 307 /* 308 * PCB at index 'i' is removed from the group. Pull up the ones below il_inp[i] 309 * and shrink group if possible. 
310 */ 311 static void 312 in_pcblbgroup_reorder(struct inpcblbgrouphead *hdr, struct inpcblbgroup **grpp, 313 int i) 314 { 315 struct inpcblbgroup *grp, *new_grp; 316 317 grp = *grpp; 318 for (; i + 1 < grp->il_inpcnt; ++i) 319 grp->il_inp[i] = grp->il_inp[i + 1]; 320 grp->il_inpcnt--; 321 322 if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && 323 grp->il_inpcnt <= grp->il_inpsiz / 4) { 324 /* Shrink this group. */ 325 new_grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz / 2); 326 if (new_grp != NULL) 327 *grpp = new_grp; 328 } 329 } 330 331 /* 332 * Add PCB to load balance group for SO_REUSEPORT_LB option. 333 */ 334 static int 335 in_pcbinslbgrouphash(struct inpcb *inp) 336 { 337 const static struct timeval interval = { 60, 0 }; 338 static struct timeval lastprint; 339 struct inpcbinfo *pcbinfo; 340 struct inpcblbgrouphead *hdr; 341 struct inpcblbgroup *grp; 342 uint32_t idx; 343 344 pcbinfo = inp->inp_pcbinfo; 345 346 INP_WLOCK_ASSERT(inp); 347 INP_HASH_WLOCK_ASSERT(pcbinfo); 348 349 /* 350 * Don't allow jailed socket to join local group. 351 */ 352 if (inp->inp_socket != NULL && jailed(inp->inp_socket->so_cred)) 353 return (0); 354 355 #ifdef INET6 356 /* 357 * Don't allow IPv4 mapped INET6 wild socket. 358 */ 359 if ((inp->inp_vflag & INP_IPV4) && 360 inp->inp_laddr.s_addr == INADDR_ANY && 361 INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) { 362 return (0); 363 } 364 #endif 365 366 idx = INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask); 367 hdr = &pcbinfo->ipi_lbgrouphashbase[idx]; 368 CK_LIST_FOREACH(grp, hdr, il_list) { 369 if (grp->il_vflag == inp->inp_vflag && 370 grp->il_lport == inp->inp_lport && 371 memcmp(&grp->il_dependladdr, 372 &inp->inp_inc.inc_ie.ie_dependladdr, 373 sizeof(grp->il_dependladdr)) == 0) 374 break; 375 } 376 if (grp == NULL) { 377 /* Create new load balance group. 
*/ 378 grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, 379 inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, 380 INPCBLBGROUP_SIZMIN); 381 if (grp == NULL) 382 return (ENOBUFS); 383 } else if (grp->il_inpcnt == grp->il_inpsiz) { 384 if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { 385 if (ratecheck(&lastprint, &interval)) 386 printf("lb group port %d, limit reached\n", 387 ntohs(grp->il_lport)); 388 return (0); 389 } 390 391 /* Expand this local group. */ 392 grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); 393 if (grp == NULL) 394 return (ENOBUFS); 395 } 396 397 KASSERT(grp->il_inpcnt < grp->il_inpsiz, 398 ("invalid local group size %d and count %d", grp->il_inpsiz, 399 grp->il_inpcnt)); 400 401 grp->il_inp[grp->il_inpcnt] = inp; 402 grp->il_inpcnt++; 403 return (0); 404 } 405 406 /* 407 * Remove PCB from load balance group. 408 */ 409 static void 410 in_pcbremlbgrouphash(struct inpcb *inp) 411 { 412 struct inpcbinfo *pcbinfo; 413 struct inpcblbgrouphead *hdr; 414 struct inpcblbgroup *grp; 415 int i; 416 417 pcbinfo = inp->inp_pcbinfo; 418 419 INP_WLOCK_ASSERT(inp); 420 INP_HASH_WLOCK_ASSERT(pcbinfo); 421 422 hdr = &pcbinfo->ipi_lbgrouphashbase[ 423 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; 424 CK_LIST_FOREACH(grp, hdr, il_list) { 425 for (i = 0; i < grp->il_inpcnt; ++i) { 426 if (grp->il_inp[i] != inp) 427 continue; 428 429 if (grp->il_inpcnt == 1) { 430 /* We are the last, free this local group. */ 431 in_pcblbgroup_free(grp); 432 } else { 433 /* Pull up inpcbs, shrink group if possible. */ 434 in_pcblbgroup_reorder(hdr, &grp, i); 435 } 436 return; 437 } 438 } 439 } 440 441 /* 442 * Different protocols initialize their inpcbs differently - giving 443 * different name to the lock. But they all are disposed the same. 
444 */ 445 static void 446 inpcb_fini(void *mem, int size) 447 { 448 struct inpcb *inp = mem; 449 450 INP_LOCK_DESTROY(inp); 451 } 452 453 /* 454 * Initialize an inpcbinfo -- we should be able to reduce the number of 455 * arguments in time. 456 */ 457 void 458 in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, 459 struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, 460 char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) 461 { 462 463 porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); 464 465 INP_INFO_LOCK_INIT(pcbinfo, name); 466 INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ 467 INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); 468 #ifdef VIMAGE 469 pcbinfo->ipi_vnet = curvnet; 470 #endif 471 pcbinfo->ipi_listhead = listhead; 472 CK_LIST_INIT(pcbinfo->ipi_listhead); 473 pcbinfo->ipi_count = 0; 474 pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, 475 &pcbinfo->ipi_hashmask); 476 pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, 477 &pcbinfo->ipi_porthashmask); 478 pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, 479 &pcbinfo->ipi_lbgrouphashmask); 480 #ifdef PCBGROUP 481 in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); 482 #endif 483 pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), 484 NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); 485 uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); 486 uma_zone_set_warning(pcbinfo->ipi_zone, 487 "kern.ipc.maxsockets limit reached"); 488 } 489 490 /* 491 * Destroy an inpcbinfo. 
492 */ 493 void 494 in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) 495 { 496 497 KASSERT(pcbinfo->ipi_count == 0, 498 ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); 499 500 hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); 501 hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, 502 pcbinfo->ipi_porthashmask); 503 hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, 504 pcbinfo->ipi_lbgrouphashmask); 505 #ifdef PCBGROUP 506 in_pcbgroup_destroy(pcbinfo); 507 #endif 508 uma_zdestroy(pcbinfo->ipi_zone); 509 INP_LIST_LOCK_DESTROY(pcbinfo); 510 INP_HASH_LOCK_DESTROY(pcbinfo); 511 INP_INFO_LOCK_DESTROY(pcbinfo); 512 } 513 514 /* 515 * Allocate a PCB and associate it with the socket. 516 * On success return with the PCB locked. 517 */ 518 int 519 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 520 { 521 struct inpcb *inp; 522 int error; 523 524 error = 0; 525 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); 526 if (inp == NULL) 527 return (ENOBUFS); 528 bzero(&inp->inp_start_zero, inp_zero_size); 529 #ifdef NUMA 530 inp->inp_numa_domain = M_NODOM; 531 #endif 532 inp->inp_pcbinfo = pcbinfo; 533 inp->inp_socket = so; 534 inp->inp_cred = crhold(so->so_cred); 535 inp->inp_inc.inc_fibnum = so->so_fibnum; 536 #ifdef MAC 537 error = mac_inpcb_init(inp, M_NOWAIT); 538 if (error != 0) 539 goto out; 540 mac_inpcb_create(so, inp); 541 #endif 542 #if defined(IPSEC) || defined(IPSEC_SUPPORT) 543 error = ipsec_init_pcbpolicy(inp); 544 if (error != 0) { 545 #ifdef MAC 546 mac_inpcb_destroy(inp); 547 #endif 548 goto out; 549 } 550 #endif /*IPSEC*/ 551 #ifdef INET6 552 if (INP_SOCKAF(so) == AF_INET6) { 553 inp->inp_vflag |= INP_IPV6PROTO; 554 if (V_ip6_v6only) 555 inp->inp_flags |= IN6P_IPV6_V6ONLY; 556 } 557 #endif 558 INP_WLOCK(inp); 559 INP_LIST_WLOCK(pcbinfo); 560 CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); 561 pcbinfo->ipi_count++; 562 so->so_pcb = (caddr_t)inp; 563 #ifdef INET6 564 if (V_ip6_auto_flowlabel) 565 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 
566 #endif 567 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 568 refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ 569 570 /* 571 * Routes in inpcb's can cache L2 as well; they are guaranteed 572 * to be cleaned up. 573 */ 574 inp->inp_route.ro_flags = RT_LLE_CACHE; 575 INP_LIST_WUNLOCK(pcbinfo); 576 #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) 577 out: 578 if (error != 0) { 579 crfree(inp->inp_cred); 580 uma_zfree(pcbinfo->ipi_zone, inp); 581 } 582 #endif 583 return (error); 584 } 585 586 #ifdef INET 587 int 588 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 589 { 590 int anonport, error; 591 592 INP_WLOCK_ASSERT(inp); 593 INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); 594 595 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 596 return (EINVAL); 597 anonport = nam == NULL || ((struct sockaddr_in *)nam)->sin_port == 0; 598 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 599 &inp->inp_lport, cred); 600 if (error) 601 return (error); 602 if (in_pcbinshash(inp) != 0) { 603 inp->inp_laddr.s_addr = INADDR_ANY; 604 inp->inp_lport = 0; 605 return (EAGAIN); 606 } 607 if (anonport) 608 inp->inp_flags |= INP_ANONPORT; 609 return (0); 610 } 611 #endif 612 613 /* 614 * Select a local port (number) to use. 615 */ 616 #if defined(INET) || defined(INET6) 617 int 618 in_pcb_lport(struct inpcb *inp, struct in_addr *laddrp, u_short *lportp, 619 struct ucred *cred, int lookupflags) 620 { 621 struct inpcbinfo *pcbinfo; 622 struct inpcb *tmpinp; 623 unsigned short *lastport; 624 int count, dorandom, error; 625 u_short aux, first, last, lport; 626 #ifdef INET 627 struct in_addr laddr; 628 #endif 629 630 pcbinfo = inp->inp_pcbinfo; 631 632 /* 633 * Because no actual state changes occur here, a global write lock on 634 * the pcbinfo isn't required. 
635 */ 636 INP_LOCK_ASSERT(inp); 637 INP_HASH_LOCK_ASSERT(pcbinfo); 638 639 if (inp->inp_flags & INP_HIGHPORT) { 640 first = V_ipport_hifirstauto; /* sysctl */ 641 last = V_ipport_hilastauto; 642 lastport = &pcbinfo->ipi_lasthi; 643 } else if (inp->inp_flags & INP_LOWPORT) { 644 error = priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT); 645 if (error) 646 return (error); 647 first = V_ipport_lowfirstauto; /* 1023 */ 648 last = V_ipport_lowlastauto; /* 600 */ 649 lastport = &pcbinfo->ipi_lastlow; 650 } else { 651 first = V_ipport_firstauto; /* sysctl */ 652 last = V_ipport_lastauto; 653 lastport = &pcbinfo->ipi_lastport; 654 } 655 /* 656 * For UDP(-Lite), use random port allocation as long as the user 657 * allows it. For TCP (and as of yet unknown) connections, 658 * use random port allocation only if the user allows it AND 659 * ipport_tick() allows it. 660 */ 661 if (V_ipport_randomized && 662 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo || 663 pcbinfo == &V_ulitecbinfo)) 664 dorandom = 1; 665 else 666 dorandom = 0; 667 /* 668 * It makes no sense to do random port allocation if 669 * we have the only port available. 670 */ 671 if (first == last) 672 dorandom = 0; 673 /* Make sure to not include UDP(-Lite) packets in the count. */ 674 if (pcbinfo != &V_udbinfo || pcbinfo != &V_ulitecbinfo) 675 V_ipport_tcpallocs++; 676 /* 677 * Instead of having two loops further down counting up or down 678 * make sure that first is always <= last and go with only one 679 * code path implementing all logic. 680 */ 681 if (first > last) { 682 aux = first; 683 first = last; 684 last = aux; 685 } 686 687 #ifdef INET 688 /* Make the compiler happy. */ 689 laddr.s_addr = 0; 690 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) { 691 KASSERT(laddrp != NULL, ("%s: laddrp NULL for v4 inp %p", 692 __func__, inp)); 693 laddr = *laddrp; 694 } 695 #endif 696 tmpinp = NULL; /* Make compiler happy. 
*/ 697 lport = *lportp; 698 699 if (dorandom) 700 *lastport = first + (arc4random() % (last - first)); 701 702 count = last - first; 703 704 do { 705 if (count-- < 0) /* completely used? */ 706 return (EADDRNOTAVAIL); 707 ++*lastport; 708 if (*lastport < first || *lastport > last) 709 *lastport = first; 710 lport = htons(*lastport); 711 712 #ifdef INET6 713 if ((inp->inp_vflag & INP_IPV6) != 0) 714 tmpinp = in6_pcblookup_local(pcbinfo, 715 &inp->in6p_laddr, lport, lookupflags, cred); 716 #endif 717 #if defined(INET) && defined(INET6) 718 else 719 #endif 720 #ifdef INET 721 tmpinp = in_pcblookup_local(pcbinfo, laddr, 722 lport, lookupflags, cred); 723 #endif 724 } while (tmpinp != NULL); 725 726 #ifdef INET 727 if ((inp->inp_vflag & (INP_IPV4|INP_IPV6)) == INP_IPV4) 728 laddrp->s_addr = laddr.s_addr; 729 #endif 730 *lportp = lport; 731 732 return (0); 733 } 734 735 /* 736 * Return cached socket options. 737 */ 738 int 739 inp_so_options(const struct inpcb *inp) 740 { 741 int so_options; 742 743 so_options = 0; 744 745 if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) 746 so_options |= SO_REUSEPORT_LB; 747 if ((inp->inp_flags2 & INP_REUSEPORT) != 0) 748 so_options |= SO_REUSEPORT; 749 if ((inp->inp_flags2 & INP_REUSEADDR) != 0) 750 so_options |= SO_REUSEADDR; 751 return (so_options); 752 } 753 #endif /* INET || INET6 */ 754 755 /* 756 * Check if a new BINDMULTI socket is allowed to be created. 757 * 758 * ni points to the new inp. 759 * oi points to the exisitng inp. 760 * 761 * This checks whether the existing inp also has BINDMULTI and 762 * whether the credentials match. 
 *
 * Returns 1 if the bind may proceed and 0 if it must be refused.  When
 * ni does not have INP_BINDMULTI set, the result is always 1.
 */
int
in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi)
{
	/* Check permissions match */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    (ni->inp_cred->cr_uid !=
	    oi->inp_cred->cr_uid))
		return (0);

	/* Check the existing inp has BINDMULTI set */
	if ((ni->inp_flags2 & INP_BINDMULTI) &&
	    ((oi->inp_flags2 & INP_BINDMULTI) == 0))
		return (0);

	/*
	 * We're okay - either INP_BINDMULTI isn't set on ni, or
	 * it is and it matches the checks.
	 */
	return (1);
}

#ifdef INET
/*
 * Set up a bind operation on a PCB, performing port allocation
 * as required, but do not actually modify the PCB. Callers can
 * either complete the bind by setting inp_laddr/inp_lport and
 * calling in_pcbinshash(), or they can just use the resulting
 * port and address to authorise the sending of a once-off packet.
 *
 * On error, the values of *laddrp and *lportp are not changed.
 *
 * *laddrp and *lportp are also inputs: they carry the address and port
 * the caller currently has.  A non-NULL 'nam' is rejected with EINVAL if
 * *laddrp is already set or if it asks for a port different from a
 * non-zero *lportp.
 *
 * Note that the sockaddr behind 'nam' can be modified: for a specific,
 * non-multicast address its sin_port and sin_zero fields are cleared
 * before the local address check.
 *
 * Errors returned here: EADDRNOTAVAIL, EINVAL, EACCES, EADDRINUSE, or
 * whatever prison_local_ip4() / in_pcb_lport() report.
 */
int
in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp,
    u_short *lportp, struct ucred *cred)
{
	struct socket *so = inp->inp_socket;
	struct sockaddr_in *sin;
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;
	struct in_addr laddr;
	u_short lport = 0;
	int lookupflags = 0, reuseport = (so->so_options & SO_REUSEPORT);
	int error;

	/*
	 * XXX: Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here
	 * so that we don't have to add to the (already messy) code below.
	 */
	int reuseport_lb = (so->so_options & SO_REUSEPORT_LB);

	/*
	 * No state changes, so read locks are sufficient here.
	 */
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(pcbinfo);

	if (CK_STAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	if (nam != NULL && laddr.s_addr != INADDR_ANY)
		return (EINVAL);
	/* Without any reuse option, wildcard matches count as conflicts. */
	if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0)
		lookupflags = INPLOOKUP_WILDCARD;
	if (nam == NULL) {
		if ((error = prison_local_ip4(cred, &laddr)) != 0)
			return (error);
	} else {
		sin = (struct sockaddr_in *)nam;
		if (nam->sa_len != sizeof (*sin))
			return (EINVAL);
#ifdef notdef
		/*
		 * We should check the family, but old programs
		 * incorrectly fail to initialize it.
		 */
		if (sin->sin_family != AF_INET)
			return (EAFNOSUPPORT);
#endif
		error = prison_local_ip4(cred, &sin->sin_addr);
		if (error)
			return (error);
		if (sin->sin_port != *lportp) {
			/* Don't allow the port to change. */
			if (*lportp != 0)
				return (EINVAL);
			lport = sin->sin_port;
		}
		/* NB: lport is left as 0 if the port isn't being changed. */
		if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) {
			/*
			 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast;
			 * allow complete duplication of binding if
			 * SO_REUSEPORT is set, or if SO_REUSEADDR is set
			 * and a multicast address is bound on both
			 * new and duplicated sockets.
			 */
			if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0)
				reuseport = SO_REUSEADDR|SO_REUSEPORT;
			/*
			 * XXX: How to deal with SO_REUSEPORT_LB here?
			 * Treat same as SO_REUSEPORT for now.
			 */
			if ((so->so_options &
			    (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0)
				reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB;
		} else if (sin->sin_addr.s_addr != INADDR_ANY) {
			sin->sin_port = 0;		/* yech... */
			bzero(&sin->sin_zero, sizeof(sin->sin_zero));
			/*
			 * Is the address a local IP address?
			 * If INP_BINDANY is set, then the socket may be bound
			 * to any endpoint address, local or not.
			 */
			if ((inp->inp_flags & INP_BINDANY) == 0 &&
			    ifa_ifwithaddr_check((struct sockaddr *)sin) == 0)
				return (EADDRNOTAVAIL);
		}
		laddr = sin->sin_addr;
		/* An explicit port was requested: look for conflicts. */
		if (lport) {
			struct inpcb *t;
			struct tcptw *tw;

			/* GROSS */
			if (ntohs(lport) <= V_ipport_reservedhigh &&
			    ntohs(lport) >= V_ipport_reservedlow &&
			    priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT))
				return (EACCES);
			/*
			 * First pass, skipped for multicast addresses and for
			 * holders of PRIV_NETINET_REUSEPORT: a wildcard lookup
			 * whose result is compared by uid below.
			 */
			if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) &&
			    priv_check_cred(inp->inp_cred, PRIV_NETINET_REUSEPORT) != 0) {
				t = in_pcblookup_local(pcbinfo, sin->sin_addr,
				    lport, INPLOOKUP_WILDCARD, cred);
	/*
	 * XXX
	 * This entire block sorely needs a rewrite.
	 */
				if (t &&
				    ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				    ((t->inp_flags & INP_TIMEWAIT) == 0) &&
				    (so->so_type != SOCK_STREAM ||
				     ntohl(t->inp_faddr.s_addr) == INADDR_ANY) &&
				    (ntohl(sin->sin_addr.s_addr) != INADDR_ANY ||
				     ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||
				     (t->inp_flags2 & INP_REUSEPORT) ||
				     (t->inp_flags2 & INP_REUSEPORT_LB) == 0) &&
				    (inp->inp_cred->cr_uid !=
				     t->inp_cred->cr_uid))
					return (EADDRINUSE);

				/*
				 * If the socket is a BINDMULTI socket, then
				 * the credentials need to match and the
				 * original socket also has to have been bound
				 * with BINDMULTI.
				 */
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
			/*
			 * Second pass, for everyone: look up with the flags
			 * derived from the socket's reuse options.
			 */
			t = in_pcblookup_local(pcbinfo, sin->sin_addr,
			    lport, lookupflags, cred);
			if (t && (t->inp_flags & INP_TIMEWAIT)) {
				/*
				 * XXXRW: If an incpb has had its timewait
				 * state recycled, we treat the address as
				 * being in use (for now).  This is better
				 * than a panic, but not desirable.
				 */
				tw = intotw(t);
				if (tw == NULL ||
				    ((reuseport & tw->tw_so_options) == 0 &&
					(reuseport_lb &
				            tw->tw_so_options) == 0)) {
					return (EADDRINUSE);
				}
			} else if (t &&
				   ((inp->inp_flags2 & INP_BINDMULTI) == 0) &&
				   (reuseport & inp_so_options(t)) == 0 &&
				   (reuseport_lb & inp_so_options(t)) == 0) {
				/*
				 * With INET6 the return below is guarded by
				 * the condition inside the #ifdef; without it
				 * the return is unconditional.
				 */
#ifdef INET6
				if (ntohl(sin->sin_addr.s_addr) !=
				    INADDR_ANY ||
				    ntohl(t->inp_laddr.s_addr) !=
				    INADDR_ANY ||
				    (inp->inp_vflag & INP_IPV6PROTO) == 0 ||
				    (t->inp_vflag & INP_IPV6PROTO) == 0)
#endif
						return (EADDRINUSE);
				if (t && (! in_pcbbind_check_bindmulti(inp, t)))
					return (EADDRINUSE);
			}
		}
	}
	/* A port the caller already holds takes precedence. */
	if (*lportp != 0)
		lport = *lportp;
	/* Still no port: pick one automatically. */
	if (lport == 0) {
		error = in_pcb_lport(inp, &laddr, &lport, cred, lookupflags);
		if (error != 0)
			return (error);

	}
	/* Success: only now are the caller's values updated. */
	*laddrp = laddr.s_addr;
	*lportp = lport;
	return (0);
}

/*
 * Connect from a socket to a specified address.
 * Both address and port must be specified in argument sin.
 * If don't have a local address for this socket yet,
 * then pick one.
 *
 * 'm' is handed through unchanged to in_pcbrehash_mbuf() or
 * in_pcbinshash_mbuf().  'rehash' selects between the two; it is asserted
 * to be true when the inpcb has neither a local address nor a local port
 * yet.
 *
 * Returns 0 on success, the error from in_pcbconnect_setup(), or EAGAIN
 * if the initial in_pcbinshash() fails (the local address and port are
 * reset in that case).
 */
int
in_pcbconnect_mbuf(struct inpcb *inp, struct sockaddr *nam,
    struct ucred *cred, struct mbuf *m, bool rehash)
{
	u_short lport, fport;
	in_addr_t laddr, faddr;
	int anonport, error;

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	lport = inp->inp_lport;
	laddr = inp->inp_laddr.s_addr;
	/* Remember whether the local port had to be chosen for us. */
	anonport = (lport == 0);
	error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport,
	    NULL, cred);
	if (error)
		return (error);

	/* Do the initial binding of the local address if required. */
	if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) {
		KASSERT(rehash == true,
		    ("Rehashing required for unbound inps"));
		inp->inp_lport = lport;
		inp->inp_laddr.s_addr = laddr;
		if (in_pcbinshash(inp) != 0) {
			inp->inp_laddr.s_addr = INADDR_ANY;
			inp->inp_lport = 0;
			return (EAGAIN);
		}
	}

	/* Commit the remaining changes. */
	inp->inp_lport = lport;
	inp->inp_laddr.s_addr = laddr;
	inp->inp_faddr.s_addr = faddr;
	inp->inp_fport = fport;
	if (rehash) {
		in_pcbrehash_mbuf(inp, m);
	} else {
		in_pcbinshash_mbuf(inp, m);
	}

	if (anonport)
		inp->inp_flags |= INP_ANONPORT;
	return (0);
}

/*
 * Convenience wrapper: in_pcbconnect_mbuf() with no mbuf and with
 * rehashing requested.
 */
int
in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred)
{

	return (in_pcbconnect_mbuf(inp, nam, cred, NULL, true));
}

/*
 * Do proper source address selection on an unbound socket in case
 * of connect. Take jails into account as well.
 */
int
in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr,
    struct ucred *cred)
{
	struct ifaddr *ifa;
	struct sockaddr *sa;
	struct sockaddr_in *sin, dst;
	struct nhop_object *nh;
	int error;

	NET_EPOCH_ASSERT();
	KASSERT(laddr != NULL, ("%s: laddr NULL", __func__));
	/*
	 * Bypass source address selection and use the primary jail IP
	 * if requested.
	 */
	if (cred != NULL && !prison_saddrsel_ip4(cred, laddr))
		return (0);

	error = 0;

	nh = NULL;
	bzero(&dst, sizeof(dst));
	sin = &dst;
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(struct sockaddr_in);
	sin->sin_addr.s_addr = faddr->s_addr;

	/*
	 * If route is known our src addr is taken from the i/f,
	 * else punt.
	 *
	 * Find out route to destination.
1065 */ 1066 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 1067 nh = fib4_lookup(inp->inp_inc.inc_fibnum, *faddr, 1068 0, NHR_NONE, 0); 1069 1070 /* 1071 * If we found a route, use the address corresponding to 1072 * the outgoing interface. 1073 * 1074 * Otherwise assume faddr is reachable on a directly connected 1075 * network and try to find a corresponding interface to take 1076 * the source address from. 1077 */ 1078 if (nh == NULL || nh->nh_ifp == NULL) { 1079 struct in_ifaddr *ia; 1080 struct ifnet *ifp; 1081 1082 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin, 1083 inp->inp_socket->so_fibnum)); 1084 if (ia == NULL) { 1085 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin, 0, 1086 inp->inp_socket->so_fibnum)); 1087 1088 } 1089 if (ia == NULL) { 1090 error = ENETUNREACH; 1091 goto done; 1092 } 1093 1094 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1095 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1096 goto done; 1097 } 1098 1099 ifp = ia->ia_ifp; 1100 ia = NULL; 1101 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1102 1103 sa = ifa->ifa_addr; 1104 if (sa->sa_family != AF_INET) 1105 continue; 1106 sin = (struct sockaddr_in *)sa; 1107 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1108 ia = (struct in_ifaddr *)ifa; 1109 break; 1110 } 1111 } 1112 if (ia != NULL) { 1113 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1114 goto done; 1115 } 1116 1117 /* 3. As a last resort return the 'default' jail address. */ 1118 error = prison_get_ip4(cred, laddr); 1119 goto done; 1120 } 1121 1122 /* 1123 * If the outgoing interface on the route found is not 1124 * a loopback interface, use the address from that interface. 1125 * In case of jails do those three steps: 1126 * 1. check if the interface address belongs to the jail. If so use it. 1127 * 2. check if we have any address on the outgoing interface 1128 * belonging to this jail. If so use it. 1129 * 3. as a last resort return the 'default' jail address. 
1130 */ 1131 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) == 0) { 1132 struct in_ifaddr *ia; 1133 struct ifnet *ifp; 1134 1135 /* If not jailed, use the default returned. */ 1136 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1137 ia = (struct in_ifaddr *)nh->nh_ifa; 1138 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1139 goto done; 1140 } 1141 1142 /* Jailed. */ 1143 /* 1. Check if the iface address belongs to the jail. */ 1144 sin = (struct sockaddr_in *)nh->nh_ifa->ifa_addr; 1145 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1146 ia = (struct in_ifaddr *)nh->nh_ifa; 1147 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1148 goto done; 1149 } 1150 1151 /* 1152 * 2. Check if we have any address on the outgoing interface 1153 * belonging to this jail. 1154 */ 1155 ia = NULL; 1156 ifp = nh->nh_ifp; 1157 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1158 sa = ifa->ifa_addr; 1159 if (sa->sa_family != AF_INET) 1160 continue; 1161 sin = (struct sockaddr_in *)sa; 1162 if (prison_check_ip4(cred, &sin->sin_addr) == 0) { 1163 ia = (struct in_ifaddr *)ifa; 1164 break; 1165 } 1166 } 1167 if (ia != NULL) { 1168 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1169 goto done; 1170 } 1171 1172 /* 3. As a last resort return the 'default' jail address. */ 1173 error = prison_get_ip4(cred, laddr); 1174 goto done; 1175 } 1176 1177 /* 1178 * The outgoing interface is marked with 'loopback net', so a route 1179 * to ourselves is here. 1180 * Try to find the interface of the destination address and then 1181 * take the address from there. That interface is not necessarily 1182 * a loopback interface. 1183 * In case of jails, check that it is an address of the jail 1184 * and if we cannot find, fall back to the 'default' jail address. 
1185 */ 1186 if ((nh->nh_ifp->if_flags & IFF_LOOPBACK) != 0) { 1187 struct in_ifaddr *ia; 1188 1189 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&dst), 1190 inp->inp_socket->so_fibnum)); 1191 if (ia == NULL) 1192 ia = ifatoia(ifa_ifwithnet(sintosa(&dst), 0, 1193 inp->inp_socket->so_fibnum)); 1194 if (ia == NULL) 1195 ia = ifatoia(ifa_ifwithaddr(sintosa(&dst))); 1196 1197 if (cred == NULL || !prison_flag(cred, PR_IP4)) { 1198 if (ia == NULL) { 1199 error = ENETUNREACH; 1200 goto done; 1201 } 1202 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1203 goto done; 1204 } 1205 1206 /* Jailed. */ 1207 if (ia != NULL) { 1208 struct ifnet *ifp; 1209 1210 ifp = ia->ia_ifp; 1211 ia = NULL; 1212 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 1213 sa = ifa->ifa_addr; 1214 if (sa->sa_family != AF_INET) 1215 continue; 1216 sin = (struct sockaddr_in *)sa; 1217 if (prison_check_ip4(cred, 1218 &sin->sin_addr) == 0) { 1219 ia = (struct in_ifaddr *)ifa; 1220 break; 1221 } 1222 } 1223 if (ia != NULL) { 1224 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 1225 goto done; 1226 } 1227 } 1228 1229 /* 3. As a last resort return the 'default' jail address. */ 1230 error = prison_get_ip4(cred, laddr); 1231 goto done; 1232 } 1233 1234 done: 1235 return (error); 1236 } 1237 1238 /* 1239 * Set up for a connect from a socket to the specified address. 1240 * On entry, *laddrp and *lportp should contain the current local 1241 * address and port for the PCB; these are updated to the values 1242 * that should be placed in inp_laddr and inp_lport to complete 1243 * the connect. 1244 * 1245 * On success, *faddrp and *fportp will be set to the remote address 1246 * and port. These are not updated in the error case. 1247 * 1248 * If the operation fails because the connection already exists, 1249 * *oinpp will be set to the PCB of that connection so that the 1250 * caller can decide to override it. In all other cases, *oinpp 1251 * is set to NULL. 
*/
int
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam,
    in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp,
    struct inpcb **oinpp, struct ucred *cred)
{
	struct rm_priotracker in_ifa_tracker;
	struct sockaddr_in *sin = (struct sockaddr_in *)nam;
	struct in_ifaddr *ia;
	struct inpcb *oinp;
	struct in_addr laddr, faddr;
	u_short lport, fport;
	int error;

	/*
	 * Because a global state change doesn't actually occur here, a read
	 * lock is sufficient.
	 */
	NET_EPOCH_ASSERT();
	INP_LOCK_ASSERT(inp);
	INP_HASH_LOCK_ASSERT(inp->inp_pcbinfo);

	if (oinpp != NULL)
		*oinpp = NULL;
	/* Validate the destination: exact length, AF_INET, non-zero port. */
	if (nam->sa_len != sizeof (*sin))
		return (EINVAL);
	if (sin->sin_family != AF_INET)
		return (EAFNOSUPPORT);
	if (sin->sin_port == 0)
		return (EADDRNOTAVAIL);
	laddr.s_addr = *laddrp;
	lport = *lportp;
	faddr = sin->sin_addr;
	fport = sin->sin_port;

	if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) {
		/*
		 * If the destination address is INADDR_ANY,
		 * use the primary local address.
		 * If the supplied address is INADDR_BROADCAST,
		 * and the primary interface supports broadcast,
		 * choose the broadcast address for that interface.
		 */
		if (faddr.s_addr == INADDR_ANY) {
			IN_IFADDR_RLOCK(&in_ifa_tracker);
			faddr =
			    IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr;
			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
			if (cred != NULL &&
			    (error = prison_get_ip4(cred, &faddr)) != 0)
				return (error);
		} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) {
			IN_IFADDR_RLOCK(&in_ifa_tracker);
			if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags &
			    IFF_BROADCAST)
				faddr = satosin(&CK_STAILQ_FIRST(
				    &V_in_ifaddrhead)->ia_broadaddr)->sin_addr;
			IN_IFADDR_RUNLOCK(&in_ifa_tracker);
		}
	}
	if (laddr.s_addr == INADDR_ANY) {
		/*
		 * The error is deliberately not checked yet: the multicast
		 * case below may override both laddr and error.
		 */
		error = in_pcbladdr(inp, &faddr, &laddr, cred);
		/*
		 * If the destination address is multicast and an outgoing
		 * interface has been set as a multicast option, prefer the
		 * address of that interface as our source address.
		 */
		if (IN_MULTICAST(ntohl(faddr.s_addr)) &&
		    inp->inp_moptions != NULL) {
			struct ip_moptions *imo;
			struct ifnet *ifp;

			imo = inp->inp_moptions;
			if (imo->imo_multicast_ifp != NULL) {
				ifp = imo->imo_multicast_ifp;
				IN_IFADDR_RLOCK(&in_ifa_tracker);
				CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
					if ((ia->ia_ifp == ifp) &&
					    (cred == NULL ||
					    prison_check_ip4(cred,
					    &ia->ia_addr.sin_addr) == 0))
						break;
				}
				/* ia is NULL when the loop found no match. */
				if (ia == NULL)
					error = EADDRNOTAVAIL;
				else {
					laddr = ia->ia_addr.sin_addr;
					error = 0;
				}
				IN_IFADDR_RUNLOCK(&in_ifa_tracker);
			}
		}
		if (error)
			return (error);
	}
	/* Refuse a 4-tuple that is already in use by another inpcb. */
	oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, fport,
	    laddr, lport, 0, NULL);
	if (oinp != NULL) {
		if (oinpp != NULL)
			*oinpp = oinp;
		return (EADDRINUSE);
	}
	/* No local port yet: let in_pcbbind_setup() choose one. */
	if (lport == 0) {
		error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport,
		    cred);
		if (error)
			return (error);
	}
	*laddrp = laddr.s_addr;
	*lportp = lport;
	*faddrp = faddr.s_addr;
	*fportp = fport;
	return (0);
}

/*
 * Disconnect an inpcb: clear the foreign address and port and rehash it.
 * The caller must hold the inpcb write lock and the pcbinfo hash write
 * lock (both are asserted).
 */
void
in_pcbdisconnect(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
	INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo);

	inp->inp_faddr.s_addr = INADDR_ANY;
	inp->inp_fport = 0;
	in_pcbrehash(inp);
}
#endif /* INET */

/*
 * in_pcbdetach() is responsible for disassociating a socket from an inpcb.
 * For most protocols, this will be invoked immediately prior to calling
 * in_pcbfree().  However, with TCP the inpcb may significantly outlive the
 * socket, in which case in_pcbfree() is deferred.
 */
void
in_pcbdetach(struct inpcb *inp)
{

	KASSERT(inp->inp_socket != NULL, ("%s: inp_socket == NULL", __func__));

#ifdef RATELIMIT
	if (inp->inp_snd_tag != NULL)
		in_pcbdetach_txrtlmt(inp);
#endif
	/* Break the link in both directions. */
	inp->inp_socket->so_pcb = NULL;
	inp->inp_socket = NULL;
}

/*
 * in_pcbref() bumps the reference count on an inpcb in order to maintain
 * stability of an inpcb pointer despite the inpcb lock being released.  This
 * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded,
 * but where the inpcb lock may already be held, or when acquiring a
 * reference via a pcbgroup.
 *
 * in_pcbref() should be used only to provide brief memory stability, and
 * must always be followed by a call to INP_WLOCK() and in_pcbrele() to
 * garbage collect the inpcb if it has been in_pcbfree()'d from another
 * context.  Until in_pcbrele() has returned that the inpcb is still valid,
 * lock and rele are the *only* safe operations that may be performed on the
 * inpcb.
 *
 * While the inpcb will not be freed, releasing the inpcb lock means that the
 * connection's state may change, so the caller should be careful to
 * revalidate any cached state on reacquiring the lock.  Drop the reference
 * using in_pcbrele().
 */
void
in_pcbref(struct inpcb *inp)
{

	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));

	refcount_acquire(&inp->inp_refcount);
}

/*
 * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to
 * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we
 * return a flag indicating whether or not the inpcb remains valid.  If it is
 * valid, we return with the inpcb lock held.
 *
 * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a
 * reference on an inpcb.  Historically more work was done here (actually, in
 * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the
 * need for the pcbinfo lock in in_pcbrele().  Deferring the free is entirely
 * about memory stability (and continued use of the write lock).
 *
 * This variant expects the inpcb read lock to be held.  It returns 0 with
 * the lock still held when the inpcb remains valid, and 1 with the lock
 * released when the inpcb is marked INP_FREED or when the last reference
 * was dropped (in which case the inpcb is returned to its zone).
 */
int
in_pcbrele_rlocked(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;

	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));

	INP_RLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0) {
		/*
		 * If the inpcb has been freed, let the caller know, even if
		 * this isn't the last reference.
		 */
		if (inp->inp_flags2 & INP_FREED) {
			INP_RUNLOCK(inp);
			return (1);
		}
		return (0);
	}

	/* Last reference: the inpcb must already be detached. */
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
#ifdef TCPHPTS
	if (inp->inp_in_hpts || inp->inp_in_input) {
		struct tcp_hpts_entry *hpts;
		/*
		 * We should not be on the hpts at
		 * this point in any form. we must
		 * get the lock to be sure.
		 */
		hpts = tcp_hpts_lock(inp);
		if (inp->inp_in_hpts)
			panic("Hpts:%p inp:%p at free still on hpts",
			      hpts, inp);
		mtx_unlock(&hpts->p_mtx);
		hpts = tcp_input_lock(inp);
		if (inp->inp_in_input)
			panic("Hpts:%p inp:%p at free still on input hpts",
			      hpts, inp);
		mtx_unlock(&hpts->p_mtx);
	}
#endif
	INP_RUNLOCK(inp);
	pcbinfo = inp->inp_pcbinfo;
	uma_zfree(pcbinfo->ipi_zone, inp);
	return (1);
}

/*
 * Write-locked counterpart of in_pcbrele_rlocked(): the caller must hold
 * the inpcb write lock.  Returns 0 with the lock still held when the inpcb
 * remains valid, and 1 with the lock released when the inpcb is marked
 * INP_FREED or when the last reference was dropped (in which case the
 * inpcb is returned to its zone).
 */
int
in_pcbrele_wlocked(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo;

	KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__));

	INP_WLOCK_ASSERT(inp);

	if (refcount_release(&inp->inp_refcount) == 0) {
		/*
		 * If the inpcb has been freed, let the caller know, even if
		 * this isn't the last reference.
		 */
		if (inp->inp_flags2 & INP_FREED) {
			INP_WUNLOCK(inp);
			return (1);
		}
		return (0);
	}

	/* Last reference: the inpcb must already be detached. */
	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
#ifdef TCPHPTS
	if (inp->inp_in_hpts || inp->inp_in_input) {
		struct tcp_hpts_entry *hpts;
		/*
		 * We should not be on the hpts at
		 * this point in any form. we must
		 * get the lock to be sure.
		 */
		hpts = tcp_hpts_lock(inp);
		if (inp->inp_in_hpts)
			panic("Hpts:%p inp:%p at free still on hpts",
			      hpts, inp);
		mtx_unlock(&hpts->p_mtx);
		hpts = tcp_input_lock(inp);
		if (inp->inp_in_input)
			panic("Hpts:%p inp:%p at free still on input hpts",
			      hpts, inp);
		mtx_unlock(&hpts->p_mtx);
	}
#endif
	INP_WUNLOCK(inp);
	pcbinfo = inp->inp_pcbinfo;
	uma_zfree(pcbinfo->ipi_zone, inp);
	return (1);
}

/*
 * Temporary wrapper.
*/
int
in_pcbrele(struct inpcb *inp)
{

	/* Forwards to the write-locked variant; same return convention. */
	return (in_pcbrele_wlocked(inp));
}

/*
 * Epoch callback that drops the references held by an in_pcblist.
 *
 * The in_pcblist is recovered from its embedded epoch context.  With the
 * pcbinfo write lock held, each of the il_count entries is read-locked and
 * its reference released; entries that remain valid are unlocked again
 * (in_pcbrele_rlocked() has already unlocked the others).  The list itself
 * is then freed.
 */
void
in_pcblist_rele_rlocked(epoch_context_t ctx)
{
	struct in_pcblist *il;
	struct inpcb *inp;
	struct inpcbinfo *pcbinfo;
	int i, n;

	il = __containerof(ctx, struct in_pcblist, il_epoch_ctx);
	pcbinfo = il->il_pcbinfo;
	n = il->il_count;
	INP_INFO_WLOCK(pcbinfo);
	for (i = 0; i < n; i++) {
		inp = il->il_inp_list[i];
		INP_RLOCK(inp);
		if (!in_pcbrele_rlocked(inp))
			INP_RUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(pcbinfo);
	free(il, M_TEMP);
}

/*
 * Epoch callback that frees an inpcbport, recovered from its embedded
 * epoch context.
 */
static void
inpcbport_free(epoch_context_t ctx)
{
	struct inpcbport *phd;

	phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx);
	free(phd, M_PCB);
}

/*
 * Epoch callback scheduled by in_pcbfree(): releases the resources hanging
 * off the inpcb (IPsec policy, IPv6 output options, IP options mbuf,
 * credential, MAC label) and then drops a reference with
 * in_pcbrele_wlocked(), which is expected to release the inpcb.
 *
 * The multicast option pointers are detached from the inpcb up front but
 * only freed after the reference has been dropped.
 */
static void
in_pcbfree_deferred(epoch_context_t ctx)
{
	struct inpcb *inp;
	int released __unused;

	inp = __containerof(ctx, struct inpcb, inp_epoch_ctx);

	INP_WLOCK(inp);
	CURVNET_SET(inp->inp_vnet);
#ifdef INET
	struct ip_moptions *imo = inp->inp_moptions;
	inp->inp_moptions = NULL;
#endif
	/* XXXRW: Do as much as possible here. */
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
	if (inp->inp_sp != NULL)
		ipsec_delete_pcbpolicy(inp);
#endif
#ifdef INET6
	struct ip6_moptions *im6o = NULL;
	if (inp->inp_vflag & INP_IPV6PROTO) {
		ip6_freepcbopts(inp->in6p_outputopts);
		im6o = inp->in6p_moptions;
		inp->in6p_moptions = NULL;
	}
#endif
	if (inp->inp_options)
		(void)m_free(inp->inp_options);
	inp->inp_vflag = 0;
	crfree(inp->inp_cred);
#ifdef MAC
	mac_inpcb_destroy(inp);
#endif
	/* inp must not be touched after this call. */
	released = in_pcbrele_wlocked(inp);
	MPASS(released);
#ifdef INET6
	ip6_freemoptions(im6o);
#endif
#ifdef INET
	inp_freemoptions(imo);
#endif
	CURVNET_RESTORE();
}

/*
 * Unconditionally schedule an inpcb to be freed by decrementing its
 * reference count, which should occur only after the inpcb has been detached
 * from its socket.  If another thread holds a temporary reference (acquired
 * using in_pcbref()) then the free is deferred until that reference is
 * released using in_pcbrele(), but the inpcb is still unlocked.  Almost all
 * work, including removal from global lists, is done in this context, where
 * the pcbinfo lock is held.
*/
void
in_pcbfree(struct inpcb *inp)
{
	struct inpcbinfo *pcbinfo = inp->inp_pcbinfo;

	KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__));
	KASSERT((inp->inp_flags2 & INP_FREED) == 0,
	    ("%s: called twice for pcb %p", __func__, inp));
	/*
	 * With assertions compiled out, a repeated call just drops the
	 * lock and returns.
	 */
	if (inp->inp_flags2 & INP_FREED) {
		INP_WUNLOCK(inp);
		return;
	}

	INP_WLOCK_ASSERT(inp);
	INP_LIST_WLOCK(pcbinfo);
	in_pcbremlists(inp);
	INP_LIST_WUNLOCK(pcbinfo);
	RO_INVALIDATE_CACHE(&inp->inp_route);
	/* mark as destruction in progress */
	inp->inp_flags2 |= INP_FREED;
	INP_WUNLOCK(inp);
	/* The remaining teardown runs from an epoch callback. */
	NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx);
}

/*
 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and
 * port reservation, and preventing it from being returned by inpcb lookups.
 *
 * It is used by TCP to mark an inpcb as unused and avoid future packet
 * delivery or event notification when a socket remains open but TCP has
 * closed.  This might occur as a result of a shutdown()-initiated TCP close
 * or a RST on the wire, and allows the port binding to be reused while still
 * maintaining the invariant that so_pcb always points to a valid inpcb until
 * in_pcbdetach().
 *
 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by
 * in_pcbnotifyall() and in_pcbpurgeif0()?
 */
void
in_pcbdrop(struct inpcb *inp)
{

	INP_WLOCK_ASSERT(inp);
#ifdef INVARIANTS
	if (inp->inp_socket != NULL && inp->inp_ppcb != NULL)
		MPASS(inp->inp_refcount > 1);
#endif

	/*
	 * XXXRW: Possibly we should protect the setting of INP_DROPPED with
	 * the hash lock...?
	 */
	inp->inp_flags |= INP_DROPPED;
	if (inp->inp_flags & INP_INHASHLIST) {
		struct inpcbport *phd = inp->inp_phd;

		INP_HASH_WLOCK(inp->inp_pcbinfo);
		in_pcbremlbgrouphash(inp);
		CK_LIST_REMOVE(inp, inp_hash);
		CK_LIST_REMOVE(inp, inp_portlist);
		/* Last inpcb on this port: retire the port head as well. */
		if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) {
			CK_LIST_REMOVE(phd, phd_hash);
			NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx);
		}
		INP_HASH_WUNLOCK(inp->inp_pcbinfo);
		inp->inp_flags &= ~INP_INHASHLIST;
#ifdef PCBGROUP
		in_pcbgroup_remove(inp);
#endif
	}
}

#ifdef INET
/*
 * Common routines to return the socket addresses associated with inpcbs.
 */

/*
 * Allocate (M_SONAME, M_WAITOK | M_ZERO) and fill in an AF_INET sockaddr
 * for the given port and address.  The port is stored as passed in.
 * NOTE(review): presumably the caller owns and frees the result - confirm.
 */
struct sockaddr *
in_sockaddr(in_port_t port, struct in_addr *addr_p)
{
	struct sockaddr_in *sin;

	sin = malloc(sizeof *sin, M_SONAME,
		M_WAITOK | M_ZERO);
	sin->sin_family = AF_INET;
	sin->sin_len = sizeof(*sin);
	sin->sin_addr = *addr_p;
	sin->sin_port = port;

	return (struct sockaddr *)sin;
}

/*
 * Return the socket's local address and port in a newly allocated sockaddr
 * stored in *nam.  The values are copied under the inpcb read lock, which
 * is dropped before the allocation.  Always returns 0.
 */
int
in_getsockaddr(struct socket *so, struct sockaddr **nam)
{
	struct inpcb *inp;
	struct in_addr addr;
	in_port_t port;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL"));

	INP_RLOCK(inp);
	port = inp->inp_lport;
	addr = inp->inp_laddr;
	INP_RUNLOCK(inp);

	*nam = in_sockaddr(port, &addr);
	return 0;
}

/*
 * Return the socket's foreign (peer) address and port in a newly allocated
 * sockaddr stored in *nam.  The values are copied under the inpcb read
 * lock, which is dropped before the allocation.  Always returns 0.
 */
int
in_getpeeraddr(struct socket *so, struct sockaddr **nam)
{
	struct inpcb *inp;
	struct in_addr addr;
	in_port_t port;

	inp = sotoinpcb(so);
	KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL"));

	INP_RLOCK(inp);
	port = inp->inp_fport;
	addr = inp->inp_faddr;
	INP_RUNLOCK(inp);

	*nam = in_sockaddr(port, &addr);
	return 0;
}

/*
 * Invoke notify(inp, errno) on every inpcb in pcbinfo's list that has a
 * socket and whose foreign address equals faddr (IPv4 inpcbs only when
 * INET6 is compiled in).
 *
 * The walk holds the pcbinfo write lock, and each inpcb is write-locked
 * before it is examined.  After the callback the inpcb is unlocked only if
 * the callback returned non-NULL.
 * NOTE(review): a NULL return presumably means the callback has already
 * released or disposed of the inpcb - confirm against the notify routines.
 */
void
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno,
    struct inpcb *(*notify)(struct inpcb *, int))
{
	struct inpcb *inp, *inp_temp;

	INP_INFO_WLOCK(pcbinfo);
	CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) {
		INP_WLOCK(inp);
#ifdef INET6
		if ((inp->inp_vflag & INP_IPV4) == 0) {
			INP_WUNLOCK(inp);
			continue;
		}
#endif
		if (inp->inp_faddr.s_addr != faddr.s_addr ||
		    inp->inp_socket == NULL) {
			INP_WUNLOCK(inp);
			continue;
		}
		if ((*notify)(inp, errno))
			INP_WUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(pcbinfo);
}

/*
 * Purge references to a detaching interface from the IPv4 multicast
 * options of every inpcb on pcbinfo's list: the selected outgoing
 * interface is cleared and group memberships joined through ifp are left.
 */
void
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp)
{
	struct inpcb *inp;
	struct in_multi *inm;
	struct in_mfilter *imf;
	struct ip_moptions *imo;

	INP_INFO_WLOCK(pcbinfo);
	CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) {
		INP_WLOCK(inp);
		imo = inp->inp_moptions;
		if ((inp->inp_vflag & INP_IPV4) &&
		    imo != NULL) {
			/*
			 * Unselect the outgoing interface if it is being
			 * detached.
			 */
			if (imo->imo_multicast_ifp == ifp)
				imo->imo_multicast_ifp = NULL;

			/*
			 * Drop multicast group membership if we joined
			 * through the interface being detached.
			 *
			 * XXX This can all be deferred to an epoch_call
			 */
restart:
			/*
			 * Removing an entry invalidates the iteration, so
			 * the scan restarts from the head after each one.
			 */
			IP_MFILTER_FOREACH(imf, &imo->imo_head) {
				if ((inm = imf->imf_inm) == NULL)
					continue;
				if (inm->inm_ifp != ifp)
					continue;
				ip_mfilter_remove(&imo->imo_head, imf);
				IN_MULTI_LOCK_ASSERT();
				in_leavegroup_locked(inm, NULL);
				ip_mfilter_free(imf);
				goto restart;
			}
		}
		INP_WUNLOCK(inp);
	}
	INP_INFO_WUNLOCK(pcbinfo);
}

/*
 * Lookup a PCB based on the local address and port.  Caller must hold the
 * hash lock.  No inpcb locks or references are acquired.
1834 */ 1835 #define INP_LOOKUP_MAPPED_PCB_COST 3 1836 struct inpcb * 1837 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 1838 u_short lport, int lookupflags, struct ucred *cred) 1839 { 1840 struct inpcb *inp; 1841 #ifdef INET6 1842 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 1843 #else 1844 int matchwild = 3; 1845 #endif 1846 int wildcard; 1847 1848 KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, 1849 ("%s: invalid lookup flags %d", __func__, lookupflags)); 1850 1851 INP_HASH_LOCK_ASSERT(pcbinfo); 1852 1853 if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { 1854 struct inpcbhead *head; 1855 /* 1856 * Look for an unconnected (wildcard foreign addr) PCB that 1857 * matches the local address and port we're looking for. 1858 */ 1859 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 1860 0, pcbinfo->ipi_hashmask)]; 1861 CK_LIST_FOREACH(inp, head, inp_hash) { 1862 #ifdef INET6 1863 /* XXX inp locking */ 1864 if ((inp->inp_vflag & INP_IPV4) == 0) 1865 continue; 1866 #endif 1867 if (inp->inp_faddr.s_addr == INADDR_ANY && 1868 inp->inp_laddr.s_addr == laddr.s_addr && 1869 inp->inp_lport == lport) { 1870 /* 1871 * Found? 1872 */ 1873 if (cred == NULL || 1874 prison_equal_ip4(cred->cr_prison, 1875 inp->inp_cred->cr_prison)) 1876 return (inp); 1877 } 1878 } 1879 /* 1880 * Not found. 1881 */ 1882 return (NULL); 1883 } else { 1884 struct inpcbporthead *porthash; 1885 struct inpcbport *phd; 1886 struct inpcb *match = NULL; 1887 /* 1888 * Best fit PCB lookup. 1889 * 1890 * First see if this local port is in use by looking on the 1891 * port hash list. 1892 */ 1893 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 1894 pcbinfo->ipi_porthashmask)]; 1895 CK_LIST_FOREACH(phd, porthash, phd_hash) { 1896 if (phd->phd_port == lport) 1897 break; 1898 } 1899 if (phd != NULL) { 1900 /* 1901 * Port is in use by one or more PCBs. Look for best 1902 * fit. 
1903 */ 1904 CK_LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1905 wildcard = 0; 1906 if (cred != NULL && 1907 !prison_equal_ip4(inp->inp_cred->cr_prison, 1908 cred->cr_prison)) 1909 continue; 1910 #ifdef INET6 1911 /* XXX inp locking */ 1912 if ((inp->inp_vflag & INP_IPV4) == 0) 1913 continue; 1914 /* 1915 * We never select the PCB that has 1916 * INP_IPV6 flag and is bound to :: if 1917 * we have another PCB which is bound 1918 * to 0.0.0.0. If a PCB has the 1919 * INP_IPV6 flag, then we set its cost 1920 * higher than IPv4 only PCBs. 1921 * 1922 * Note that the case only happens 1923 * when a socket is bound to ::, under 1924 * the condition that the use of the 1925 * mapped address is allowed. 1926 */ 1927 if ((inp->inp_vflag & INP_IPV6) != 0) 1928 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 1929 #endif 1930 if (inp->inp_faddr.s_addr != INADDR_ANY) 1931 wildcard++; 1932 if (inp->inp_laddr.s_addr != INADDR_ANY) { 1933 if (laddr.s_addr == INADDR_ANY) 1934 wildcard++; 1935 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1936 continue; 1937 } else { 1938 if (laddr.s_addr != INADDR_ANY) 1939 wildcard++; 1940 } 1941 if (wildcard < matchwild) { 1942 match = inp; 1943 matchwild = wildcard; 1944 if (matchwild == 0) 1945 break; 1946 } 1947 } 1948 } 1949 return (match); 1950 } 1951 } 1952 #undef INP_LOOKUP_MAPPED_PCB_COST 1953 1954 static struct inpcb * 1955 in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, 1956 const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, 1957 uint16_t fport, int lookupflags) 1958 { 1959 struct inpcb *local_wild; 1960 const struct inpcblbgrouphead *hdr; 1961 struct inpcblbgroup *grp; 1962 uint32_t idx; 1963 1964 INP_HASH_LOCK_ASSERT(pcbinfo); 1965 1966 hdr = &pcbinfo->ipi_lbgrouphashbase[ 1967 INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; 1968 1969 /* 1970 * Order of socket selection: 1971 * 1. non-wild. 1972 * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). 
1973 * 1974 * NOTE: 1975 * - Load balanced group does not contain jailed sockets 1976 * - Load balanced group does not contain IPv4 mapped INET6 wild sockets 1977 */ 1978 local_wild = NULL; 1979 CK_LIST_FOREACH(grp, hdr, il_list) { 1980 #ifdef INET6 1981 if (!(grp->il_vflag & INP_IPV4)) 1982 continue; 1983 #endif 1984 if (grp->il_lport != lport) 1985 continue; 1986 1987 idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % 1988 grp->il_inpcnt; 1989 if (grp->il_laddr.s_addr == laddr->s_addr) 1990 return (grp->il_inp[idx]); 1991 if (grp->il_laddr.s_addr == INADDR_ANY && 1992 (lookupflags & INPLOOKUP_WILDCARD) != 0) 1993 local_wild = grp->il_inp[idx]; 1994 } 1995 return (local_wild); 1996 } 1997 1998 #ifdef PCBGROUP 1999 /* 2000 * Lookup PCB in hash list, using pcbgroup tables. 2001 */ 2002 static struct inpcb * 2003 in_pcblookup_group(struct inpcbinfo *pcbinfo, struct inpcbgroup *pcbgroup, 2004 struct in_addr faddr, u_int fport_arg, struct in_addr laddr, 2005 u_int lport_arg, int lookupflags, struct ifnet *ifp) 2006 { 2007 struct inpcbhead *head; 2008 struct inpcb *inp, *tmpinp; 2009 u_short fport = fport_arg, lport = lport_arg; 2010 bool locked; 2011 2012 /* 2013 * First look for an exact match. 2014 */ 2015 tmpinp = NULL; 2016 INP_GROUP_LOCK(pcbgroup); 2017 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, 2018 pcbgroup->ipg_hashmask)]; 2019 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { 2020 #ifdef INET6 2021 /* XXX inp locking */ 2022 if ((inp->inp_vflag & INP_IPV4) == 0) 2023 continue; 2024 #endif 2025 if (inp->inp_faddr.s_addr == faddr.s_addr && 2026 inp->inp_laddr.s_addr == laddr.s_addr && 2027 inp->inp_fport == fport && 2028 inp->inp_lport == lport) { 2029 /* 2030 * XXX We should be able to directly return 2031 * the inp here, without any checks. 2032 * Well unless both bound with SO_REUSEPORT? 
2033 */ 2034 if (prison_flag(inp->inp_cred, PR_IP4)) 2035 goto found; 2036 if (tmpinp == NULL) 2037 tmpinp = inp; 2038 } 2039 } 2040 if (tmpinp != NULL) { 2041 inp = tmpinp; 2042 goto found; 2043 } 2044 2045 #ifdef RSS 2046 /* 2047 * For incoming connections, we may wish to do a wildcard 2048 * match for an RSS-local socket. 2049 */ 2050 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2051 struct inpcb *local_wild = NULL, *local_exact = NULL; 2052 #ifdef INET6 2053 struct inpcb *local_wild_mapped = NULL; 2054 #endif 2055 struct inpcb *jail_wild = NULL; 2056 struct inpcbhead *head; 2057 int injail; 2058 2059 /* 2060 * Order of socket selection - we always prefer jails. 2061 * 1. jailed, non-wild. 2062 * 2. jailed, wild. 2063 * 3. non-jailed, non-wild. 2064 * 4. non-jailed, wild. 2065 */ 2066 2067 head = &pcbgroup->ipg_hashbase[INP_PCBHASH(INADDR_ANY, 2068 lport, 0, pcbgroup->ipg_hashmask)]; 2069 CK_LIST_FOREACH(inp, head, inp_pcbgrouphash) { 2070 #ifdef INET6 2071 /* XXX inp locking */ 2072 if ((inp->inp_vflag & INP_IPV4) == 0) 2073 continue; 2074 #endif 2075 if (inp->inp_faddr.s_addr != INADDR_ANY || 2076 inp->inp_lport != lport) 2077 continue; 2078 2079 injail = prison_flag(inp->inp_cred, PR_IP4); 2080 if (injail) { 2081 if (prison_check_ip4(inp->inp_cred, 2082 &laddr) != 0) 2083 continue; 2084 } else { 2085 if (local_exact != NULL) 2086 continue; 2087 } 2088 2089 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2090 if (injail) 2091 goto found; 2092 else 2093 local_exact = inp; 2094 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2095 #ifdef INET6 2096 /* XXX inp locking, NULL check */ 2097 if (inp->inp_vflag & INP_IPV6PROTO) 2098 local_wild_mapped = inp; 2099 else 2100 #endif 2101 if (injail) 2102 jail_wild = inp; 2103 else 2104 local_wild = inp; 2105 } 2106 } /* LIST_FOREACH */ 2107 2108 inp = jail_wild; 2109 if (inp == NULL) 2110 inp = local_exact; 2111 if (inp == NULL) 2112 inp = local_wild; 2113 #ifdef INET6 2114 if (inp == NULL) 2115 inp = local_wild_mapped; 
2116 #endif 2117 if (inp != NULL) 2118 goto found; 2119 } 2120 #endif 2121 2122 /* 2123 * Then look for a wildcard match, if requested. 2124 */ 2125 if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { 2126 struct inpcb *local_wild = NULL, *local_exact = NULL; 2127 #ifdef INET6 2128 struct inpcb *local_wild_mapped = NULL; 2129 #endif 2130 struct inpcb *jail_wild = NULL; 2131 struct inpcbhead *head; 2132 int injail; 2133 2134 /* 2135 * Order of socket selection - we always prefer jails. 2136 * 1. jailed, non-wild. 2137 * 2. jailed, wild. 2138 * 3. non-jailed, non-wild. 2139 * 4. non-jailed, wild. 2140 */ 2141 head = &pcbinfo->ipi_wildbase[INP_PCBHASH(INADDR_ANY, lport, 2142 0, pcbinfo->ipi_wildmask)]; 2143 CK_LIST_FOREACH(inp, head, inp_pcbgroup_wild) { 2144 #ifdef INET6 2145 /* XXX inp locking */ 2146 if ((inp->inp_vflag & INP_IPV4) == 0) 2147 continue; 2148 #endif 2149 if (inp->inp_faddr.s_addr != INADDR_ANY || 2150 inp->inp_lport != lport) 2151 continue; 2152 2153 injail = prison_flag(inp->inp_cred, PR_IP4); 2154 if (injail) { 2155 if (prison_check_ip4(inp->inp_cred, 2156 &laddr) != 0) 2157 continue; 2158 } else { 2159 if (local_exact != NULL) 2160 continue; 2161 } 2162 2163 if (inp->inp_laddr.s_addr == laddr.s_addr) { 2164 if (injail) 2165 goto found; 2166 else 2167 local_exact = inp; 2168 } else if (inp->inp_laddr.s_addr == INADDR_ANY) { 2169 #ifdef INET6 2170 /* XXX inp locking, NULL check */ 2171 if (inp->inp_vflag & INP_IPV6PROTO) 2172 local_wild_mapped = inp; 2173 else 2174 #endif 2175 if (injail) 2176 jail_wild = inp; 2177 else 2178 local_wild = inp; 2179 } 2180 } /* LIST_FOREACH */ 2181 inp = jail_wild; 2182 if (inp == NULL) 2183 inp = local_exact; 2184 if (inp == NULL) 2185 inp = local_wild; 2186 #ifdef INET6 2187 if (inp == NULL) 2188 inp = local_wild_mapped; 2189 #endif 2190 if (inp != NULL) 2191 goto found; 2192 } /* if (lookupflags & INPLOOKUP_WILDCARD) */ 2193 INP_GROUP_UNLOCK(pcbgroup); 2194 return (NULL); 2195 2196 found: 2197 if (lookupflags & 
INPLOOKUP_WLOCKPCB)
		locked = INP_TRY_WLOCK(inp);
	else if (lookupflags & INPLOOKUP_RLOCKPCB)
		locked = INP_TRY_RLOCK(inp);
	else
		panic("%s: locking bug", __func__);
	if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) {
		if (lookupflags & INPLOOKUP_WLOCKPCB)
			INP_WUNLOCK(inp);
		else
			INP_RUNLOCK(inp);
		/*
		 * NOTE(review): unlike the other exits visible in this
		 * function, this return does not call
		 * INP_GROUP_UNLOCK(pcbgroup) -- confirm whether the group
		 * lock is leaked here.
		 */
		return (NULL);
	} else if (!locked)
		in_pcbref(inp);
	INP_GROUP_UNLOCK(pcbgroup);
	if (!locked) {
		if (lookupflags & INPLOOKUP_WLOCKPCB) {
			INP_WLOCK(inp);
			if (in_pcbrele_wlocked(inp))
				return (NULL);
		} else {
			INP_RLOCK(inp);
			if (in_pcbrele_rlocked(inp))
				return (NULL);
		}
	}
#ifdef INVARIANTS
	if (lookupflags & INPLOOKUP_WLOCKPCB)
		INP_WLOCK_ASSERT(inp);
	else
		INP_RLOCK_ASSERT(inp);
#endif
	return (inp);
}
#endif /* PCBGROUP */

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation assumes
 * that the caller has locked the hash list, and will not perform any further
 * locking or reference operations on either the hash list or the connection.
 *
 * The only lookup flag accepted is INPLOOKUP_WILDCARD (asserted below).
 * fport_arg and lport_arg are narrowed to u_short before use.  The ifp
 * argument is not referenced by this function.
 *
 * The search runs in three stages and returns the first hit:
 *   1. exact 4-tuple match in the hash bucket for (faddr, lport, fport);
 *   2. with INPLOOKUP_WILDCARD, whatever in_pcblookup_lbgroup() returns;
 *   3. with INPLOOKUP_WILDCARD, a wildcard match in the bucket for
 *      (INADDR_ANY, lport, 0).
 * Returns NULL when nothing matches.  The returned inpcb is neither locked
 * nor referenced here.
 */
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
    u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
    struct ifnet *ifp)
{
	struct inpcbhead *head;
	struct inpcb *inp, *tmpinp;
	u_short fport = fport_arg, lport = lport_arg;

	KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0,
	    ("%s: invalid lookup flags %d", __func__, lookupflags));
	INP_HASH_LOCK_ASSERT(pcbinfo);

	/*
	 * First look for an exact match.
	 */
	tmpinp = NULL;
	head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport,
	    pcbinfo->ipi_hashmask)];
	CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
		/* XXX inp locking */
		if ((inp->inp_vflag & INP_IPV4) == 0)
			continue;
#endif
		if (inp->inp_faddr.s_addr == faddr.s_addr &&
		    inp->inp_laddr.s_addr == laddr.s_addr &&
		    inp->inp_fport == fport &&
		    inp->inp_lport == lport) {
			/*
			 * XXX We should be able to directly return
			 * the inp here, without any checks.
			 * Well unless both bound with SO_REUSEPORT?
			 */
			/*
			 * A match whose credential has PR_IP4 set wins
			 * immediately; otherwise remember only the first
			 * match and keep scanning.
			 */
			if (prison_flag(inp->inp_cred, PR_IP4))
				return (inp);
			if (tmpinp == NULL)
				tmpinp = inp;
		}
	}
	if (tmpinp != NULL)
		return (tmpinp);

	/*
	 * Then look in lb group (for wildcard match).
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
		    fport, lookupflags);
		if (inp != NULL)
			return (inp);
	}

	/*
	 * Then look for a wildcard match, if requested.
	 */
	if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
		struct inpcb *local_wild = NULL, *local_exact = NULL;
#ifdef INET6
		struct inpcb *local_wild_mapped = NULL;
#endif
		struct inpcb *jail_wild = NULL;
		int injail;

		/*
		 * Order of socket selection - we always prefer jails.
		 *      1. jailed, non-wild.
		 *      2. jailed, wild.
		 *      3. non-jailed, non-wild.
		 *      4. non-jailed, wild.
		 */

		head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport,
		    0, pcbinfo->ipi_hashmask)];
		CK_LIST_FOREACH(inp, head, inp_hash) {
#ifdef INET6
			/* XXX inp locking */
			if ((inp->inp_vflag & INP_IPV4) == 0)
				continue;
#endif
			/* Only unconnected PCBs bound to lport qualify. */
			if (inp->inp_faddr.s_addr != INADDR_ANY ||
			    inp->inp_lport != lport)
				continue;

			injail = prison_flag(inp->inp_cred, PR_IP4);
			if (injail) {
				if (prison_check_ip4(inp->inp_cred,
				    &laddr) != 0)
					continue;
			} else {
				/*
				 * Once a non-jailed exact-laddr candidate is
				 * held, later non-jailed PCBs are skipped.
				 */
				if (local_exact != NULL)
					continue;
			}

			if (inp->inp_laddr.s_addr == laddr.s_addr) {
				if (injail)
					return (inp);
				else
					local_exact = inp;
			} else if (inp->inp_laddr.s_addr == INADDR_ANY) {
				/*
				 * The wildcard candidates below are
				 * overwritten without a NULL check, so the
				 * last match in list order is the one kept.
				 */
#ifdef INET6
				/* XXX inp locking, NULL check */
				if (inp->inp_vflag & INP_IPV6PROTO)
					local_wild_mapped = inp;
				else
#endif
					if (injail)
						jail_wild = inp;
					else
						local_wild = inp;
			}
		} /* LIST_FOREACH */
		/* Hand back the best remaining candidate, in priority order. */
		if (jail_wild != NULL)
			return (jail_wild);
		if (local_exact != NULL)
			return (local_exact);
		if (local_wild != NULL)
			return (local_wild);
#ifdef INET6
		if (local_wild_mapped != NULL)
			return (local_wild_mapped);
#endif
	} /* if ((lookupflags & INPLOOKUP_WILDCARD) != 0) */

	return (NULL);
}

/*
 * Lookup PCB in hash list, using pcbinfo tables.  This variation locks the
 * hash list lock, and will return the inpcb locked (i.e., requires
 * INPLOOKUP_LOCKPCB).
2369 */ 2370 static struct inpcb * 2371 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2372 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2373 struct ifnet *ifp) 2374 { 2375 struct inpcb *inp; 2376 2377 inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, 2378 (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); 2379 if (inp != NULL) { 2380 if (lookupflags & INPLOOKUP_WLOCKPCB) { 2381 INP_WLOCK(inp); 2382 if (__predict_false(inp->inp_flags2 & INP_FREED)) { 2383 INP_WUNLOCK(inp); 2384 inp = NULL; 2385 } 2386 } else if (lookupflags & INPLOOKUP_RLOCKPCB) { 2387 INP_RLOCK(inp); 2388 if (__predict_false(inp->inp_flags2 & INP_FREED)) { 2389 INP_RUNLOCK(inp); 2390 inp = NULL; 2391 } 2392 } else 2393 panic("%s: locking bug", __func__); 2394 #ifdef INVARIANTS 2395 if (inp != NULL) { 2396 if (lookupflags & INPLOOKUP_WLOCKPCB) 2397 INP_WLOCK_ASSERT(inp); 2398 else 2399 INP_RLOCK_ASSERT(inp); 2400 } 2401 #endif 2402 } 2403 2404 return (inp); 2405 } 2406 2407 /* 2408 * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf 2409 * from which a pre-calculated hash value may be extracted. 2410 * 2411 * Possibly more of this logic should be in in_pcbgroup.c. 2412 */ 2413 struct inpcb * 2414 in_pcblookup(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, 2415 struct in_addr laddr, u_int lport, int lookupflags, struct ifnet *ifp) 2416 { 2417 #if defined(PCBGROUP) && !defined(RSS) 2418 struct inpcbgroup *pcbgroup; 2419 #endif 2420 2421 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2422 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2423 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2424 ("%s: LOCKPCB not set", __func__)); 2425 2426 /* 2427 * When not using RSS, use connection groups in preference to the 2428 * reservation table when looking up 4-tuples. 
When using RSS, just 2429 * use the reservation table, due to the cost of the Toeplitz hash 2430 * in software. 2431 * 2432 * XXXRW: This policy belongs in the pcbgroup code, as in principle 2433 * we could be doing RSS with a non-Toeplitz hash that is affordable 2434 * in software. 2435 */ 2436 #if defined(PCBGROUP) && !defined(RSS) 2437 if (in_pcbgroup_enabled(pcbinfo)) { 2438 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, 2439 fport); 2440 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, 2441 laddr, lport, lookupflags, ifp)); 2442 } 2443 #endif 2444 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2445 lookupflags, ifp)); 2446 } 2447 2448 struct inpcb * 2449 in_pcblookup_mbuf(struct inpcbinfo *pcbinfo, struct in_addr faddr, 2450 u_int fport, struct in_addr laddr, u_int lport, int lookupflags, 2451 struct ifnet *ifp, struct mbuf *m) 2452 { 2453 #ifdef PCBGROUP 2454 struct inpcbgroup *pcbgroup; 2455 #endif 2456 2457 KASSERT((lookupflags & ~INPLOOKUP_MASK) == 0, 2458 ("%s: invalid lookup flags %d", __func__, lookupflags)); 2459 KASSERT((lookupflags & (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)) != 0, 2460 ("%s: LOCKPCB not set", __func__)); 2461 2462 #ifdef PCBGROUP 2463 /* 2464 * If we can use a hardware-generated hash to look up the connection 2465 * group, use that connection group to find the inpcb. Otherwise 2466 * fall back on a software hash -- or the reservation table if we're 2467 * using RSS. 2468 * 2469 * XXXRW: As above, that policy belongs in the pcbgroup code. 
2470 */ 2471 if (in_pcbgroup_enabled(pcbinfo) && 2472 !(M_HASHTYPE_TEST(m, M_HASHTYPE_NONE))) { 2473 pcbgroup = in_pcbgroup_byhash(pcbinfo, M_HASHTYPE_GET(m), 2474 m->m_pkthdr.flowid); 2475 if (pcbgroup != NULL) 2476 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, 2477 fport, laddr, lport, lookupflags, ifp)); 2478 #ifndef RSS 2479 pcbgroup = in_pcbgroup_bytuple(pcbinfo, laddr, lport, faddr, 2480 fport); 2481 return (in_pcblookup_group(pcbinfo, pcbgroup, faddr, fport, 2482 laddr, lport, lookupflags, ifp)); 2483 #endif 2484 } 2485 #endif 2486 return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, 2487 lookupflags, ifp)); 2488 } 2489 #endif /* INET */ 2490 2491 /* 2492 * Insert PCB onto various hash lists. 2493 */ 2494 static int 2495 in_pcbinshash_internal(struct inpcb *inp, struct mbuf *m) 2496 { 2497 struct inpcbhead *pcbhash; 2498 struct inpcbporthead *pcbporthash; 2499 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2500 struct inpcbport *phd; 2501 u_int32_t hashkey_faddr; 2502 int so_options; 2503 2504 INP_WLOCK_ASSERT(inp); 2505 INP_HASH_WLOCK_ASSERT(pcbinfo); 2506 2507 KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, 2508 ("in_pcbinshash: INP_INHASHLIST")); 2509 2510 #ifdef INET6 2511 if (inp->inp_vflag & INP_IPV6) 2512 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 2513 else 2514 #endif 2515 hashkey_faddr = inp->inp_faddr.s_addr; 2516 2517 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 2518 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2519 2520 pcbporthash = &pcbinfo->ipi_porthashbase[ 2521 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 2522 2523 /* 2524 * Add entry to load balance group. 2525 * Only do this if SO_REUSEPORT_LB is set. 2526 */ 2527 so_options = inp_so_options(inp); 2528 if (so_options & SO_REUSEPORT_LB) { 2529 int ret = in_pcbinslbgrouphash(inp); 2530 if (ret) { 2531 /* pcb lb group malloc fail (ret=ENOBUFS). 
*/ 2532 return (ret); 2533 } 2534 } 2535 2536 /* 2537 * Go through port list and look for a head for this lport. 2538 */ 2539 CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { 2540 if (phd->phd_port == inp->inp_lport) 2541 break; 2542 } 2543 /* 2544 * If none exists, malloc one and tack it on. 2545 */ 2546 if (phd == NULL) { 2547 phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); 2548 if (phd == NULL) { 2549 return (ENOBUFS); /* XXX */ 2550 } 2551 bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); 2552 phd->phd_port = inp->inp_lport; 2553 CK_LIST_INIT(&phd->phd_pcblist); 2554 CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 2555 } 2556 inp->inp_phd = phd; 2557 CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 2558 CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 2559 inp->inp_flags |= INP_INHASHLIST; 2560 #ifdef PCBGROUP 2561 if (m != NULL) { 2562 in_pcbgroup_update_mbuf(inp, m); 2563 } else { 2564 in_pcbgroup_update(inp); 2565 } 2566 #endif 2567 return (0); 2568 } 2569 2570 int 2571 in_pcbinshash(struct inpcb *inp) 2572 { 2573 2574 return (in_pcbinshash_internal(inp, NULL)); 2575 } 2576 2577 int 2578 in_pcbinshash_mbuf(struct inpcb *inp, struct mbuf *m) 2579 { 2580 2581 return (in_pcbinshash_internal(inp, m)); 2582 } 2583 2584 /* 2585 * Move PCB to the proper hash bucket when { faddr, fport } have been 2586 * changed. NOTE: This does not handle the case of the lport changing (the 2587 * hashed port list would have to be updated as well), so the lport must 2588 * not change after in_pcbinshash() has been called. 
2589 */ 2590 void 2591 in_pcbrehash_mbuf(struct inpcb *inp, struct mbuf *m) 2592 { 2593 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2594 struct inpcbhead *head; 2595 u_int32_t hashkey_faddr; 2596 2597 INP_WLOCK_ASSERT(inp); 2598 INP_HASH_WLOCK_ASSERT(pcbinfo); 2599 2600 KASSERT(inp->inp_flags & INP_INHASHLIST, 2601 ("in_pcbrehash: !INP_INHASHLIST")); 2602 2603 #ifdef INET6 2604 if (inp->inp_vflag & INP_IPV6) 2605 hashkey_faddr = INP6_PCBHASHKEY(&inp->in6p_faddr); 2606 else 2607 #endif 2608 hashkey_faddr = inp->inp_faddr.s_addr; 2609 2610 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 2611 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 2612 2613 CK_LIST_REMOVE(inp, inp_hash); 2614 CK_LIST_INSERT_HEAD(head, inp, inp_hash); 2615 2616 #ifdef PCBGROUP 2617 if (m != NULL) 2618 in_pcbgroup_update_mbuf(inp, m); 2619 else 2620 in_pcbgroup_update(inp); 2621 #endif 2622 } 2623 2624 void 2625 in_pcbrehash(struct inpcb *inp) 2626 { 2627 2628 in_pcbrehash_mbuf(inp, NULL); 2629 } 2630 2631 /* 2632 * Remove PCB from various lists. 2633 */ 2634 static void 2635 in_pcbremlists(struct inpcb *inp) 2636 { 2637 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 2638 2639 INP_WLOCK_ASSERT(inp); 2640 INP_LIST_WLOCK_ASSERT(pcbinfo); 2641 2642 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 2643 if (inp->inp_flags & INP_INHASHLIST) { 2644 struct inpcbport *phd = inp->inp_phd; 2645 2646 INP_HASH_WLOCK(pcbinfo); 2647 2648 /* XXX: Only do if SO_REUSEPORT_LB set? 
*/ 2649 in_pcbremlbgrouphash(inp); 2650 2651 CK_LIST_REMOVE(inp, inp_hash); 2652 CK_LIST_REMOVE(inp, inp_portlist); 2653 if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { 2654 CK_LIST_REMOVE(phd, phd_hash); 2655 NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); 2656 } 2657 INP_HASH_WUNLOCK(pcbinfo); 2658 inp->inp_flags &= ~INP_INHASHLIST; 2659 } 2660 CK_LIST_REMOVE(inp, inp_list); 2661 pcbinfo->ipi_count--; 2662 #ifdef PCBGROUP 2663 in_pcbgroup_remove(inp); 2664 #endif 2665 } 2666 2667 /* 2668 * Check for alternatives when higher level complains 2669 * about service problems. For now, invalidate cached 2670 * routing information. If the route was created dynamically 2671 * (by a redirect), time to try a default gateway again. 2672 */ 2673 void 2674 in_losing(struct inpcb *inp) 2675 { 2676 2677 RO_INVALIDATE_CACHE(&inp->inp_route); 2678 return; 2679 } 2680 2681 /* 2682 * A set label operation has occurred at the socket layer, propagate the 2683 * label change into the in_pcb for the socket. 2684 */ 2685 void 2686 in_pcbsosetlabel(struct socket *so) 2687 { 2688 #ifdef MAC 2689 struct inpcb *inp; 2690 2691 inp = sotoinpcb(so); 2692 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 2693 2694 INP_WLOCK(inp); 2695 SOCK_LOCK(so); 2696 mac_inpcb_sosetlabel(so, inp); 2697 SOCK_UNLOCK(so); 2698 INP_WUNLOCK(inp); 2699 #endif 2700 } 2701 2702 /* 2703 * ipport_tick runs once per second, determining if random port allocation 2704 * should be continued. If more than ipport_randomcps ports have been 2705 * allocated in the last second, then we return to sequential port 2706 * allocation. We return to random allocation only once we drop below 2707 * ipport_randomcps for at least ipport_randomtime seconds. 
2708 */ 2709 static void 2710 ipport_tick(void *xtp) 2711 { 2712 VNET_ITERATOR_DECL(vnet_iter); 2713 2714 VNET_LIST_RLOCK_NOSLEEP(); 2715 VNET_FOREACH(vnet_iter) { 2716 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ 2717 if (V_ipport_tcpallocs <= 2718 V_ipport_tcplastcount + V_ipport_randomcps) { 2719 if (V_ipport_stoprandom > 0) 2720 V_ipport_stoprandom--; 2721 } else 2722 V_ipport_stoprandom = V_ipport_randomtime; 2723 V_ipport_tcplastcount = V_ipport_tcpallocs; 2724 CURVNET_RESTORE(); 2725 } 2726 VNET_LIST_RUNLOCK_NOSLEEP(); 2727 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); 2728 } 2729 2730 static void 2731 ip_fini(void *xtp) 2732 { 2733 2734 callout_stop(&ipport_tick_callout); 2735 } 2736 2737 /* 2738 * The ipport_callout should start running at about the time we attach the 2739 * inet or inet6 domains. 2740 */ 2741 static void 2742 ipport_tick_init(const void *unused __unused) 2743 { 2744 2745 /* Start ipport_tick. */ 2746 callout_init(&ipport_tick_callout, 1); 2747 callout_reset(&ipport_tick_callout, 1, ipport_tick, NULL); 2748 EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, 2749 SHUTDOWN_PRI_DEFAULT); 2750 } 2751 SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, 2752 ipport_tick_init, NULL); 2753 2754 void 2755 inp_wlock(struct inpcb *inp) 2756 { 2757 2758 INP_WLOCK(inp); 2759 } 2760 2761 void 2762 inp_wunlock(struct inpcb *inp) 2763 { 2764 2765 INP_WUNLOCK(inp); 2766 } 2767 2768 void 2769 inp_rlock(struct inpcb *inp) 2770 { 2771 2772 INP_RLOCK(inp); 2773 } 2774 2775 void 2776 inp_runlock(struct inpcb *inp) 2777 { 2778 2779 INP_RUNLOCK(inp); 2780 } 2781 2782 #ifdef INVARIANT_SUPPORT 2783 void 2784 inp_lock_assert(struct inpcb *inp) 2785 { 2786 2787 INP_WLOCK_ASSERT(inp); 2788 } 2789 2790 void 2791 inp_unlock_assert(struct inpcb *inp) 2792 { 2793 2794 INP_UNLOCK_ASSERT(inp); 2795 } 2796 #endif 2797 2798 void 2799 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) 2800 { 2801 struct inpcb *inp; 2802 
2803 INP_INFO_WLOCK(&V_tcbinfo); 2804 CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { 2805 INP_WLOCK(inp); 2806 func(inp, arg); 2807 INP_WUNLOCK(inp); 2808 } 2809 INP_INFO_WUNLOCK(&V_tcbinfo); 2810 } 2811 2812 struct socket * 2813 inp_inpcbtosocket(struct inpcb *inp) 2814 { 2815 2816 INP_WLOCK_ASSERT(inp); 2817 return (inp->inp_socket); 2818 } 2819 2820 struct tcpcb * 2821 inp_inpcbtotcpcb(struct inpcb *inp) 2822 { 2823 2824 INP_WLOCK_ASSERT(inp); 2825 return ((struct tcpcb *)inp->inp_ppcb); 2826 } 2827 2828 int 2829 inp_ip_tos_get(const struct inpcb *inp) 2830 { 2831 2832 return (inp->inp_ip_tos); 2833 } 2834 2835 void 2836 inp_ip_tos_set(struct inpcb *inp, int val) 2837 { 2838 2839 inp->inp_ip_tos = val; 2840 } 2841 2842 void 2843 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 2844 uint32_t *faddr, uint16_t *fp) 2845 { 2846 2847 INP_LOCK_ASSERT(inp); 2848 *laddr = inp->inp_laddr.s_addr; 2849 *faddr = inp->inp_faddr.s_addr; 2850 *lp = inp->inp_lport; 2851 *fp = inp->inp_fport; 2852 } 2853 2854 struct inpcb * 2855 so_sotoinpcb(struct socket *so) 2856 { 2857 2858 return (sotoinpcb(so)); 2859 } 2860 2861 struct tcpcb * 2862 so_sototcpcb(struct socket *so) 2863 { 2864 2865 return (sototcpcb(so)); 2866 } 2867 2868 /* 2869 * Create an external-format (``xinpcb'') structure using the information in 2870 * the kernel-format in_pcb structure pointed to by inp. This is done to 2871 * reduce the spew of irrelevant information over this interface, to isolate 2872 * user code from changes in the kernel structure, and potentially to provide 2873 * information-hiding if we decide that some of this information should be 2874 * hidden from users. 
2875 */ 2876 void 2877 in_pcbtoxinpcb(const struct inpcb *inp, struct xinpcb *xi) 2878 { 2879 2880 bzero(xi, sizeof(*xi)); 2881 xi->xi_len = sizeof(struct xinpcb); 2882 if (inp->inp_socket) 2883 sotoxsocket(inp->inp_socket, &xi->xi_socket); 2884 bcopy(&inp->inp_inc, &xi->inp_inc, sizeof(struct in_conninfo)); 2885 xi->inp_gencnt = inp->inp_gencnt; 2886 xi->inp_ppcb = (uintptr_t)inp->inp_ppcb; 2887 xi->inp_flow = inp->inp_flow; 2888 xi->inp_flowid = inp->inp_flowid; 2889 xi->inp_flowtype = inp->inp_flowtype; 2890 xi->inp_flags = inp->inp_flags; 2891 xi->inp_flags2 = inp->inp_flags2; 2892 xi->inp_rss_listen_bucket = inp->inp_rss_listen_bucket; 2893 xi->in6p_cksum = inp->in6p_cksum; 2894 xi->in6p_hops = inp->in6p_hops; 2895 xi->inp_ip_tos = inp->inp_ip_tos; 2896 xi->inp_vflag = inp->inp_vflag; 2897 xi->inp_ip_ttl = inp->inp_ip_ttl; 2898 xi->inp_ip_p = inp->inp_ip_p; 2899 xi->inp_ip_minttl = inp->inp_ip_minttl; 2900 } 2901 2902 #ifdef DDB 2903 static void 2904 db_print_indent(int indent) 2905 { 2906 int i; 2907 2908 for (i = 0; i < indent; i++) 2909 db_printf(" "); 2910 } 2911 2912 static void 2913 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 2914 { 2915 char faddr_str[48], laddr_str[48]; 2916 2917 db_print_indent(indent); 2918 db_printf("%s at %p\n", name, inc); 2919 2920 indent += 2; 2921 2922 #ifdef INET6 2923 if (inc->inc_flags & INC_ISIPV6) { 2924 /* IPv6. */ 2925 ip6_sprintf(laddr_str, &inc->inc6_laddr); 2926 ip6_sprintf(faddr_str, &inc->inc6_faddr); 2927 } else 2928 #endif 2929 { 2930 /* IPv4. 
*/ 2931 inet_ntoa_r(inc->inc_laddr, laddr_str); 2932 inet_ntoa_r(inc->inc_faddr, faddr_str); 2933 } 2934 db_print_indent(indent); 2935 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 2936 ntohs(inc->inc_lport)); 2937 db_print_indent(indent); 2938 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 2939 ntohs(inc->inc_fport)); 2940 } 2941 2942 static void 2943 db_print_inpflags(int inp_flags) 2944 { 2945 int comma; 2946 2947 comma = 0; 2948 if (inp_flags & INP_RECVOPTS) { 2949 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 2950 comma = 1; 2951 } 2952 if (inp_flags & INP_RECVRETOPTS) { 2953 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 2954 comma = 1; 2955 } 2956 if (inp_flags & INP_RECVDSTADDR) { 2957 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 2958 comma = 1; 2959 } 2960 if (inp_flags & INP_ORIGDSTADDR) { 2961 db_printf("%sINP_ORIGDSTADDR", comma ? ", " : ""); 2962 comma = 1; 2963 } 2964 if (inp_flags & INP_HDRINCL) { 2965 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 2966 comma = 1; 2967 } 2968 if (inp_flags & INP_HIGHPORT) { 2969 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 2970 comma = 1; 2971 } 2972 if (inp_flags & INP_LOWPORT) { 2973 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 2974 comma = 1; 2975 } 2976 if (inp_flags & INP_ANONPORT) { 2977 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 2978 comma = 1; 2979 } 2980 if (inp_flags & INP_RECVIF) { 2981 db_printf("%sINP_RECVIF", comma ? ", " : ""); 2982 comma = 1; 2983 } 2984 if (inp_flags & INP_MTUDISC) { 2985 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 2986 comma = 1; 2987 } 2988 if (inp_flags & INP_RECVTTL) { 2989 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 2990 comma = 1; 2991 } 2992 if (inp_flags & INP_DONTFRAG) { 2993 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 2994 comma = 1; 2995 } 2996 if (inp_flags & INP_RECVTOS) { 2997 db_printf("%sINP_RECVTOS", comma ? 
", " : ""); 2998 comma = 1; 2999 } 3000 if (inp_flags & IN6P_IPV6_V6ONLY) { 3001 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 3002 comma = 1; 3003 } 3004 if (inp_flags & IN6P_PKTINFO) { 3005 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 3006 comma = 1; 3007 } 3008 if (inp_flags & IN6P_HOPLIMIT) { 3009 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 3010 comma = 1; 3011 } 3012 if (inp_flags & IN6P_HOPOPTS) { 3013 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 3014 comma = 1; 3015 } 3016 if (inp_flags & IN6P_DSTOPTS) { 3017 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 3018 comma = 1; 3019 } 3020 if (inp_flags & IN6P_RTHDR) { 3021 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 3022 comma = 1; 3023 } 3024 if (inp_flags & IN6P_RTHDRDSTOPTS) { 3025 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 3026 comma = 1; 3027 } 3028 if (inp_flags & IN6P_TCLASS) { 3029 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 3030 comma = 1; 3031 } 3032 if (inp_flags & IN6P_AUTOFLOWLABEL) { 3033 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 3034 comma = 1; 3035 } 3036 if (inp_flags & INP_TIMEWAIT) { 3037 db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); 3038 comma = 1; 3039 } 3040 if (inp_flags & INP_ONESBCAST) { 3041 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 3042 comma = 1; 3043 } 3044 if (inp_flags & INP_DROPPED) { 3045 db_printf("%sINP_DROPPED", comma ? ", " : ""); 3046 comma = 1; 3047 } 3048 if (inp_flags & INP_SOCKREF) { 3049 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 3050 comma = 1; 3051 } 3052 if (inp_flags & IN6P_RFC2292) { 3053 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 3054 comma = 1; 3055 } 3056 if (inp_flags & IN6P_MTU) { 3057 db_printf("IN6P_MTU%s", comma ? ", " : ""); 3058 comma = 1; 3059 } 3060 } 3061 3062 static void 3063 db_print_inpvflag(u_char inp_vflag) 3064 { 3065 int comma; 3066 3067 comma = 0; 3068 if (inp_vflag & INP_IPV4) { 3069 db_printf("%sINP_IPV4", comma ? 
", " : ""); 3070 comma = 1; 3071 } 3072 if (inp_vflag & INP_IPV6) { 3073 db_printf("%sINP_IPV6", comma ? ", " : ""); 3074 comma = 1; 3075 } 3076 if (inp_vflag & INP_IPV6PROTO) { 3077 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 3078 comma = 1; 3079 } 3080 } 3081 3082 static void 3083 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 3084 { 3085 3086 db_print_indent(indent); 3087 db_printf("%s at %p\n", name, inp); 3088 3089 indent += 2; 3090 3091 db_print_indent(indent); 3092 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 3093 3094 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 3095 3096 db_print_indent(indent); 3097 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 3098 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 3099 3100 db_print_indent(indent); 3101 db_printf("inp_label: %p inp_flags: 0x%x (", 3102 inp->inp_label, inp->inp_flags); 3103 db_print_inpflags(inp->inp_flags); 3104 db_printf(")\n"); 3105 3106 db_print_indent(indent); 3107 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 3108 inp->inp_vflag); 3109 db_print_inpvflag(inp->inp_vflag); 3110 db_printf(")\n"); 3111 3112 db_print_indent(indent); 3113 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 3114 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 3115 3116 db_print_indent(indent); 3117 #ifdef INET6 3118 if (inp->inp_vflag & INP_IPV6) { 3119 db_printf("in6p_options: %p in6p_outputopts: %p " 3120 "in6p_moptions: %p\n", inp->in6p_options, 3121 inp->in6p_outputopts, inp->in6p_moptions); 3122 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 3123 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 3124 inp->in6p_hops); 3125 } else 3126 #endif 3127 { 3128 db_printf("inp_ip_tos: %d inp_ip_options: %p " 3129 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 3130 inp->inp_options, inp->inp_moptions); 3131 } 3132 3133 db_print_indent(indent); 3134 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 3135 (uintmax_t)inp->inp_gencnt); 3136 } 3137 3138 
DB_SHOW_COMMAND(inpcb, db_show_inpcb) 3139 { 3140 struct inpcb *inp; 3141 3142 if (!have_addr) { 3143 db_printf("usage: show inpcb <addr>\n"); 3144 return; 3145 } 3146 inp = (struct inpcb *)addr; 3147 3148 db_print_inpcb(inp, "inpcb", 0); 3149 } 3150 #endif /* DDB */ 3151 3152 #ifdef RATELIMIT 3153 /* 3154 * Modify TX rate limit based on the existing "inp->inp_snd_tag", 3155 * if any. 3156 */ 3157 int 3158 in_pcbmodify_txrtlmt(struct inpcb *inp, uint32_t max_pacing_rate) 3159 { 3160 union if_snd_tag_modify_params params = { 3161 .rate_limit.max_rate = max_pacing_rate, 3162 .rate_limit.flags = M_NOWAIT, 3163 }; 3164 struct m_snd_tag *mst; 3165 struct ifnet *ifp; 3166 int error; 3167 3168 mst = inp->inp_snd_tag; 3169 if (mst == NULL) 3170 return (EINVAL); 3171 3172 ifp = mst->ifp; 3173 if (ifp == NULL) 3174 return (EINVAL); 3175 3176 if (ifp->if_snd_tag_modify == NULL) { 3177 error = EOPNOTSUPP; 3178 } else { 3179 error = ifp->if_snd_tag_modify(mst, ¶ms); 3180 } 3181 return (error); 3182 } 3183 3184 /* 3185 * Query existing TX rate limit based on the existing 3186 * "inp->inp_snd_tag", if any. 3187 */ 3188 int 3189 in_pcbquery_txrtlmt(struct inpcb *inp, uint32_t *p_max_pacing_rate) 3190 { 3191 union if_snd_tag_query_params params = { }; 3192 struct m_snd_tag *mst; 3193 struct ifnet *ifp; 3194 int error; 3195 3196 mst = inp->inp_snd_tag; 3197 if (mst == NULL) 3198 return (EINVAL); 3199 3200 ifp = mst->ifp; 3201 if (ifp == NULL) 3202 return (EINVAL); 3203 3204 if (ifp->if_snd_tag_query == NULL) { 3205 error = EOPNOTSUPP; 3206 } else { 3207 error = ifp->if_snd_tag_query(mst, ¶ms); 3208 if (error == 0 && p_max_pacing_rate != NULL) 3209 *p_max_pacing_rate = params.rate_limit.max_rate; 3210 } 3211 return (error); 3212 } 3213 3214 /* 3215 * Query existing TX queue level based on the existing 3216 * "inp->inp_snd_tag", if any. 
3217 */ 3218 int 3219 in_pcbquery_txrlevel(struct inpcb *inp, uint32_t *p_txqueue_level) 3220 { 3221 union if_snd_tag_query_params params = { }; 3222 struct m_snd_tag *mst; 3223 struct ifnet *ifp; 3224 int error; 3225 3226 mst = inp->inp_snd_tag; 3227 if (mst == NULL) 3228 return (EINVAL); 3229 3230 ifp = mst->ifp; 3231 if (ifp == NULL) 3232 return (EINVAL); 3233 3234 if (ifp->if_snd_tag_query == NULL) 3235 return (EOPNOTSUPP); 3236 3237 error = ifp->if_snd_tag_query(mst, ¶ms); 3238 if (error == 0 && p_txqueue_level != NULL) 3239 *p_txqueue_level = params.rate_limit.queue_level; 3240 return (error); 3241 } 3242 3243 /* 3244 * Allocate a new TX rate limit send tag from the network interface 3245 * given by the "ifp" argument and save it in "inp->inp_snd_tag": 3246 */ 3247 int 3248 in_pcbattach_txrtlmt(struct inpcb *inp, struct ifnet *ifp, 3249 uint32_t flowtype, uint32_t flowid, uint32_t max_pacing_rate, struct m_snd_tag **st) 3250 3251 { 3252 union if_snd_tag_alloc_params params = { 3253 .rate_limit.hdr.type = (max_pacing_rate == -1U) ? 
3254 IF_SND_TAG_TYPE_UNLIMITED : IF_SND_TAG_TYPE_RATE_LIMIT, 3255 .rate_limit.hdr.flowid = flowid, 3256 .rate_limit.hdr.flowtype = flowtype, 3257 .rate_limit.hdr.numa_domain = inp->inp_numa_domain, 3258 .rate_limit.max_rate = max_pacing_rate, 3259 .rate_limit.flags = M_NOWAIT, 3260 }; 3261 int error; 3262 3263 INP_WLOCK_ASSERT(inp); 3264 3265 if (*st != NULL) 3266 return (EINVAL); 3267 3268 if (ifp->if_snd_tag_alloc == NULL) { 3269 error = EOPNOTSUPP; 3270 } else { 3271 error = ifp->if_snd_tag_alloc(ifp, ¶ms, &inp->inp_snd_tag); 3272 3273 #ifdef INET 3274 if (error == 0) { 3275 counter_u64_add(rate_limit_set_ok, 1); 3276 counter_u64_add(rate_limit_active, 1); 3277 } else 3278 counter_u64_add(rate_limit_alloc_fail, 1); 3279 #endif 3280 } 3281 return (error); 3282 } 3283 3284 void 3285 in_pcbdetach_tag(struct ifnet *ifp, struct m_snd_tag *mst) 3286 { 3287 if (ifp == NULL) 3288 return; 3289 3290 /* 3291 * If the device was detached while we still had reference(s) 3292 * on the ifp, we assume if_snd_tag_free() was replaced with 3293 * stubs. 3294 */ 3295 ifp->if_snd_tag_free(mst); 3296 3297 /* release reference count on network interface */ 3298 if_rele(ifp); 3299 #ifdef INET 3300 counter_u64_add(rate_limit_active, -1); 3301 #endif 3302 } 3303 3304 /* 3305 * Free an existing TX rate limit tag based on the "inp->inp_snd_tag", 3306 * if any: 3307 */ 3308 void 3309 in_pcbdetach_txrtlmt(struct inpcb *inp) 3310 { 3311 struct m_snd_tag *mst; 3312 3313 INP_WLOCK_ASSERT(inp); 3314 3315 mst = inp->inp_snd_tag; 3316 inp->inp_snd_tag = NULL; 3317 3318 if (mst == NULL) 3319 return; 3320 3321 m_snd_tag_rele(mst); 3322 } 3323 3324 int 3325 in_pcboutput_txrtlmt_locked(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb, uint32_t max_pacing_rate) 3326 { 3327 int error; 3328 3329 /* 3330 * If the existing send tag is for the wrong interface due to 3331 * a route change, first drop the existing tag. 
Set the 3332 * CHANGED flag so that we will keep trying to allocate a new 3333 * tag if we fail to allocate one this time. 3334 */ 3335 if (inp->inp_snd_tag != NULL && inp->inp_snd_tag->ifp != ifp) { 3336 in_pcbdetach_txrtlmt(inp); 3337 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3338 } 3339 3340 /* 3341 * NOTE: When attaching to a network interface a reference is 3342 * made to ensure the network interface doesn't go away until 3343 * all ratelimit connections are gone. The network interface 3344 * pointers compared below represent valid network interfaces, 3345 * except when comparing towards NULL. 3346 */ 3347 if (max_pacing_rate == 0 && inp->inp_snd_tag == NULL) { 3348 error = 0; 3349 } else if (!(ifp->if_capenable & IFCAP_TXRTLMT)) { 3350 if (inp->inp_snd_tag != NULL) 3351 in_pcbdetach_txrtlmt(inp); 3352 error = 0; 3353 } else if (inp->inp_snd_tag == NULL) { 3354 /* 3355 * In order to utilize packet pacing with RSS, we need 3356 * to wait until there is a valid RSS hash before we 3357 * can proceed: 3358 */ 3359 if (M_HASHTYPE_GET(mb) == M_HASHTYPE_NONE) { 3360 error = EAGAIN; 3361 } else { 3362 error = in_pcbattach_txrtlmt(inp, ifp, M_HASHTYPE_GET(mb), 3363 mb->m_pkthdr.flowid, max_pacing_rate, &inp->inp_snd_tag); 3364 } 3365 } else { 3366 error = in_pcbmodify_txrtlmt(inp, max_pacing_rate); 3367 } 3368 if (error == 0 || error == EOPNOTSUPP) 3369 inp->inp_flags2 &= ~INP_RATE_LIMIT_CHANGED; 3370 3371 return (error); 3372 } 3373 3374 /* 3375 * This function should be called when the INP_RATE_LIMIT_CHANGED flag 3376 * is set in the fast path and will attach/detach/modify the TX rate 3377 * limit send tag based on the socket's so_max_pacing_rate value. 
3378 */ 3379 void 3380 in_pcboutput_txrtlmt(struct inpcb *inp, struct ifnet *ifp, struct mbuf *mb) 3381 { 3382 struct socket *socket; 3383 uint32_t max_pacing_rate; 3384 bool did_upgrade; 3385 int error; 3386 3387 if (inp == NULL) 3388 return; 3389 3390 socket = inp->inp_socket; 3391 if (socket == NULL) 3392 return; 3393 3394 if (!INP_WLOCKED(inp)) { 3395 /* 3396 * NOTE: If the write locking fails, we need to bail 3397 * out and use the non-ratelimited ring for the 3398 * transmit until there is a new chance to get the 3399 * write lock. 3400 */ 3401 if (!INP_TRY_UPGRADE(inp)) 3402 return; 3403 did_upgrade = 1; 3404 } else { 3405 did_upgrade = 0; 3406 } 3407 3408 /* 3409 * NOTE: The so_max_pacing_rate value is read unlocked, 3410 * because atomic updates are not required since the variable 3411 * is checked at every mbuf we send. It is assumed that the 3412 * variable read itself will be atomic. 3413 */ 3414 max_pacing_rate = socket->so_max_pacing_rate; 3415 3416 error = in_pcboutput_txrtlmt_locked(inp, ifp, mb, max_pacing_rate); 3417 3418 if (did_upgrade) 3419 INP_DOWNGRADE(inp); 3420 } 3421 3422 /* 3423 * Track route changes for TX rate limiting. 3424 */ 3425 void 3426 in_pcboutput_eagain(struct inpcb *inp) 3427 { 3428 bool did_upgrade; 3429 3430 if (inp == NULL) 3431 return; 3432 3433 if (inp->inp_snd_tag == NULL) 3434 return; 3435 3436 if (!INP_WLOCKED(inp)) { 3437 /* 3438 * NOTE: If the write locking fails, we need to bail 3439 * out and use the non-ratelimited ring for the 3440 * transmit until there is a new chance to get the 3441 * write lock. 
3442 */ 3443 if (!INP_TRY_UPGRADE(inp)) 3444 return; 3445 did_upgrade = 1; 3446 } else { 3447 did_upgrade = 0; 3448 } 3449 3450 /* detach rate limiting */ 3451 in_pcbdetach_txrtlmt(inp); 3452 3453 /* make sure new mbuf send tag allocation is made */ 3454 inp->inp_flags2 |= INP_RATE_LIMIT_CHANGED; 3455 3456 if (did_upgrade) 3457 INP_DOWNGRADE(inp); 3458 } 3459 3460 #ifdef INET 3461 static void 3462 rl_init(void *st) 3463 { 3464 rate_limit_active = counter_u64_alloc(M_WAITOK); 3465 rate_limit_alloc_fail = counter_u64_alloc(M_WAITOK); 3466 rate_limit_set_ok = counter_u64_alloc(M_WAITOK); 3467 } 3468 3469 SYSINIT(rl, SI_SUB_PROTO_DOMAININIT, SI_ORDER_ANY, rl_init, NULL); 3470 #endif 3471 #endif /* RATELIMIT */ 3472