1 /*- 2 * Copyright (c) 1982, 1986, 1991, 1993, 1995 3 * The Regents of the University of California. 4 * Copyright (c) 2007 Robert N. M. Watson 5 * All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 4. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 * 31 * @(#)in_pcb.c 8.4 (Berkeley) 5/24/95 32 */ 33 34 #include <sys/cdefs.h> 35 __FBSDID("$FreeBSD$"); 36 37 #include "opt_ddb.h" 38 #include "opt_ipsec.h" 39 #include "opt_inet6.h" 40 #include "opt_mac.h" 41 42 #include <sys/param.h> 43 #include <sys/systm.h> 44 #include <sys/malloc.h> 45 #include <sys/mbuf.h> 46 #include <sys/domain.h> 47 #include <sys/protosw.h> 48 #include <sys/socket.h> 49 #include <sys/socketvar.h> 50 #include <sys/priv.h> 51 #include <sys/proc.h> 52 #include <sys/jail.h> 53 #include <sys/kernel.h> 54 #include <sys/sysctl.h> 55 #include <sys/vimage.h> 56 57 #ifdef DDB 58 #include <ddb/ddb.h> 59 #endif 60 61 #include <vm/uma.h> 62 63 #include <net/if.h> 64 #include <net/if_types.h> 65 #include <net/route.h> 66 67 #include <netinet/in.h> 68 #include <netinet/in_pcb.h> 69 #include <netinet/in_var.h> 70 #include <netinet/ip_var.h> 71 #include <netinet/tcp_var.h> 72 #include <netinet/udp.h> 73 #include <netinet/udp_var.h> 74 #ifdef INET6 75 #include <netinet/ip6.h> 76 #include <netinet6/ip6_var.h> 77 #endif /* INET6 */ 78 79 80 #ifdef IPSEC 81 #include <netipsec/ipsec.h> 82 #include <netipsec/key.h> 83 #endif /* IPSEC */ 84 85 #include <security/mac/mac_framework.h> 86 87 /* 88 * These configure the range of local port addresses assigned to 89 * "unspecified" outgoing connections/packets/whatever. 90 */ 91 int ipport_lowfirstauto = IPPORT_RESERVED - 1; /* 1023 */ 92 int ipport_lowlastauto = IPPORT_RESERVEDSTART; /* 600 */ 93 int ipport_firstauto = IPPORT_EPHEMERALFIRST; /* 10000 */ 94 int ipport_lastauto = IPPORT_EPHEMERALLAST; /* 65535 */ 95 int ipport_hifirstauto = IPPORT_HIFIRSTAUTO; /* 49152 */ 96 int ipport_hilastauto = IPPORT_HILASTAUTO; /* 65535 */ 97 98 /* 99 * Reserved ports accessible only to root. There are significant 100 * security considerations that must be accounted for when changing these, 101 * but the security benefits can be great. Please be careful. 102 */ 103 int ipport_reservedhigh = IPPORT_RESERVED - 1; /* 1023 */ 104 int ipport_reservedlow = 0; 105 106 /* Variables dealing with random ephemeral port allocation. */ 107 int ipport_randomized = 1; /* user controlled via sysctl */ 108 int ipport_randomcps = 10; /* user controlled via sysctl */ 109 int ipport_randomtime = 45; /* user controlled via sysctl */ 110 int ipport_stoprandom = 0; /* toggled by ipport_tick */ 111 int ipport_tcpallocs; 112 int ipport_tcplastcount; 113 114 #define RANGECHK(var, min, max) \ 115 if ((var) < (min)) { (var) = (min); } \ 116 else if ((var) > (max)) { (var) = (max); } 117 118 static int 119 sysctl_net_ipport_check(SYSCTL_HANDLER_ARGS) 120 { 121 int error; 122 123 error = sysctl_handle_int(oidp, oidp->oid_arg1, oidp->oid_arg2, req); 124 if (error == 0) { 125 RANGECHK(V_ipport_lowfirstauto, 1, IPPORT_RESERVED - 1); 126 RANGECHK(V_ipport_lowlastauto, 1, IPPORT_RESERVED - 1); 127 RANGECHK(V_ipport_firstauto, IPPORT_RESERVED, IPPORT_MAX); 128 RANGECHK(V_ipport_lastauto, IPPORT_RESERVED, IPPORT_MAX); 129 RANGECHK(V_ipport_hifirstauto, IPPORT_RESERVED, IPPORT_MAX); 130 RANGECHK(V_ipport_hilastauto, IPPORT_RESERVED, IPPORT_MAX); 131 } 132 return (error); 133 } 134 135 #undef RANGECHK 136 137 SYSCTL_NODE(_net_inet_ip, IPPROTO_IP, portrange, CTLFLAG_RW, 0, "IP Ports"); 138 139 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 140 lowfirst, CTLTYPE_INT|CTLFLAG_RW, ipport_lowfirstauto, 0, 141 &sysctl_net_ipport_check, "I", ""); 142 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 143 lowlast, CTLTYPE_INT|CTLFLAG_RW, ipport_lowlastauto, 0, 144 &sysctl_net_ipport_check, "I", ""); 145 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 146 first, CTLTYPE_INT|CTLFLAG_RW, ipport_firstauto, 0, 147 &sysctl_net_ipport_check, "I", ""); 148 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 149 last, CTLTYPE_INT|CTLFLAG_RW, ipport_lastauto, 0, 150 &sysctl_net_ipport_check, "I", ""); 151 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 152 hifirst, CTLTYPE_INT|CTLFLAG_RW, ipport_hifirstauto, 0, 153 &sysctl_net_ipport_check, "I", ""); 154 SYSCTL_V_PROC(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 155 hilast, CTLTYPE_INT|CTLFLAG_RW, ipport_hilastauto, 0, 156 &sysctl_net_ipport_check, "I", ""); 157 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, 158 reservedhigh, CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedhigh, 0, ""); 159 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, reservedlow, 160 CTLFLAG_RW|CTLFLAG_SECURE, ipport_reservedlow, 0, ""); 161 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomized, 162 CTLFLAG_RW, ipport_randomized, 0, "Enable random port allocation"); 163 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomcps, 164 CTLFLAG_RW, ipport_randomcps, 0, "Maximum number of random port " 165 "allocations before switching to a sequental one"); 166 SYSCTL_V_INT(V_NET, vnet_inet, _net_inet_ip_portrange, OID_AUTO, randomtime, 167 CTLFLAG_RW, ipport_randomtime, 0, 168 "Minimum time to keep sequental port " 169 "allocation before switching to a random one"); 170 171 /* 172 * in_pcb.c: manage the Protocol Control Blocks. 173 * 174 * NOTE: It is assumed that most of these functions will be called with 175 * the pcbinfo lock held, and often, the inpcb lock held, as these utility 176 * functions often modify hash chains or addresses in pcbs. 177 */ 178 179 /* 180 * Allocate a PCB and associate it with the socket. 181 * On success return with the PCB locked. 182 */ 183 int 184 in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) 185 { 186 #ifdef INET6 187 INIT_VNET_INET6(curvnet); 188 #endif 189 struct inpcb *inp; 190 int error; 191 192 INP_INFO_WLOCK_ASSERT(pcbinfo); 193 error = 0; 194 inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); 195 if (inp == NULL) 196 return (ENOBUFS); 197 bzero(inp, inp_zero_size); 198 inp->inp_pcbinfo = pcbinfo; 199 inp->inp_socket = so; 200 inp->inp_cred = crhold(so->so_cred); 201 inp->inp_inc.inc_fibnum = so->so_fibnum; 202 #ifdef MAC 203 error = mac_inpcb_init(inp, M_NOWAIT); 204 if (error != 0) 205 goto out; 206 SOCK_LOCK(so); 207 mac_inpcb_create(so, inp); 208 SOCK_UNLOCK(so); 209 #endif 210 211 #ifdef IPSEC 212 error = ipsec_init_policy(so, &inp->inp_sp); 213 if (error != 0) { 214 #ifdef MAC 215 mac_inpcb_destroy(inp); 216 #endif 217 goto out; 218 } 219 #endif /*IPSEC*/ 220 #ifdef INET6 221 if (INP_SOCKAF(so) == AF_INET6) { 222 inp->inp_vflag |= INP_IPV6PROTO; 223 if (V_ip6_v6only) 224 inp->inp_flags |= IN6P_IPV6_V6ONLY; 225 } 226 #endif 227 LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); 228 pcbinfo->ipi_count++; 229 so->so_pcb = (caddr_t)inp; 230 #ifdef INET6 231 if (V_ip6_auto_flowlabel) 232 inp->inp_flags |= IN6P_AUTOFLOWLABEL; 233 #endif 234 INP_WLOCK(inp); 235 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 236 237 #if defined(IPSEC) || defined(MAC) 238 out: 239 if (error != 0) { 240 crfree(inp->inp_cred); 241 uma_zfree(pcbinfo->ipi_zone, inp); 242 } 243 #endif 244 return (error); 245 } 246 247 int 248 in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 249 { 250 int anonport, error; 251 252 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 253 INP_WLOCK_ASSERT(inp); 254 255 if (inp->inp_lport != 0 || inp->inp_laddr.s_addr != INADDR_ANY) 256 return (EINVAL); 257 anonport = inp->inp_lport == 0 && (nam == NULL || 258 ((struct sockaddr_in *)nam)->sin_port == 0); 259 error = in_pcbbind_setup(inp, nam, &inp->inp_laddr.s_addr, 260 &inp->inp_lport, cred); 261 if (error) 262 return (error); 263 if (in_pcbinshash(inp) != 0) { 264 inp->inp_laddr.s_addr = INADDR_ANY; 265 inp->inp_lport = 0; 266 return (EAGAIN); 267 } 268 if (anonport) 269 inp->inp_flags |= INP_ANONPORT; 270 return (0); 271 } 272 273 /* 274 * Set up a bind operation on a PCB, performing port allocation 275 * as required, but do not actually modify the PCB. Callers can 276 * either complete the bind by setting inp_laddr/inp_lport and 277 * calling in_pcbinshash(), or they can just use the resulting 278 * port and address to authorise the sending of a once-off packet. 279 * 280 * On error, the values of *laddrp and *lportp are not changed. 281 */ 282 int 283 in_pcbbind_setup(struct inpcb *inp, struct sockaddr *nam, in_addr_t *laddrp, 284 u_short *lportp, struct ucred *cred) 285 { 286 INIT_VNET_INET(inp->inp_vnet); 287 struct socket *so = inp->inp_socket; 288 unsigned short *lastport; 289 struct sockaddr_in *sin; 290 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 291 struct in_addr laddr; 292 u_short lport = 0; 293 int wild = 0, reuseport = (so->so_options & SO_REUSEPORT); 294 int error, prison = 0; 295 int dorandom; 296 297 /* 298 * Because no actual state changes occur here, a global write lock on 299 * the pcbinfo isn't required. 300 */ 301 INP_INFO_LOCK_ASSERT(pcbinfo); 302 INP_LOCK_ASSERT(inp); 303 304 if (TAILQ_EMPTY(&V_in_ifaddrhead)) /* XXX broken! */ 305 return (EADDRNOTAVAIL); 306 laddr.s_addr = *laddrp; 307 if (nam != NULL && laddr.s_addr != INADDR_ANY) 308 return (EINVAL); 309 if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) 310 wild = INPLOOKUP_WILDCARD; 311 if (nam) { 312 sin = (struct sockaddr_in *)nam; 313 if (nam->sa_len != sizeof (*sin)) 314 return (EINVAL); 315 #ifdef notdef 316 /* 317 * We should check the family, but old programs 318 * incorrectly fail to initialize it. 319 */ 320 if (sin->sin_family != AF_INET) 321 return (EAFNOSUPPORT); 322 #endif 323 if (sin->sin_addr.s_addr != INADDR_ANY) 324 if (prison_ip(cred, 0, &sin->sin_addr.s_addr)) 325 return(EINVAL); 326 if (sin->sin_port != *lportp) { 327 /* Don't allow the port to change. */ 328 if (*lportp != 0) 329 return (EINVAL); 330 lport = sin->sin_port; 331 } 332 /* NB: lport is left as 0 if the port isn't being changed. */ 333 if (IN_MULTICAST(ntohl(sin->sin_addr.s_addr))) { 334 /* 335 * Treat SO_REUSEADDR as SO_REUSEPORT for multicast; 336 * allow complete duplication of binding if 337 * SO_REUSEPORT is set, or if SO_REUSEADDR is set 338 * and a multicast address is bound on both 339 * new and duplicated sockets. 340 */ 341 if (so->so_options & SO_REUSEADDR) 342 reuseport = SO_REUSEADDR|SO_REUSEPORT; 343 } else if (sin->sin_addr.s_addr != INADDR_ANY) { 344 sin->sin_port = 0; /* yech... */ 345 bzero(&sin->sin_zero, sizeof(sin->sin_zero)); 346 if (ifa_ifwithaddr((struct sockaddr *)sin) == 0) 347 return (EADDRNOTAVAIL); 348 } 349 laddr = sin->sin_addr; 350 if (lport) { 351 struct inpcb *t; 352 struct tcptw *tw; 353 354 /* GROSS */ 355 if (ntohs(lport) <= V_ipport_reservedhigh && 356 ntohs(lport) >= V_ipport_reservedlow && 357 priv_check_cred(cred, PRIV_NETINET_RESERVEDPORT, 358 0)) 359 return (EACCES); 360 if (jailed(cred)) 361 prison = 1; 362 if (!IN_MULTICAST(ntohl(sin->sin_addr.s_addr)) && 363 priv_check_cred(inp->inp_cred, 364 PRIV_NETINET_REUSEPORT, 0) != 0) { 365 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 366 lport, prison ? 0 : INPLOOKUP_WILDCARD, 367 cred); 368 /* 369 * XXX 370 * This entire block sorely needs a rewrite. 371 */ 372 if (t && 373 ((t->inp_vflag & INP_TIMEWAIT) == 0) && 374 (so->so_type != SOCK_STREAM || 375 ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && 376 (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || 377 ntohl(t->inp_laddr.s_addr) != INADDR_ANY || 378 (t->inp_socket->so_options & 379 SO_REUSEPORT) == 0) && 380 (inp->inp_cred->cr_uid != 381 t->inp_cred->cr_uid)) 382 return (EADDRINUSE); 383 } 384 if (prison && prison_ip(cred, 0, &sin->sin_addr.s_addr)) 385 return (EADDRNOTAVAIL); 386 t = in_pcblookup_local(pcbinfo, sin->sin_addr, 387 lport, prison ? 0 : wild, cred); 388 if (t && (t->inp_vflag & INP_TIMEWAIT)) { 389 /* 390 * XXXRW: If an incpb has had its timewait 391 * state recycled, we treat the address as 392 * being in use (for now). This is better 393 * than a panic, but not desirable. 394 */ 395 tw = intotw(inp); 396 if (tw == NULL || 397 (reuseport & tw->tw_so_options) == 0) 398 return (EADDRINUSE); 399 } else if (t && 400 (reuseport & t->inp_socket->so_options) == 0) { 401 #ifdef INET6 402 if (ntohl(sin->sin_addr.s_addr) != 403 INADDR_ANY || 404 ntohl(t->inp_laddr.s_addr) != 405 INADDR_ANY || 406 INP_SOCKAF(so) == 407 INP_SOCKAF(t->inp_socket)) 408 #endif 409 return (EADDRINUSE); 410 } 411 } 412 } 413 if (*lportp != 0) 414 lport = *lportp; 415 if (lport == 0) { 416 u_short first, last, aux; 417 int count; 418 419 if (laddr.s_addr != INADDR_ANY) 420 if (prison_ip(cred, 0, &laddr.s_addr)) 421 return (EINVAL); 422 423 if (inp->inp_flags & INP_HIGHPORT) { 424 first = V_ipport_hifirstauto; /* sysctl */ 425 last = V_ipport_hilastauto; 426 lastport = &pcbinfo->ipi_lasthi; 427 } else if (inp->inp_flags & INP_LOWPORT) { 428 error = priv_check_cred(cred, 429 PRIV_NETINET_RESERVEDPORT, 0); 430 if (error) 431 return error; 432 first = V_ipport_lowfirstauto; /* 1023 */ 433 last = V_ipport_lowlastauto; /* 600 */ 434 lastport = &pcbinfo->ipi_lastlow; 435 } else { 436 first = V_ipport_firstauto; /* sysctl */ 437 last = V_ipport_lastauto; 438 lastport = &pcbinfo->ipi_lastport; 439 } 440 /* 441 * For UDP, use random port allocation as long as the user 442 * allows it. For TCP (and as of yet unknown) connections, 443 * use random port allocation only if the user allows it AND 444 * ipport_tick() allows it. 445 */ 446 if (V_ipport_randomized && 447 (!V_ipport_stoprandom || pcbinfo == &V_udbinfo)) 448 dorandom = 1; 449 else 450 dorandom = 0; 451 /* 452 * It makes no sense to do random port allocation if 453 * we have the only port available. 454 */ 455 if (first == last) 456 dorandom = 0; 457 /* Make sure to not include UDP packets in the count. */ 458 if (pcbinfo != &V_udbinfo) 459 V_ipport_tcpallocs++; 460 /* 461 * Simple check to ensure all ports are not used up causing 462 * a deadlock here. 463 */ 464 if (first > last) { 465 aux = first; 466 first = last; 467 last = aux; 468 } 469 470 if (dorandom) 471 *lastport = first + 472 (arc4random() % (last - first)); 473 474 count = last - first; 475 476 do { 477 if (count-- < 0) /* completely used? */ 478 return (EADDRNOTAVAIL); 479 ++*lastport; 480 if (*lastport < first || *lastport > last) 481 *lastport = first; 482 lport = htons(*lastport); 483 } while (in_pcblookup_local(pcbinfo, laddr, 484 lport, wild, cred)); 485 } 486 if (prison_ip(cred, 0, &laddr.s_addr)) 487 return (EINVAL); 488 *laddrp = laddr.s_addr; 489 *lportp = lport; 490 return (0); 491 } 492 493 /* 494 * Connect from a socket to a specified address. 495 * Both address and port must be specified in argument sin. 496 * If don't have a local address for this socket yet, 497 * then pick one. 498 */ 499 int 500 in_pcbconnect(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) 501 { 502 u_short lport, fport; 503 in_addr_t laddr, faddr; 504 int anonport, error; 505 506 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 507 INP_WLOCK_ASSERT(inp); 508 509 lport = inp->inp_lport; 510 laddr = inp->inp_laddr.s_addr; 511 anonport = (lport == 0); 512 error = in_pcbconnect_setup(inp, nam, &laddr, &lport, &faddr, &fport, 513 NULL, cred); 514 if (error) 515 return (error); 516 517 /* Do the initial binding of the local address if required. */ 518 if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { 519 inp->inp_lport = lport; 520 inp->inp_laddr.s_addr = laddr; 521 if (in_pcbinshash(inp) != 0) { 522 inp->inp_laddr.s_addr = INADDR_ANY; 523 inp->inp_lport = 0; 524 return (EAGAIN); 525 } 526 } 527 528 /* Commit the remaining changes. */ 529 inp->inp_lport = lport; 530 inp->inp_laddr.s_addr = laddr; 531 inp->inp_faddr.s_addr = faddr; 532 inp->inp_fport = fport; 533 in_pcbrehash(inp); 534 535 if (anonport) 536 inp->inp_flags |= INP_ANONPORT; 537 return (0); 538 } 539 540 /* 541 * Do proper source address selection on an unbound socket in case 542 * of connect. Take jails into account as well. 543 */ 544 static int 545 in_pcbladdr(struct inpcb *inp, struct in_addr *faddr, struct in_addr *laddr, 546 struct ucred *cred) 547 { 548 struct in_ifaddr *ia; 549 struct ifaddr *ifa; 550 struct sockaddr *sa; 551 struct sockaddr_in *sin; 552 struct route sro; 553 int error; 554 555 KASSERT(laddr != NULL, ("%s: null laddr", __func__)); 556 557 error = 0; 558 ia = NULL; 559 bzero(&sro, sizeof(sro)); 560 561 sin = (struct sockaddr_in *)&sro.ro_dst; 562 sin->sin_family = AF_INET; 563 sin->sin_len = sizeof(struct sockaddr_in); 564 sin->sin_addr.s_addr = faddr->s_addr; 565 566 /* 567 * If route is known our src addr is taken from the i/f, 568 * else punt. 569 * 570 * Find out route to destination. 571 */ 572 if ((inp->inp_socket->so_options & SO_DONTROUTE) == 0) 573 in_rtalloc_ign(&sro, RTF_CLONING, inp->inp_inc.inc_fibnum); 574 575 /* 576 * If we found a route, use the address corresponding to 577 * the outgoing interface. 578 * 579 * Otherwise assume faddr is reachable on a directly connected 580 * network and try to find a corresponding interface to take 581 * the source address from. 582 */ 583 if (sro.ro_rt == NULL || sro.ro_rt->rt_ifp == NULL) { 584 struct ifnet *ifp; 585 586 ia = ifatoia(ifa_ifwithdstaddr((struct sockaddr *)sin)); 587 if (ia == NULL) 588 ia = ifatoia(ifa_ifwithnet((struct sockaddr *)sin)); 589 if (ia == NULL) { 590 error = ENETUNREACH; 591 goto done; 592 } 593 594 if (cred == NULL || !jailed(cred)) { 595 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 596 goto done; 597 } 598 599 ifp = ia->ia_ifp; 600 ia = NULL; 601 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 602 603 sa = ifa->ifa_addr; 604 if (sa->sa_family != AF_INET) 605 continue; 606 sin = (struct sockaddr_in *)sa; 607 if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { 608 ia = (struct in_ifaddr *)ifa; 609 break; 610 } 611 } 612 if (ia != NULL) { 613 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 614 goto done; 615 } 616 617 /* 3. As a last resort return the 'default' jail address. */ 618 laddr->s_addr = htonl(prison_getip(cred)); 619 goto done; 620 } 621 622 /* 623 * If the outgoing interface on the route found is not 624 * a loopback interface, use the address from that interface. 625 * In case of jails do those three steps: 626 * 1. check if the interface address belongs to the jail. If so use it. 627 * 2. check if we have any address on the outgoing interface 628 * belonging to this jail. If so use it. 629 * 3. as a last resort return the 'default' jail address. 630 */ 631 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) == 0) { 632 633 /* If not jailed, use the default returned. */ 634 if (cred == NULL || !jailed(cred)) { 635 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; 636 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 637 goto done; 638 } 639 640 /* Jailed. */ 641 /* 1. Check if the iface address belongs to the jail. */ 642 sin = (struct sockaddr_in *)sro.ro_rt->rt_ifa->ifa_addr; 643 if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { 644 ia = (struct in_ifaddr *)sro.ro_rt->rt_ifa; 645 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 646 goto done; 647 } 648 649 /* 650 * 2. Check if we have any address on the outgoing interface 651 * belonging to this jail. 652 */ 653 TAILQ_FOREACH(ifa, &sro.ro_rt->rt_ifp->if_addrhead, ifa_link) { 654 655 sa = ifa->ifa_addr; 656 if (sa->sa_family != AF_INET) 657 continue; 658 sin = (struct sockaddr_in *)sa; 659 if (htonl(prison_getip(cred)) == sin->sin_addr.s_addr) { 660 ia = (struct in_ifaddr *)ifa; 661 break; 662 } 663 } 664 if (ia != NULL) { 665 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 666 goto done; 667 } 668 669 /* 3. As a last resort return the 'default' jail address. */ 670 laddr->s_addr = htonl(prison_getip(cred)); 671 goto done; 672 } 673 674 /* 675 * The outgoing interface is marked with 'loopback net', so a route 676 * to ourselves is here. 677 * Try to find the interface of the destination address and then 678 * take the address from there. That interface is not necessarily 679 * a loopback interface. 680 * In case of jails, check that it is an address of the jail 681 * and if we cannot find, fall back to the 'default' jail address. 682 */ 683 if ((sro.ro_rt->rt_ifp->if_flags & IFF_LOOPBACK) != 0) { 684 struct sockaddr_in sain; 685 686 bzero(&sain, sizeof(struct sockaddr_in)); 687 sain.sin_family = AF_INET; 688 sain.sin_len = sizeof(struct sockaddr_in); 689 sain.sin_addr.s_addr = faddr->s_addr; 690 691 ia = ifatoia(ifa_ifwithdstaddr(sintosa(&sain))); 692 if (ia == NULL) 693 ia = ifatoia(ifa_ifwithnet(sintosa(&sain))); 694 695 if (cred == NULL || !jailed(cred)) { 696 if (ia == NULL) { 697 error = ENETUNREACH; 698 goto done; 699 } 700 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 701 goto done; 702 } 703 704 /* Jailed. */ 705 if (ia != NULL) { 706 struct ifnet *ifp; 707 708 ifp = ia->ia_ifp; 709 ia = NULL; 710 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 711 712 sa = ifa->ifa_addr; 713 if (sa->sa_family != AF_INET) 714 continue; 715 sin = (struct sockaddr_in *)sa; 716 if (htonl(prison_getip(cred)) == 717 sin->sin_addr.s_addr) { 718 ia = (struct in_ifaddr *)ifa; 719 break; 720 } 721 } 722 if (ia != NULL) { 723 laddr->s_addr = ia->ia_addr.sin_addr.s_addr; 724 goto done; 725 } 726 } 727 728 /* 3. As a last resort return the 'default' jail address. */ 729 laddr->s_addr = htonl(prison_getip(cred)); 730 goto done; 731 } 732 733 done: 734 if (sro.ro_rt != NULL) 735 RTFREE(sro.ro_rt); 736 return (error); 737 } 738 739 /* 740 * Set up for a connect from a socket to the specified address. 741 * On entry, *laddrp and *lportp should contain the current local 742 * address and port for the PCB; these are updated to the values 743 * that should be placed in inp_laddr and inp_lport to complete 744 * the connect. 745 * 746 * On success, *faddrp and *fportp will be set to the remote address 747 * and port. These are not updated in the error case. 748 * 749 * If the operation fails because the connection already exists, 750 * *oinpp will be set to the PCB of that connection so that the 751 * caller can decide to override it. In all other cases, *oinpp 752 * is set to NULL. 753 */ 754 int 755 in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, 756 in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, 757 struct inpcb **oinpp, struct ucred *cred) 758 { 759 INIT_VNET_INET(inp->inp_vnet); 760 struct sockaddr_in *sin = (struct sockaddr_in *)nam; 761 struct in_ifaddr *ia; 762 struct inpcb *oinp; 763 struct in_addr laddr, faddr; 764 u_short lport, fport; 765 int error; 766 767 /* 768 * Because a global state change doesn't actually occur here, a read 769 * lock is sufficient. 770 */ 771 INP_INFO_LOCK_ASSERT(inp->inp_pcbinfo); 772 INP_LOCK_ASSERT(inp); 773 774 if (oinpp != NULL) 775 *oinpp = NULL; 776 if (nam->sa_len != sizeof (*sin)) 777 return (EINVAL); 778 if (sin->sin_family != AF_INET) 779 return (EAFNOSUPPORT); 780 if (sin->sin_port == 0) 781 return (EADDRNOTAVAIL); 782 laddr.s_addr = *laddrp; 783 lport = *lportp; 784 faddr = sin->sin_addr; 785 fport = sin->sin_port; 786 787 if (!TAILQ_EMPTY(&V_in_ifaddrhead)) { 788 /* 789 * If the destination address is INADDR_ANY, 790 * use the primary local address. 791 * If the supplied address is INADDR_BROADCAST, 792 * and the primary interface supports broadcast, 793 * choose the broadcast address for that interface. 794 */ 795 if (faddr.s_addr == INADDR_ANY) 796 faddr = IA_SIN(TAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; 797 else if (faddr.s_addr == (u_long)INADDR_BROADCAST && 798 (TAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & 799 IFF_BROADCAST)) 800 faddr = satosin(&TAILQ_FIRST( 801 &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; 802 } 803 if (laddr.s_addr == INADDR_ANY) { 804 error = in_pcbladdr(inp, &faddr, &laddr, cred); 805 if (error) 806 return (error); 807 808 /* 809 * If the destination address is multicast and an outgoing 810 * interface has been set as a multicast option, use the 811 * address of that interface as our source address. 812 */ 813 if (IN_MULTICAST(ntohl(faddr.s_addr)) && 814 inp->inp_moptions != NULL) { 815 struct ip_moptions *imo; 816 struct ifnet *ifp; 817 818 imo = inp->inp_moptions; 819 if (imo->imo_multicast_ifp != NULL) { 820 ifp = imo->imo_multicast_ifp; 821 TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) 822 if (ia->ia_ifp == ifp) 823 break; 824 if (ia == NULL) 825 return (EADDRNOTAVAIL); 826 laddr = ia->ia_addr.sin_addr; 827 } 828 } 829 } 830 831 oinp = in_pcblookup_hash(inp->inp_pcbinfo, faddr, fport, laddr, lport, 832 0, NULL); 833 if (oinp != NULL) { 834 if (oinpp != NULL) 835 *oinpp = oinp; 836 return (EADDRINUSE); 837 } 838 if (lport == 0) { 839 error = in_pcbbind_setup(inp, NULL, &laddr.s_addr, &lport, 840 cred); 841 if (error) 842 return (error); 843 } 844 *laddrp = laddr.s_addr; 845 *lportp = lport; 846 *faddrp = faddr.s_addr; 847 *fportp = fport; 848 return (0); 849 } 850 851 void 852 in_pcbdisconnect(struct inpcb *inp) 853 { 854 855 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 856 INP_WLOCK_ASSERT(inp); 857 858 inp->inp_faddr.s_addr = INADDR_ANY; 859 inp->inp_fport = 0; 860 in_pcbrehash(inp); 861 } 862 863 /* 864 * Historically, in_pcbdetach() included the functionality now found in 865 * in_pcbfree() and in_pcbdrop(). They are now broken out to reflect the 866 * more complex life cycle of TCP. 867 * 868 * in_pcbdetach() is responsibe for disconnecting the socket from an inpcb. 869 * For most protocols, this will be invoked immediately prior to calling 870 * in_pcbfree(). However, for TCP the inpcb may significantly outlive the 871 * socket, in which case in_pcbfree() may be deferred. 872 */ 873 void 874 in_pcbdetach(struct inpcb *inp) 875 { 876 877 KASSERT(inp->inp_socket != NULL, ("in_pcbdetach: inp_socket == NULL")); 878 879 inp->inp_socket->so_pcb = NULL; 880 inp->inp_socket = NULL; 881 } 882 883 /* 884 * in_pcbfree() is responsible for freeing an already-detached inpcb, as well 885 * as removing it from any global inpcb lists it might be on. 886 */ 887 void 888 in_pcbfree(struct inpcb *inp) 889 { 890 struct inpcbinfo *ipi = inp->inp_pcbinfo; 891 892 KASSERT(inp->inp_socket == NULL, ("in_pcbfree: inp_socket != NULL")); 893 894 INP_INFO_WLOCK_ASSERT(ipi); 895 INP_WLOCK_ASSERT(inp); 896 897 #ifdef IPSEC 898 ipsec4_delete_pcbpolicy(inp); 899 #endif /*IPSEC*/ 900 inp->inp_gencnt = ++ipi->ipi_gencnt; 901 in_pcbremlists(inp); 902 if (inp->inp_options) 903 (void)m_free(inp->inp_options); 904 if (inp->inp_moptions != NULL) 905 inp_freemoptions(inp->inp_moptions); 906 inp->inp_vflag = 0; 907 crfree(inp->inp_cred); 908 909 #ifdef MAC 910 mac_inpcb_destroy(inp); 911 #endif 912 INP_WUNLOCK(inp); 913 uma_zfree(ipi->ipi_zone, inp); 914 } 915 916 /* 917 * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and 918 * port reservation, and preventing it from being returned by inpcb lookups. 919 * 920 * It is used by TCP to mark an inpcb as unused and avoid future packet 921 * delivery or event notification when a socket remains open but TCP has 922 * closed. This might occur as a result of a shutdown()-initiated TCP close 923 * or a RST on the wire, and allows the port binding to be reused while still 924 * maintaining the invariant that so_pcb always points to a valid inpcb until 925 * in_pcbdetach(). 926 * 927 * XXXRW: An inp_lport of 0 is used to indicate that the inpcb is not on hash 928 * lists, but can lead to confusing netstat output, as open sockets with 929 * closed TCP connections will no longer appear to have their bound port 930 * number. An explicit flag would be better, as it would allow us to leave 931 * the port number intact after the connection is dropped. 932 * 933 * XXXRW: Possibly in_pcbdrop() should also prevent future notifications by 934 * in_pcbnotifyall() and in_pcbpurgeif0()? 935 */ 936 void 937 in_pcbdrop(struct inpcb *inp) 938 { 939 940 INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); 941 INP_WLOCK_ASSERT(inp); 942 943 inp->inp_vflag |= INP_DROPPED; 944 if (inp->inp_lport) { 945 struct inpcbport *phd = inp->inp_phd; 946 947 LIST_REMOVE(inp, inp_hash); 948 LIST_REMOVE(inp, inp_portlist); 949 if (LIST_FIRST(&phd->phd_pcblist) == NULL) { 950 LIST_REMOVE(phd, phd_hash); 951 free(phd, M_PCB); 952 } 953 inp->inp_lport = 0; 954 } 955 } 956 957 /* 958 * Common routines to return the socket addresses associated with inpcbs. 959 */ 960 struct sockaddr * 961 in_sockaddr(in_port_t port, struct in_addr *addr_p) 962 { 963 struct sockaddr_in *sin; 964 965 MALLOC(sin, struct sockaddr_in *, sizeof *sin, M_SONAME, 966 M_WAITOK | M_ZERO); 967 sin->sin_family = AF_INET; 968 sin->sin_len = sizeof(*sin); 969 sin->sin_addr = *addr_p; 970 sin->sin_port = port; 971 972 return (struct sockaddr *)sin; 973 } 974 975 int 976 in_getsockaddr(struct socket *so, struct sockaddr **nam) 977 { 978 struct inpcb *inp; 979 struct in_addr addr; 980 in_port_t port; 981 982 inp = sotoinpcb(so); 983 KASSERT(inp != NULL, ("in_getsockaddr: inp == NULL")); 984 985 INP_RLOCK(inp); 986 port = inp->inp_lport; 987 addr = inp->inp_laddr; 988 INP_RUNLOCK(inp); 989 990 *nam = in_sockaddr(port, &addr); 991 return 0; 992 } 993 994 int 995 in_getpeeraddr(struct socket *so, struct sockaddr **nam) 996 { 997 struct inpcb *inp; 998 struct in_addr addr; 999 in_port_t port; 1000 1001 inp = sotoinpcb(so); 1002 KASSERT(inp != NULL, ("in_getpeeraddr: inp == NULL")); 1003 1004 INP_RLOCK(inp); 1005 port = inp->inp_fport; 1006 addr = inp->inp_faddr; 1007 INP_RUNLOCK(inp); 1008 1009 *nam = in_sockaddr(port, &addr); 1010 return 0; 1011 } 1012 1013 void 1014 in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, 1015 struct inpcb *(*notify)(struct inpcb *, int)) 1016 { 1017 struct inpcb *inp, *inp_temp; 1018 1019 INP_INFO_WLOCK(pcbinfo); 1020 LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { 1021 INP_WLOCK(inp); 1022 #ifdef INET6 1023 if ((inp->inp_vflag & INP_IPV4) == 0) { 1024 INP_WUNLOCK(inp); 1025 continue; 1026 } 1027 #endif 1028 if (inp->inp_faddr.s_addr != faddr.s_addr || 1029 inp->inp_socket == NULL) { 1030 INP_WUNLOCK(inp); 1031 continue; 1032 } 1033 if ((*notify)(inp, errno)) 1034 INP_WUNLOCK(inp); 1035 } 1036 INP_INFO_WUNLOCK(pcbinfo); 1037 } 1038 1039 void 1040 in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) 1041 { 1042 struct inpcb *inp; 1043 struct ip_moptions *imo; 1044 int i, gap; 1045 1046 INP_INFO_RLOCK(pcbinfo); 1047 LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { 1048 INP_WLOCK(inp); 1049 imo = inp->inp_moptions; 1050 if ((inp->inp_vflag & INP_IPV4) && 1051 imo != NULL) { 1052 /* 1053 * Unselect the outgoing interface if it is being 1054 * detached. 1055 */ 1056 if (imo->imo_multicast_ifp == ifp) 1057 imo->imo_multicast_ifp = NULL; 1058 1059 /* 1060 * Drop multicast group membership if we joined 1061 * through the interface being detached. 1062 */ 1063 for (i = 0, gap = 0; i < imo->imo_num_memberships; 1064 i++) { 1065 if (imo->imo_membership[i]->inm_ifp == ifp) { 1066 in_delmulti(imo->imo_membership[i]); 1067 gap++; 1068 } else if (gap != 0) 1069 imo->imo_membership[i - gap] = 1070 imo->imo_membership[i]; 1071 } 1072 imo->imo_num_memberships -= gap; 1073 } 1074 INP_WUNLOCK(inp); 1075 } 1076 INP_INFO_RUNLOCK(pcbinfo); 1077 } 1078 1079 /* 1080 * Lookup a PCB based on the local address and port. 1081 */ 1082 #define INP_LOOKUP_MAPPED_PCB_COST 3 1083 struct inpcb * 1084 in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, 1085 u_short lport, int wild_okay, struct ucred *cred) 1086 { 1087 struct inpcb *inp; 1088 #ifdef INET6 1089 int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; 1090 #else 1091 int matchwild = 3; 1092 #endif 1093 int wildcard; 1094 1095 INP_INFO_LOCK_ASSERT(pcbinfo); 1096 1097 if (!wild_okay) { 1098 struct inpcbhead *head; 1099 /* 1100 * Look for an unconnected (wildcard foreign addr) PCB that 1101 * matches the local address and port we're looking for. 1102 */ 1103 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 1104 0, pcbinfo->ipi_hashmask)]; 1105 LIST_FOREACH(inp, head, inp_hash) { 1106 #ifdef INET6 1107 if ((inp->inp_vflag & INP_IPV4) == 0) 1108 continue; 1109 #endif 1110 if (inp->inp_faddr.s_addr == INADDR_ANY && 1111 inp->inp_laddr.s_addr == laddr.s_addr && 1112 inp->inp_lport == lport) { 1113 /* 1114 * Found. 1115 */ 1116 return (inp); 1117 } 1118 } 1119 /* 1120 * Not found. 1121 */ 1122 return (NULL); 1123 } else { 1124 struct inpcbporthead *porthash; 1125 struct inpcbport *phd; 1126 struct inpcb *match = NULL; 1127 /* 1128 * Best fit PCB lookup. 1129 * 1130 * First see if this local port is in use by looking on the 1131 * port hash list. 1132 */ 1133 porthash = &pcbinfo->ipi_porthashbase[INP_PCBPORTHASH(lport, 1134 pcbinfo->ipi_porthashmask)]; 1135 LIST_FOREACH(phd, porthash, phd_hash) { 1136 if (phd->phd_port == lport) 1137 break; 1138 } 1139 if (phd != NULL) { 1140 /* 1141 * Port is in use by one or more PCBs. Look for best 1142 * fit. 1143 */ 1144 LIST_FOREACH(inp, &phd->phd_pcblist, inp_portlist) { 1145 wildcard = 0; 1146 #ifdef INET6 1147 if ((inp->inp_vflag & INP_IPV4) == 0) 1148 continue; 1149 /* 1150 * We never select the PCB that has 1151 * INP_IPV6 flag and is bound to :: if 1152 * we have another PCB which is bound 1153 * to 0.0.0.0. If a PCB has the 1154 * INP_IPV6 flag, then we set its cost 1155 * higher than IPv4 only PCBs. 1156 * 1157 * Note that the case only happens 1158 * when a socket is bound to ::, under 1159 * the condition that the use of the 1160 * mapped address is allowed. 1161 */ 1162 if ((inp->inp_vflag & INP_IPV6) != 0) 1163 wildcard += INP_LOOKUP_MAPPED_PCB_COST; 1164 #endif 1165 if (inp->inp_faddr.s_addr != INADDR_ANY) 1166 wildcard++; 1167 if (inp->inp_laddr.s_addr != INADDR_ANY) { 1168 if (laddr.s_addr == INADDR_ANY) 1169 wildcard++; 1170 else if (inp->inp_laddr.s_addr != laddr.s_addr) 1171 continue; 1172 } else { 1173 if (laddr.s_addr != INADDR_ANY) 1174 wildcard++; 1175 } 1176 if (wildcard < matchwild) { 1177 match = inp; 1178 matchwild = wildcard; 1179 if (matchwild == 0) { 1180 break; 1181 } 1182 } 1183 } 1184 } 1185 return (match); 1186 } 1187 } 1188 #undef INP_LOOKUP_MAPPED_PCB_COST 1189 1190 /* 1191 * Lookup PCB in hash list. 1192 */ 1193 struct inpcb * 1194 in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, 1195 u_int fport_arg, struct in_addr laddr, u_int lport_arg, int wildcard, 1196 struct ifnet *ifp) 1197 { 1198 struct inpcbhead *head; 1199 struct inpcb *inp; 1200 u_short fport = fport_arg, lport = lport_arg; 1201 1202 INP_INFO_LOCK_ASSERT(pcbinfo); 1203 1204 /* 1205 * First look for an exact match. 1206 */ 1207 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(faddr.s_addr, lport, fport, 1208 pcbinfo->ipi_hashmask)]; 1209 LIST_FOREACH(inp, head, inp_hash) { 1210 #ifdef INET6 1211 if ((inp->inp_vflag & INP_IPV4) == 0) 1212 continue; 1213 #endif 1214 if (inp->inp_faddr.s_addr == faddr.s_addr && 1215 inp->inp_laddr.s_addr == laddr.s_addr && 1216 inp->inp_fport == fport && 1217 inp->inp_lport == lport) 1218 return (inp); 1219 } 1220 1221 /* 1222 * Then look for a wildcard match, if requested. 1223 */ 1224 if (wildcard) { 1225 struct inpcb *local_wild = NULL; 1226 #ifdef INET6 1227 struct inpcb *local_wild_mapped = NULL; 1228 #endif 1229 1230 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(INADDR_ANY, lport, 1231 0, pcbinfo->ipi_hashmask)]; 1232 LIST_FOREACH(inp, head, inp_hash) { 1233 #ifdef INET6 1234 if ((inp->inp_vflag & INP_IPV4) == 0) 1235 continue; 1236 #endif 1237 if (inp->inp_faddr.s_addr == INADDR_ANY && 1238 inp->inp_lport == lport) { 1239 if (ifp && ifp->if_type == IFT_FAITH && 1240 (inp->inp_flags & INP_FAITH) == 0) 1241 continue; 1242 if (inp->inp_laddr.s_addr == laddr.s_addr) 1243 return (inp); 1244 else if (inp->inp_laddr.s_addr == INADDR_ANY) { 1245 #ifdef INET6 1246 if (INP_CHECK_SOCKAF(inp->inp_socket, 1247 AF_INET6)) 1248 local_wild_mapped = inp; 1249 else 1250 #endif 1251 local_wild = inp; 1252 } 1253 } 1254 } 1255 #ifdef INET6 1256 if (local_wild == NULL) 1257 return (local_wild_mapped); 1258 #endif 1259 return (local_wild); 1260 } 1261 return (NULL); 1262 } 1263 1264 /* 1265 * Insert PCB onto various hash lists. 1266 */ 1267 int 1268 in_pcbinshash(struct inpcb *inp) 1269 { 1270 struct inpcbhead *pcbhash; 1271 struct inpcbporthead *pcbporthash; 1272 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1273 struct inpcbport *phd; 1274 u_int32_t hashkey_faddr; 1275 1276 INP_INFO_WLOCK_ASSERT(pcbinfo); 1277 INP_WLOCK_ASSERT(inp); 1278 1279 #ifdef INET6 1280 if (inp->inp_vflag & INP_IPV6) 1281 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; 1282 else 1283 #endif /* INET6 */ 1284 hashkey_faddr = inp->inp_faddr.s_addr; 1285 1286 pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 1287 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 1288 1289 pcbporthash = &pcbinfo->ipi_porthashbase[ 1290 INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; 1291 1292 /* 1293 * Go through port list and look for a head for this lport. 1294 */ 1295 LIST_FOREACH(phd, pcbporthash, phd_hash) { 1296 if (phd->phd_port == inp->inp_lport) 1297 break; 1298 } 1299 /* 1300 * If none exists, malloc one and tack it on. 1301 */ 1302 if (phd == NULL) { 1303 MALLOC(phd, struct inpcbport *, sizeof(struct inpcbport), M_PCB, M_NOWAIT); 1304 if (phd == NULL) { 1305 return (ENOBUFS); /* XXX */ 1306 } 1307 phd->phd_port = inp->inp_lport; 1308 LIST_INIT(&phd->phd_pcblist); 1309 LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); 1310 } 1311 inp->inp_phd = phd; 1312 LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); 1313 LIST_INSERT_HEAD(pcbhash, inp, inp_hash); 1314 return (0); 1315 } 1316 1317 /* 1318 * Move PCB to the proper hash bucket when { faddr, fport } have been 1319 * changed. NOTE: This does not handle the case of the lport changing (the 1320 * hashed port list would have to be updated as well), so the lport must 1321 * not change after in_pcbinshash() has been called. 1322 */ 1323 void 1324 in_pcbrehash(struct inpcb *inp) 1325 { 1326 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1327 struct inpcbhead *head; 1328 u_int32_t hashkey_faddr; 1329 1330 INP_INFO_WLOCK_ASSERT(pcbinfo); 1331 INP_WLOCK_ASSERT(inp); 1332 1333 #ifdef INET6 1334 if (inp->inp_vflag & INP_IPV6) 1335 hashkey_faddr = inp->in6p_faddr.s6_addr32[3] /* XXX */; 1336 else 1337 #endif /* INET6 */ 1338 hashkey_faddr = inp->inp_faddr.s_addr; 1339 1340 head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, 1341 inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; 1342 1343 LIST_REMOVE(inp, inp_hash); 1344 LIST_INSERT_HEAD(head, inp, inp_hash); 1345 } 1346 1347 /* 1348 * Remove PCB from various lists. 1349 */ 1350 void 1351 in_pcbremlists(struct inpcb *inp) 1352 { 1353 struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; 1354 1355 INP_INFO_WLOCK_ASSERT(pcbinfo); 1356 INP_WLOCK_ASSERT(inp); 1357 1358 inp->inp_gencnt = ++pcbinfo->ipi_gencnt; 1359 if (inp->inp_lport) { 1360 struct inpcbport *phd = inp->inp_phd; 1361 1362 LIST_REMOVE(inp, inp_hash); 1363 LIST_REMOVE(inp, inp_portlist); 1364 if (LIST_FIRST(&phd->phd_pcblist) == NULL) { 1365 LIST_REMOVE(phd, phd_hash); 1366 free(phd, M_PCB); 1367 } 1368 } 1369 LIST_REMOVE(inp, inp_list); 1370 pcbinfo->ipi_count--; 1371 } 1372 1373 /* 1374 * A set label operation has occurred at the socket layer, propagate the 1375 * label change into the in_pcb for the socket. 1376 */ 1377 void 1378 in_pcbsosetlabel(struct socket *so) 1379 { 1380 #ifdef MAC 1381 struct inpcb *inp; 1382 1383 inp = sotoinpcb(so); 1384 KASSERT(inp != NULL, ("in_pcbsosetlabel: so->so_pcb == NULL")); 1385 1386 INP_WLOCK(inp); 1387 SOCK_LOCK(so); 1388 mac_inpcb_sosetlabel(so, inp); 1389 SOCK_UNLOCK(so); 1390 INP_WUNLOCK(inp); 1391 #endif 1392 } 1393 1394 /* 1395 * ipport_tick runs once per second, determining if random port allocation 1396 * should be continued. If more than ipport_randomcps ports have been 1397 * allocated in the last second, then we return to sequential port 1398 * allocation. We return to random allocation only once we drop below 1399 * ipport_randomcps for at least ipport_randomtime seconds. 1400 */ 1401 void 1402 ipport_tick(void *xtp) 1403 { 1404 VNET_ITERATOR_DECL(vnet_iter); 1405 1406 VNET_LIST_RLOCK(); 1407 VNET_FOREACH(vnet_iter) { 1408 CURVNET_SET(vnet_iter); /* XXX appease INVARIANTS here */ 1409 INIT_VNET_INET(vnet_iter); 1410 if (V_ipport_tcpallocs <= 1411 V_ipport_tcplastcount + V_ipport_randomcps) { 1412 if (V_ipport_stoprandom > 0) 1413 V_ipport_stoprandom--; 1414 } else 1415 V_ipport_stoprandom = V_ipport_randomtime; 1416 V_ipport_tcplastcount = V_ipport_tcpallocs; 1417 CURVNET_RESTORE(); 1418 } 1419 VNET_LIST_RUNLOCK(); 1420 callout_reset(&ipport_tick_callout, hz, ipport_tick, NULL); 1421 } 1422 1423 void 1424 inp_wlock(struct inpcb *inp) 1425 { 1426 1427 INP_WLOCK(inp); 1428 } 1429 1430 void 1431 inp_wunlock(struct inpcb *inp) 1432 { 1433 1434 INP_WUNLOCK(inp); 1435 } 1436 1437 void 1438 inp_rlock(struct inpcb *inp) 1439 { 1440 1441 INP_RLOCK(inp); 1442 } 1443 1444 void 1445 inp_runlock(struct inpcb *inp) 1446 { 1447 1448 INP_RUNLOCK(inp); 1449 } 1450 1451 #ifdef INVARIANTS 1452 void 1453 inp_lock_assert(struct inpcb *inp) 1454 { 1455 1456 INP_WLOCK_ASSERT(inp); 1457 } 1458 1459 void 1460 inp_unlock_assert(struct inpcb *inp) 1461 { 1462 1463 INP_UNLOCK_ASSERT(inp); 1464 } 1465 #endif 1466 1467 void 1468 inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) 1469 { 1470 INIT_VNET_INET(curvnet); 1471 struct inpcb *inp; 1472 1473 INP_INFO_RLOCK(&V_tcbinfo); 1474 LIST_FOREACH(inp, tcbinfo.ipi_listhead, inp_list) { 1475 INP_WLOCK(inp); 1476 func(inp, arg); 1477 INP_WUNLOCK(inp); 1478 } 1479 INP_INFO_RUNLOCK(&V_tcbinfo); 1480 } 1481 1482 struct socket * 1483 inp_inpcbtosocket(struct inpcb *inp) 1484 { 1485 1486 INP_WLOCK_ASSERT(inp); 1487 return (inp->inp_socket); 1488 } 1489 1490 struct tcpcb * 1491 inp_inpcbtotcpcb(struct inpcb *inp) 1492 { 1493 1494 INP_WLOCK_ASSERT(inp); 1495 return ((struct tcpcb *)inp->inp_ppcb); 1496 } 1497 1498 int 1499 inp_ip_tos_get(const struct inpcb *inp) 1500 { 1501 1502 return (inp->inp_ip_tos); 1503 } 1504 1505 void 1506 inp_ip_tos_set(struct inpcb *inp, int val) 1507 { 1508 1509 inp->inp_ip_tos = val; 1510 } 1511 1512 void 1513 inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, 1514 uint32_t *faddr, uint16_t *fp) 1515 { 1516 1517 INP_LOCK_ASSERT(inp); 1518 *laddr = inp->inp_laddr.s_addr; 1519 *faddr = inp->inp_faddr.s_addr; 1520 *lp = inp->inp_lport; 1521 *fp = inp->inp_fport; 1522 } 1523 1524 struct inpcb * 1525 so_sotoinpcb(struct socket *so) 1526 { 1527 1528 return (sotoinpcb(so)); 1529 } 1530 1531 struct tcpcb * 1532 so_sototcpcb(struct socket *so) 1533 { 1534 1535 return (sototcpcb(so)); 1536 } 1537 1538 #ifdef DDB 1539 static void 1540 db_print_indent(int indent) 1541 { 1542 int i; 1543 1544 for (i = 0; i < indent; i++) 1545 db_printf(" "); 1546 } 1547 1548 static void 1549 db_print_inconninfo(struct in_conninfo *inc, const char *name, int indent) 1550 { 1551 char faddr_str[48], laddr_str[48]; 1552 1553 db_print_indent(indent); 1554 db_printf("%s at %p\n", name, inc); 1555 1556 indent += 2; 1557 1558 #ifdef INET6 1559 if (inc->inc_flags == 1) { 1560 /* IPv6. */ 1561 ip6_sprintf(laddr_str, &inc->inc6_laddr); 1562 ip6_sprintf(faddr_str, &inc->inc6_faddr); 1563 } else { 1564 #endif 1565 /* IPv4. */ 1566 inet_ntoa_r(inc->inc_laddr, laddr_str); 1567 inet_ntoa_r(inc->inc_faddr, faddr_str); 1568 #ifdef INET6 1569 } 1570 #endif 1571 db_print_indent(indent); 1572 db_printf("inc_laddr %s inc_lport %u\n", laddr_str, 1573 ntohs(inc->inc_lport)); 1574 db_print_indent(indent); 1575 db_printf("inc_faddr %s inc_fport %u\n", faddr_str, 1576 ntohs(inc->inc_fport)); 1577 } 1578 1579 static void 1580 db_print_inpflags(int inp_flags) 1581 { 1582 int comma; 1583 1584 comma = 0; 1585 if (inp_flags & INP_RECVOPTS) { 1586 db_printf("%sINP_RECVOPTS", comma ? ", " : ""); 1587 comma = 1; 1588 } 1589 if (inp_flags & INP_RECVRETOPTS) { 1590 db_printf("%sINP_RECVRETOPTS", comma ? ", " : ""); 1591 comma = 1; 1592 } 1593 if (inp_flags & INP_RECVDSTADDR) { 1594 db_printf("%sINP_RECVDSTADDR", comma ? ", " : ""); 1595 comma = 1; 1596 } 1597 if (inp_flags & INP_HDRINCL) { 1598 db_printf("%sINP_HDRINCL", comma ? ", " : ""); 1599 comma = 1; 1600 } 1601 if (inp_flags & INP_HIGHPORT) { 1602 db_printf("%sINP_HIGHPORT", comma ? ", " : ""); 1603 comma = 1; 1604 } 1605 if (inp_flags & INP_LOWPORT) { 1606 db_printf("%sINP_LOWPORT", comma ? ", " : ""); 1607 comma = 1; 1608 } 1609 if (inp_flags & INP_ANONPORT) { 1610 db_printf("%sINP_ANONPORT", comma ? ", " : ""); 1611 comma = 1; 1612 } 1613 if (inp_flags & INP_RECVIF) { 1614 db_printf("%sINP_RECVIF", comma ? ", " : ""); 1615 comma = 1; 1616 } 1617 if (inp_flags & INP_MTUDISC) { 1618 db_printf("%sINP_MTUDISC", comma ? ", " : ""); 1619 comma = 1; 1620 } 1621 if (inp_flags & INP_FAITH) { 1622 db_printf("%sINP_FAITH", comma ? ", " : ""); 1623 comma = 1; 1624 } 1625 if (inp_flags & INP_RECVTTL) { 1626 db_printf("%sINP_RECVTTL", comma ? ", " : ""); 1627 comma = 1; 1628 } 1629 if (inp_flags & INP_DONTFRAG) { 1630 db_printf("%sINP_DONTFRAG", comma ? ", " : ""); 1631 comma = 1; 1632 } 1633 if (inp_flags & IN6P_IPV6_V6ONLY) { 1634 db_printf("%sIN6P_IPV6_V6ONLY", comma ? ", " : ""); 1635 comma = 1; 1636 } 1637 if (inp_flags & IN6P_PKTINFO) { 1638 db_printf("%sIN6P_PKTINFO", comma ? ", " : ""); 1639 comma = 1; 1640 } 1641 if (inp_flags & IN6P_HOPLIMIT) { 1642 db_printf("%sIN6P_HOPLIMIT", comma ? ", " : ""); 1643 comma = 1; 1644 } 1645 if (inp_flags & IN6P_HOPOPTS) { 1646 db_printf("%sIN6P_HOPOPTS", comma ? ", " : ""); 1647 comma = 1; 1648 } 1649 if (inp_flags & IN6P_DSTOPTS) { 1650 db_printf("%sIN6P_DSTOPTS", comma ? ", " : ""); 1651 comma = 1; 1652 } 1653 if (inp_flags & IN6P_RTHDR) { 1654 db_printf("%sIN6P_RTHDR", comma ? ", " : ""); 1655 comma = 1; 1656 } 1657 if (inp_flags & IN6P_RTHDRDSTOPTS) { 1658 db_printf("%sIN6P_RTHDRDSTOPTS", comma ? ", " : ""); 1659 comma = 1; 1660 } 1661 if (inp_flags & IN6P_TCLASS) { 1662 db_printf("%sIN6P_TCLASS", comma ? ", " : ""); 1663 comma = 1; 1664 } 1665 if (inp_flags & IN6P_AUTOFLOWLABEL) { 1666 db_printf("%sIN6P_AUTOFLOWLABEL", comma ? ", " : ""); 1667 comma = 1; 1668 } 1669 if (inp_flags & IN6P_RFC2292) { 1670 db_printf("%sIN6P_RFC2292", comma ? ", " : ""); 1671 comma = 1; 1672 } 1673 if (inp_flags & IN6P_MTU) { 1674 db_printf("IN6P_MTU%s", comma ? ", " : ""); 1675 comma = 1; 1676 } 1677 } 1678 1679 static void 1680 db_print_inpvflag(u_char inp_vflag) 1681 { 1682 int comma; 1683 1684 comma = 0; 1685 if (inp_vflag & INP_IPV4) { 1686 db_printf("%sINP_IPV4", comma ? ", " : ""); 1687 comma = 1; 1688 } 1689 if (inp_vflag & INP_IPV6) { 1690 db_printf("%sINP_IPV6", comma ? ", " : ""); 1691 comma = 1; 1692 } 1693 if (inp_vflag & INP_IPV6PROTO) { 1694 db_printf("%sINP_IPV6PROTO", comma ? ", " : ""); 1695 comma = 1; 1696 } 1697 if (inp_vflag & INP_TIMEWAIT) { 1698 db_printf("%sINP_TIMEWAIT", comma ? ", " : ""); 1699 comma = 1; 1700 } 1701 if (inp_vflag & INP_ONESBCAST) { 1702 db_printf("%sINP_ONESBCAST", comma ? ", " : ""); 1703 comma = 1; 1704 } 1705 if (inp_vflag & INP_DROPPED) { 1706 db_printf("%sINP_DROPPED", comma ? ", " : ""); 1707 comma = 1; 1708 } 1709 if (inp_vflag & INP_SOCKREF) { 1710 db_printf("%sINP_SOCKREF", comma ? ", " : ""); 1711 comma = 1; 1712 } 1713 } 1714 1715 void 1716 db_print_inpcb(struct inpcb *inp, const char *name, int indent) 1717 { 1718 1719 db_print_indent(indent); 1720 db_printf("%s at %p\n", name, inp); 1721 1722 indent += 2; 1723 1724 db_print_indent(indent); 1725 db_printf("inp_flow: 0x%x\n", inp->inp_flow); 1726 1727 db_print_inconninfo(&inp->inp_inc, "inp_conninfo", indent); 1728 1729 db_print_indent(indent); 1730 db_printf("inp_ppcb: %p inp_pcbinfo: %p inp_socket: %p\n", 1731 inp->inp_ppcb, inp->inp_pcbinfo, inp->inp_socket); 1732 1733 db_print_indent(indent); 1734 db_printf("inp_label: %p inp_flags: 0x%x (", 1735 inp->inp_label, inp->inp_flags); 1736 db_print_inpflags(inp->inp_flags); 1737 db_printf(")\n"); 1738 1739 db_print_indent(indent); 1740 db_printf("inp_sp: %p inp_vflag: 0x%x (", inp->inp_sp, 1741 inp->inp_vflag); 1742 db_print_inpvflag(inp->inp_vflag); 1743 db_printf(")\n"); 1744 1745 db_print_indent(indent); 1746 db_printf("inp_ip_ttl: %d inp_ip_p: %d inp_ip_minttl: %d\n", 1747 inp->inp_ip_ttl, inp->inp_ip_p, inp->inp_ip_minttl); 1748 1749 db_print_indent(indent); 1750 #ifdef INET6 1751 if (inp->inp_vflag & INP_IPV6) { 1752 db_printf("in6p_options: %p in6p_outputopts: %p " 1753 "in6p_moptions: %p\n", inp->in6p_options, 1754 inp->in6p_outputopts, inp->in6p_moptions); 1755 db_printf("in6p_icmp6filt: %p in6p_cksum %d " 1756 "in6p_hops %u\n", inp->in6p_icmp6filt, inp->in6p_cksum, 1757 inp->in6p_hops); 1758 } else 1759 #endif 1760 { 1761 db_printf("inp_ip_tos: %d inp_ip_options: %p " 1762 "inp_ip_moptions: %p\n", inp->inp_ip_tos, 1763 inp->inp_options, inp->inp_moptions); 1764 } 1765 1766 db_print_indent(indent); 1767 db_printf("inp_phd: %p inp_gencnt: %ju\n", inp->inp_phd, 1768 (uintmax_t)inp->inp_gencnt); 1769 } 1770 1771 DB_SHOW_COMMAND(inpcb, db_show_inpcb) 1772 { 1773 struct inpcb *inp; 1774 1775 if (!have_addr) { 1776 db_printf("usage: show inpcb <addr>\n"); 1777 return; 1778 } 1779 inp = (struct inpcb *)addr; 1780 1781 db_print_inpcb(inp, "inpcb", 0); 1782 } 1783 #endif 1784