1 /*- 2 * Copyright (c) 1982, 1986, 1988, 1993 3 * The Regents of the University of California. All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 4. Neither the name of the University nor the names of its contributors 14 * may be used to endorse or promote products derived from this software 15 * without specific prior written permission. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 __FBSDID("$FreeBSD$"); 32 33 #if !defined(KLD_MODULE) 34 #include "opt_inet.h" 35 #include "opt_ipfw.h" 36 #include "opt_sctp.h" 37 #ifndef INET 38 #error "IPDIVERT requires INET." 39 #endif 40 #ifndef IPFIREWALL 41 #error "IPDIVERT requires IPFIREWALL" 42 #endif 43 #endif 44 45 #include <sys/param.h> 46 #include <sys/kernel.h> 47 #include <sys/lock.h> 48 #include <sys/malloc.h> 49 #include <sys/mbuf.h> 50 #include <sys/module.h> 51 #include <sys/kernel.h> 52 #include <sys/priv.h> 53 #include <sys/proc.h> 54 #include <sys/protosw.h> 55 #include <sys/rwlock.h> 56 #include <sys/signalvar.h> 57 #include <sys/socket.h> 58 #include <sys/socketvar.h> 59 #include <sys/sx.h> 60 #include <sys/sysctl.h> 61 #include <sys/systm.h> 62 63 #include <vm/uma.h> 64 65 #include <net/if.h> 66 #include <net/netisr.h> 67 #include <net/route.h> 68 #include <net/vnet.h> 69 70 #include <netinet/in.h> 71 #include <netinet/in_pcb.h> 72 #include <netinet/in_systm.h> 73 #include <netinet/in_var.h> 74 #include <netinet/ip.h> 75 #include <netinet/ip_divert.h> 76 #include <netinet/ip_var.h> 77 #include <netinet/ip_fw.h> 78 #include <netinet/ipfw/ip_fw_private.h> 79 #ifdef SCTP 80 #include <netinet/sctp_crc32.h> 81 #endif 82 83 #include <security/mac/mac_framework.h> 84 85 /* 86 * Divert sockets 87 */ 88 89 /* 90 * Allocate enough space to hold a full IP packet 91 */ 92 #define DIVSNDQ (65536 + 100) 93 #define DIVRCVQ (65536 + 100) 94 95 /* 96 * Divert sockets work in conjunction with ipfw, see the divert(4) 97 * manpage for features. 98 * Internally, packets selected by ipfw in ip_input() or ip_output(), 99 * and never diverted before, are passed to the input queue of the 100 * divert socket with a given 'divert_port' number (as specified in 101 * the matching ipfw rule), and they are tagged with a 16 bit cookie 102 * (representing the rule number of the matching ipfw rule), which 103 * is passed to process reading from the socket. 104 * 105 * Packets written to the divert socket are again tagged with a cookie 106 * (usually the same as above) and a destination address. 107 * If the destination address is INADDR_ANY then the packet is 108 * treated as outgoing and sent to ip_output(), otherwise it is 109 * treated as incoming and sent to ip_input(). 110 * In both cases, the packet is tagged with the cookie. 111 * 112 * On reinjection, processing in ip_input() and ip_output() 113 * will be exactly the same as for the original packet, except that 114 * ipfw processing will start at the rule number after the one 115 * written in the cookie (so, tagging a packet with a cookie of 0 116 * will cause it to be effectively considered as a standard packet). 117 */ 118 119 /* Internal variables. */ 120 static VNET_DEFINE(struct inpcbhead, divcb); 121 static VNET_DEFINE(struct inpcbinfo, divcbinfo); 122 123 #define V_divcb VNET(divcb) 124 #define V_divcbinfo VNET(divcbinfo) 125 126 static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ 127 static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ 128 129 static eventhandler_tag ip_divert_event_tag; 130 131 /* 132 * Initialize divert connection block queue. 133 */ 134 static void 135 div_zone_change(void *tag) 136 { 137 138 uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 139 } 140 141 static int 142 div_inpcb_init(void *mem, int size, int flags) 143 { 144 struct inpcb *inp = mem; 145 146 INP_LOCK_INIT(inp, "inp", "divinp"); 147 return (0); 148 } 149 150 static void 151 div_inpcb_fini(void *mem, int size) 152 { 153 struct inpcb *inp = mem; 154 155 INP_LOCK_DESTROY(inp); 156 } 157 158 static void 159 div_init(void) 160 { 161 162 INP_INFO_LOCK_INIT(&V_divcbinfo, "div"); 163 LIST_INIT(&V_divcb); 164 V_divcbinfo.ipi_listhead = &V_divcb; 165 #ifdef VIMAGE 166 V_divcbinfo.ipi_vnet = curvnet; 167 #endif 168 /* 169 * XXX We don't use the hash list for divert IP, but it's easier 170 * to allocate a one entry hash list than it is to check all 171 * over the place for hashbase == NULL. 172 */ 173 V_divcbinfo.ipi_hashbase = hashinit(1, M_PCB, &V_divcbinfo.ipi_hashmask); 174 V_divcbinfo.ipi_porthashbase = hashinit(1, M_PCB, 175 &V_divcbinfo.ipi_porthashmask); 176 V_divcbinfo.ipi_zone = uma_zcreate("divcb", sizeof(struct inpcb), 177 NULL, NULL, div_inpcb_init, div_inpcb_fini, UMA_ALIGN_PTR, 178 UMA_ZONE_NOFREE); 179 uma_zone_set_max(V_divcbinfo.ipi_zone, maxsockets); 180 } 181 182 static void 183 div_destroy(void) 184 { 185 186 INP_INFO_LOCK_DESTROY(&V_divcbinfo); 187 uma_zdestroy(V_divcbinfo.ipi_zone); 188 hashdestroy(V_divcbinfo.ipi_hashbase, M_PCB, V_divcbinfo.ipi_hashmask); 189 hashdestroy(V_divcbinfo.ipi_porthashbase, M_PCB, 190 V_divcbinfo.ipi_porthashmask); 191 } 192 193 /* 194 * IPPROTO_DIVERT is not in the real IP protocol number space; this 195 * function should never be called. Just in case, drop any packets. 196 */ 197 void 198 div_input(struct mbuf *m, int off) 199 { 200 201 KMOD_IPSTAT_INC(ips_noproto); 202 m_freem(m); 203 } 204 205 /* 206 * Divert a packet by passing it up to the divert socket at port 'port'. 207 * 208 * Setup generic address and protocol structures for div_input routine, 209 * then pass them along with mbuf chain. 210 */ 211 static void 212 divert_packet(struct mbuf *m, int incoming) 213 { 214 struct ip *ip; 215 struct inpcb *inp; 216 struct socket *sa; 217 u_int16_t nport; 218 struct sockaddr_in divsrc; 219 struct m_tag *mtag; 220 221 mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL); 222 if (mtag == NULL) { 223 printf("%s: no divert tag\n", __func__); 224 m_freem(m); 225 return; 226 } 227 /* Assure header */ 228 if (m->m_len < sizeof(struct ip) && 229 (m = m_pullup(m, sizeof(struct ip))) == 0) 230 return; 231 ip = mtod(m, struct ip *); 232 233 /* Delayed checksums are currently not compatible with divert. */ 234 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 235 ip->ip_len = ntohs(ip->ip_len); 236 in_delayed_cksum(m); 237 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 238 ip->ip_len = htons(ip->ip_len); 239 } 240 #ifdef SCTP 241 if (m->m_pkthdr.csum_flags & CSUM_SCTP) { 242 ip->ip_len = ntohs(ip->ip_len); 243 sctp_delayed_cksum(m); 244 m->m_pkthdr.csum_flags &= ~CSUM_SCTP; 245 ip->ip_len = htons(ip->ip_len); 246 } 247 #endif 248 /* 249 * Record receive interface address, if any. 250 * But only for incoming packets. 251 */ 252 bzero(&divsrc, sizeof(divsrc)); 253 divsrc.sin_len = sizeof(divsrc); 254 divsrc.sin_family = AF_INET; 255 divsrc.sin_port = divert_cookie(mtag); /* record matching rule */ 256 if (incoming) { 257 struct ifaddr *ifa; 258 struct ifnet *ifp; 259 260 /* Sanity check */ 261 M_ASSERTPKTHDR(m); 262 263 /* Find IP address for receive interface */ 264 ifp = m->m_pkthdr.rcvif; 265 if_addr_rlock(ifp); 266 TAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 267 if (ifa->ifa_addr->sa_family != AF_INET) 268 continue; 269 divsrc.sin_addr = 270 ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; 271 break; 272 } 273 if_addr_runlock(ifp); 274 } 275 /* 276 * Record the incoming interface name whenever we have one. 277 */ 278 if (m->m_pkthdr.rcvif) { 279 /* 280 * Hide the actual interface name in there in the 281 * sin_zero array. XXX This needs to be moved to a 282 * different sockaddr type for divert, e.g. 283 * sockaddr_div with multiple fields like 284 * sockaddr_dl. Presently we have only 7 bytes 285 * but that will do for now as most interfaces 286 * are 4 or less + 2 or less bytes for unit. 287 * There is probably a faster way of doing this, 288 * possibly taking it from the sockaddr_dl on the iface. 289 * This solves the problem of a P2P link and a LAN interface 290 * having the same address, which can result in the wrong 291 * interface being assigned to the packet when fed back 292 * into the divert socket. Theoretically if the daemon saves 293 * and re-uses the sockaddr_in as suggested in the man pages, 294 * this iface name will come along for the ride. 295 * (see div_output for the other half of this.) 296 */ 297 strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, 298 sizeof(divsrc.sin_zero)); 299 } 300 301 /* Put packet on socket queue, if any */ 302 sa = NULL; 303 nport = htons((u_int16_t)divert_info(mtag)); 304 INP_INFO_RLOCK(&V_divcbinfo); 305 LIST_FOREACH(inp, &V_divcb, inp_list) { 306 /* XXX why does only one socket match? */ 307 if (inp->inp_lport == nport) { 308 INP_RLOCK(inp); 309 sa = inp->inp_socket; 310 SOCKBUF_LOCK(&sa->so_rcv); 311 if (sbappendaddr_locked(&sa->so_rcv, 312 (struct sockaddr *)&divsrc, m, 313 (struct mbuf *)0) == 0) { 314 SOCKBUF_UNLOCK(&sa->so_rcv); 315 sa = NULL; /* force mbuf reclaim below */ 316 } else 317 sorwakeup_locked(sa); 318 INP_RUNLOCK(inp); 319 break; 320 } 321 } 322 INP_INFO_RUNLOCK(&V_divcbinfo); 323 if (sa == NULL) { 324 m_freem(m); 325 KMOD_IPSTAT_INC(ips_noproto); 326 KMOD_IPSTAT_DEC(ips_delivered); 327 } 328 } 329 330 /* 331 * Deliver packet back into the IP processing machinery. 332 * 333 * If no address specified, or address is 0.0.0.0, send to ip_output(); 334 * otherwise, send to ip_input() and mark as having been received on 335 * the interface with that address. 336 */ 337 static int 338 div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, 339 struct mbuf *control) 340 { 341 struct m_tag *mtag; 342 struct divert_tag *dt; 343 int error = 0; 344 struct mbuf *options; 345 346 /* 347 * An mbuf may hasn't come from userland, but we pretend 348 * that it has. 349 */ 350 m->m_pkthdr.rcvif = NULL; 351 m->m_nextpkt = NULL; 352 M_SETFIB(m, so->so_fibnum); 353 354 if (control) 355 m_freem(control); /* XXX */ 356 357 if ((mtag = m_tag_find(m, PACKET_TAG_DIVERT, NULL)) == NULL) { 358 mtag = m_tag_get(PACKET_TAG_DIVERT, sizeof(struct divert_tag), 359 M_NOWAIT | M_ZERO); 360 if (mtag == NULL) { 361 error = ENOBUFS; 362 goto cantsend; 363 } 364 dt = (struct divert_tag *)(mtag+1); 365 m_tag_prepend(m, mtag); 366 } else 367 dt = (struct divert_tag *)(mtag+1); 368 369 /* Loopback avoidance and state recovery */ 370 if (sin) { 371 int i; 372 373 dt->cookie = sin->sin_port; 374 /* 375 * Find receive interface with the given name, stuffed 376 * (if it exists) in the sin_zero[] field. 377 * The name is user supplied data so don't trust its size 378 * or that it is zero terminated. 379 */ 380 for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) 381 ; 382 if ( i > 0 && i < sizeof(sin->sin_zero)) 383 m->m_pkthdr.rcvif = ifunit(sin->sin_zero); 384 } 385 386 /* Reinject packet into the system as incoming or outgoing */ 387 if (!sin || sin->sin_addr.s_addr == 0) { 388 struct ip *const ip = mtod(m, struct ip *); 389 struct inpcb *inp; 390 391 dt->info |= IP_FW_DIVERT_OUTPUT_FLAG; 392 INP_INFO_WLOCK(&V_divcbinfo); 393 inp = sotoinpcb(so); 394 INP_RLOCK(inp); 395 /* 396 * Don't allow both user specified and setsockopt options, 397 * and don't allow packet length sizes that will crash 398 */ 399 if (((ip->ip_hl != (sizeof (*ip) >> 2)) && inp->inp_options) || 400 ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { 401 error = EINVAL; 402 INP_RUNLOCK(inp); 403 INP_INFO_WUNLOCK(&V_divcbinfo); 404 m_freem(m); 405 } else { 406 /* Convert fields to host order for ip_output() */ 407 ip->ip_len = ntohs(ip->ip_len); 408 ip->ip_off = ntohs(ip->ip_off); 409 410 /* Send packet to output processing */ 411 KMOD_IPSTAT_INC(ips_rawout); /* XXX */ 412 413 #ifdef MAC 414 mac_inpcb_create_mbuf(inp, m); 415 #endif 416 /* 417 * Get ready to inject the packet into ip_output(). 418 * Just in case socket options were specified on the 419 * divert socket, we duplicate them. This is done 420 * to avoid having to hold the PCB locks over the call 421 * to ip_output(), as doing this results in a number of 422 * lock ordering complexities. 423 * 424 * Note that we set the multicast options argument for 425 * ip_output() to NULL since it should be invariant that 426 * they are not present. 427 */ 428 KASSERT(inp->inp_moptions == NULL, 429 ("multicast options set on a divert socket")); 430 options = NULL; 431 /* 432 * XXXCSJP: It is unclear to me whether or not it makes 433 * sense for divert sockets to have options. However, 434 * for now we will duplicate them with the INP locks 435 * held so we can use them in ip_output() without 436 * requring a reference to the pcb. 437 */ 438 if (inp->inp_options != NULL) { 439 options = m_dup(inp->inp_options, M_DONTWAIT); 440 if (options == NULL) 441 error = ENOBUFS; 442 } 443 INP_RUNLOCK(inp); 444 INP_INFO_WUNLOCK(&V_divcbinfo); 445 if (error == ENOBUFS) { 446 m_freem(m); 447 return (error); 448 } 449 error = ip_output(m, options, NULL, 450 ((so->so_options & SO_DONTROUTE) ? 451 IP_ROUTETOIF : 0) | IP_ALLOWBROADCAST | 452 IP_RAWOUTPUT, NULL, NULL); 453 if (options != NULL) 454 m_freem(options); 455 } 456 } else { 457 dt->info |= IP_FW_DIVERT_LOOPBACK_FLAG; 458 if (m->m_pkthdr.rcvif == NULL) { 459 /* 460 * No luck with the name, check by IP address. 461 * Clear the port and the ifname to make sure 462 * there are no distractions for ifa_ifwithaddr. 463 */ 464 struct ifaddr *ifa; 465 466 bzero(sin->sin_zero, sizeof(sin->sin_zero)); 467 sin->sin_port = 0; 468 ifa = ifa_ifwithaddr((struct sockaddr *) sin); 469 if (ifa == NULL) { 470 error = EADDRNOTAVAIL; 471 goto cantsend; 472 } 473 m->m_pkthdr.rcvif = ifa->ifa_ifp; 474 ifa_free(ifa); 475 } 476 #ifdef MAC 477 mac_socket_create_mbuf(so, m); 478 #endif 479 /* Send packet to input processing via netisr */ 480 netisr_queue_src(NETISR_IP, (uintptr_t)so, m); 481 } 482 483 return error; 484 485 cantsend: 486 m_freem(m); 487 return error; 488 } 489 490 static int 491 div_attach(struct socket *so, int proto, struct thread *td) 492 { 493 struct inpcb *inp; 494 int error; 495 496 inp = sotoinpcb(so); 497 KASSERT(inp == NULL, ("div_attach: inp != NULL")); 498 if (td != NULL) { 499 error = priv_check(td, PRIV_NETINET_DIVERT); 500 if (error) 501 return (error); 502 } 503 error = soreserve(so, div_sendspace, div_recvspace); 504 if (error) 505 return error; 506 INP_INFO_WLOCK(&V_divcbinfo); 507 error = in_pcballoc(so, &V_divcbinfo); 508 if (error) { 509 INP_INFO_WUNLOCK(&V_divcbinfo); 510 return error; 511 } 512 inp = (struct inpcb *)so->so_pcb; 513 INP_INFO_WUNLOCK(&V_divcbinfo); 514 inp->inp_ip_p = proto; 515 inp->inp_vflag |= INP_IPV4; 516 inp->inp_flags |= INP_HDRINCL; 517 INP_WUNLOCK(inp); 518 return 0; 519 } 520 521 static void 522 div_detach(struct socket *so) 523 { 524 struct inpcb *inp; 525 526 inp = sotoinpcb(so); 527 KASSERT(inp != NULL, ("div_detach: inp == NULL")); 528 INP_INFO_WLOCK(&V_divcbinfo); 529 INP_WLOCK(inp); 530 in_pcbdetach(inp); 531 in_pcbfree(inp); 532 INP_INFO_WUNLOCK(&V_divcbinfo); 533 } 534 535 static int 536 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 537 { 538 struct inpcb *inp; 539 int error; 540 541 inp = sotoinpcb(so); 542 KASSERT(inp != NULL, ("div_bind: inp == NULL")); 543 /* in_pcbbind assumes that nam is a sockaddr_in 544 * and in_pcbbind requires a valid address. Since divert 545 * sockets don't we need to make sure the address is 546 * filled in properly. 547 * XXX -- divert should not be abusing in_pcbind 548 * and should probably have its own family. 549 */ 550 if (nam->sa_family != AF_INET) 551 return EAFNOSUPPORT; 552 ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; 553 INP_INFO_WLOCK(&V_divcbinfo); 554 INP_WLOCK(inp); 555 error = in_pcbbind(inp, nam, td->td_ucred); 556 INP_WUNLOCK(inp); 557 INP_INFO_WUNLOCK(&V_divcbinfo); 558 return error; 559 } 560 561 static int 562 div_shutdown(struct socket *so) 563 { 564 struct inpcb *inp; 565 566 inp = sotoinpcb(so); 567 KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); 568 INP_WLOCK(inp); 569 socantsendmore(so); 570 INP_WUNLOCK(inp); 571 return 0; 572 } 573 574 static int 575 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 576 struct mbuf *control, struct thread *td) 577 { 578 579 /* Packet must have a header (but that's about it) */ 580 if (m->m_len < sizeof (struct ip) && 581 (m = m_pullup(m, sizeof (struct ip))) == 0) { 582 KMOD_IPSTAT_INC(ips_toosmall); 583 m_freem(m); 584 return EINVAL; 585 } 586 587 /* Send packet */ 588 return div_output(so, m, (struct sockaddr_in *)nam, control); 589 } 590 591 void 592 div_ctlinput(int cmd, struct sockaddr *sa, void *vip) 593 { 594 struct in_addr faddr; 595 596 faddr = ((struct sockaddr_in *)sa)->sin_addr; 597 if (sa->sa_family != AF_INET || faddr.s_addr == INADDR_ANY) 598 return; 599 if (PRC_IS_REDIRECT(cmd)) 600 return; 601 } 602 603 static int 604 div_pcblist(SYSCTL_HANDLER_ARGS) 605 { 606 int error, i, n; 607 struct inpcb *inp, **inp_list; 608 inp_gen_t gencnt; 609 struct xinpgen xig; 610 611 /* 612 * The process of preparing the TCB list is too time-consuming and 613 * resource-intensive to repeat twice on every request. 614 */ 615 if (req->oldptr == 0) { 616 n = V_divcbinfo.ipi_count; 617 req->oldidx = 2 * (sizeof xig) 618 + (n + n/8) * sizeof(struct xinpcb); 619 return 0; 620 } 621 622 if (req->newptr != 0) 623 return EPERM; 624 625 /* 626 * OK, now we're committed to doing something. 627 */ 628 INP_INFO_RLOCK(&V_divcbinfo); 629 gencnt = V_divcbinfo.ipi_gencnt; 630 n = V_divcbinfo.ipi_count; 631 INP_INFO_RUNLOCK(&V_divcbinfo); 632 633 error = sysctl_wire_old_buffer(req, 634 2 * sizeof(xig) + n*sizeof(struct xinpcb)); 635 if (error != 0) 636 return (error); 637 638 xig.xig_len = sizeof xig; 639 xig.xig_count = n; 640 xig.xig_gen = gencnt; 641 xig.xig_sogen = so_gencnt; 642 error = SYSCTL_OUT(req, &xig, sizeof xig); 643 if (error) 644 return error; 645 646 inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); 647 if (inp_list == 0) 648 return ENOMEM; 649 650 INP_INFO_RLOCK(&V_divcbinfo); 651 for (inp = LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; 652 inp = LIST_NEXT(inp, inp_list)) { 653 INP_RLOCK(inp); 654 if (inp->inp_gencnt <= gencnt && 655 cr_canseeinpcb(req->td->td_ucred, inp) == 0) 656 inp_list[i++] = inp; 657 INP_RUNLOCK(inp); 658 } 659 INP_INFO_RUNLOCK(&V_divcbinfo); 660 n = i; 661 662 error = 0; 663 for (i = 0; i < n; i++) { 664 inp = inp_list[i]; 665 INP_RLOCK(inp); 666 if (inp->inp_gencnt <= gencnt) { 667 struct xinpcb xi; 668 bzero(&xi, sizeof(xi)); 669 xi.xi_len = sizeof xi; 670 /* XXX should avoid extra copy */ 671 bcopy(inp, &xi.xi_inp, sizeof *inp); 672 if (inp->inp_socket) 673 sotoxsocket(inp->inp_socket, &xi.xi_socket); 674 INP_RUNLOCK(inp); 675 error = SYSCTL_OUT(req, &xi, sizeof xi); 676 } else 677 INP_RUNLOCK(inp); 678 } 679 if (!error) { 680 /* 681 * Give the user an updated idea of our state. 682 * If the generation differs from what we told 683 * her before, she knows that something happened 684 * while we were processing this request, and it 685 * might be necessary to retry. 686 */ 687 INP_INFO_RLOCK(&V_divcbinfo); 688 xig.xig_gen = V_divcbinfo.ipi_gencnt; 689 xig.xig_sogen = so_gencnt; 690 xig.xig_count = V_divcbinfo.ipi_count; 691 INP_INFO_RUNLOCK(&V_divcbinfo); 692 error = SYSCTL_OUT(req, &xig, sizeof xig); 693 } 694 free(inp_list, M_TEMP); 695 return error; 696 } 697 698 #ifdef SYSCTL_NODE 699 SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, CTLFLAG_RW, 0, "IPDIVERT"); 700 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, CTLFLAG_RD, 0, 0, 701 div_pcblist, "S,xinpcb", "List of active divert sockets"); 702 #endif 703 704 struct pr_usrreqs div_usrreqs = { 705 .pru_attach = div_attach, 706 .pru_bind = div_bind, 707 .pru_control = in_control, 708 .pru_detach = div_detach, 709 .pru_peeraddr = in_getpeeraddr, 710 .pru_send = div_send, 711 .pru_shutdown = div_shutdown, 712 .pru_sockaddr = in_getsockaddr, 713 .pru_sosetlabel = in_pcbsosetlabel 714 }; 715 716 struct protosw div_protosw = { 717 .pr_type = SOCK_RAW, 718 .pr_protocol = IPPROTO_DIVERT, 719 .pr_flags = PR_ATOMIC|PR_ADDR, 720 .pr_input = div_input, 721 .pr_ctlinput = div_ctlinput, 722 .pr_ctloutput = ip_ctloutput, 723 .pr_init = div_init, 724 #ifdef VIMAGE 725 .pr_destroy = div_destroy, 726 #endif 727 .pr_usrreqs = &div_usrreqs 728 }; 729 730 static int 731 div_modevent(module_t mod, int type, void *unused) 732 { 733 int err = 0; 734 #ifndef VIMAGE 735 int n; 736 #endif 737 738 switch (type) { 739 case MOD_LOAD: 740 /* 741 * Protocol will be initialized by pf_proto_register(). 742 * We don't have to register ip_protox because we are not 743 * a true IP protocol that goes over the wire. 744 */ 745 err = pf_proto_register(PF_INET, &div_protosw); 746 if (err != 0) 747 return (err); 748 ip_divert_ptr = divert_packet; 749 ip_divert_event_tag = EVENTHANDLER_REGISTER(maxsockets_change, 750 div_zone_change, NULL, EVENTHANDLER_PRI_ANY); 751 break; 752 case MOD_QUIESCE: 753 /* 754 * IPDIVERT may normally not be unloaded because of the 755 * potential race conditions. Tell kldunload we can't be 756 * unloaded unless the unload is forced. 757 */ 758 err = EPERM; 759 break; 760 case MOD_UNLOAD: 761 #ifdef VIMAGE 762 err = EPERM; 763 break; 764 #else 765 /* 766 * Forced unload. 767 * 768 * Module ipdivert can only be unloaded if no sockets are 769 * connected. Maybe this can be changed later to forcefully 770 * disconnect any open sockets. 771 * 772 * XXXRW: Note that there is a slight race here, as a new 773 * socket open request could be spinning on the lock and then 774 * we destroy the lock. 775 */ 776 INP_INFO_WLOCK(&V_divcbinfo); 777 n = V_divcbinfo.ipi_count; 778 if (n != 0) { 779 err = EBUSY; 780 INP_INFO_WUNLOCK(&V_divcbinfo); 781 break; 782 } 783 ip_divert_ptr = NULL; 784 err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); 785 INP_INFO_WUNLOCK(&V_divcbinfo); 786 div_destroy(); 787 EVENTHANDLER_DEREGISTER(maxsockets_change, ip_divert_event_tag); 788 break; 789 #endif /* !VIMAGE */ 790 default: 791 err = EOPNOTSUPP; 792 break; 793 } 794 return err; 795 } 796 797 static moduledata_t ipdivertmod = { 798 "ipdivert", 799 div_modevent, 800 0 801 }; 802 803 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_ANY); 804 MODULE_DEPEND(dummynet, ipfw, 2, 2, 2); 805 MODULE_VERSION(ipdivert, 1); 806