1 /*- 2 * SPDX-License-Identifier: BSD-3-Clause 3 * 4 * Copyright (c) 1982, 1986, 1988, 1993 5 * The Regents of the University of California. All rights reserved. 6 * 7 * Redistribution and use in source and binary forms, with or without 8 * modification, are permitted provided that the following conditions 9 * are met: 10 * 1. Redistributions of source code must retain the above copyright 11 * notice, this list of conditions and the following disclaimer. 12 * 2. Redistributions in binary form must reproduce the above copyright 13 * notice, this list of conditions and the following disclaimer in the 14 * documentation and/or other materials provided with the distribution. 15 * 3. Neither the name of the University nor the names of its contributors 16 * may be used to endorse or promote products derived from this software 17 * without specific prior written permission. 18 * 19 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 20 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 25 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 29 * SUCH DAMAGE. 30 */ 31 32 #include <sys/cdefs.h> 33 __FBSDID("$FreeBSD$"); 34 35 #include "opt_inet.h" 36 #include "opt_inet6.h" 37 #include "opt_sctp.h" 38 #ifndef INET 39 #error "IPDIVERT requires INET" 40 #endif 41 42 #include <sys/param.h> 43 #include <sys/eventhandler.h> 44 #include <sys/kernel.h> 45 #include <sys/lock.h> 46 #include <sys/malloc.h> 47 #include <sys/mbuf.h> 48 #include <sys/module.h> 49 #include <sys/kernel.h> 50 #include <sys/priv.h> 51 #include <sys/proc.h> 52 #include <sys/protosw.h> 53 #include <sys/socket.h> 54 #include <sys/socketvar.h> 55 #include <sys/sysctl.h> 56 #include <net/vnet.h> 57 58 #include <net/if.h> 59 #include <net/if_var.h> 60 #include <net/netisr.h> 61 62 #include <netinet/in.h> 63 #include <netinet/in_pcb.h> 64 #include <netinet/in_systm.h> 65 #include <netinet/in_var.h> 66 #include <netinet/ip.h> 67 #include <netinet/ip_var.h> 68 #ifdef INET6 69 #include <netinet/ip6.h> 70 #include <netinet6/ip6_var.h> 71 #endif 72 #if defined(SCTP) || defined(SCTP_SUPPORT) 73 #include <netinet/sctp_crc32.h> 74 #endif 75 76 #include <security/mac/mac_framework.h> 77 /* 78 * Divert sockets 79 */ 80 81 /* 82 * Allocate enough space to hold a full IP packet 83 */ 84 #define DIVSNDQ (65536 + 100) 85 #define DIVRCVQ (65536 + 100) 86 87 /* 88 * Divert sockets work in conjunction with ipfw or other packet filters, 89 * see the divert(4) manpage for features. 90 * Packets are selected by the packet filter and tagged with an 91 * MTAG_IPFW_RULE tag carrying the 'divert port' number (as set by 92 * the packet filter) and information on the matching filter rule for 93 * subsequent reinjection. The divert_port is used to put the packet 94 * on the corresponding divert socket, while the rule number is passed 95 * up (at least partially) as the sin_port in the struct sockaddr. 96 * 97 * Packets written to the divert socket carry in sin_addr a 98 * destination address, and in sin_port the number of the filter rule 99 * after which to continue processing. 100 * If the destination address is INADDR_ANY, the packet is treated as 101 * as outgoing and sent to ip_output(); otherwise it is treated as 102 * incoming and sent to ip_input(). 103 * Further, sin_zero carries some information on the interface, 104 * which can be used in the reinject -- see comments in the code. 105 * 106 * On reinjection, processing in ip_input() and ip_output() 107 * will be exactly the same as for the original packet, except that 108 * packet filter processing will start at the rule number after the one 109 * written in the sin_port (ipfw does not allow a rule #0, so sin_port=0 110 * will apply the entire ruleset to the packet). 111 */ 112 113 /* Internal variables. */ 114 VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); 115 #define V_divcbinfo VNET(divcbinfo) 116 117 static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ 118 static u_long div_recvspace = DIVRCVQ; /* XXX sysctl ? */ 119 120 static int div_output_inbound(int fmaily, struct socket *so, struct mbuf *m, 121 struct sockaddr_in *sin); 122 static int div_output_outbound(int family, struct socket *so, struct mbuf *m); 123 124 /* 125 * Initialize divert connection block queue. 126 */ 127 INPCBSTORAGE_DEFINE(divcbstor, "divinp", "divcb", "div", "divhash"); 128 129 static void 130 div_init(void *arg __unused) 131 { 132 133 /* 134 * XXX We don't use the hash list for divert IP, but it's easier to 135 * allocate one-entry hash lists than it is to check all over the 136 * place for hashbase == NULL. 137 */ 138 in_pcbinfo_init(&V_divcbinfo, &divcbstor, 1, 1); 139 } 140 VNET_SYSINIT(div_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_init, NULL); 141 142 static void 143 div_destroy(void *unused __unused) 144 { 145 146 in_pcbinfo_destroy(&V_divcbinfo); 147 } 148 VNET_SYSUNINIT(divert, SI_SUB_PROTO_DOMAIN, SI_ORDER_THIRD, div_destroy, NULL); 149 150 /* 151 * IPPROTO_DIVERT is not in the real IP protocol number space; this 152 * function should never be called. Just in case, drop any packets. 153 */ 154 static int 155 div_input(struct mbuf **mp, int *offp, int proto) 156 { 157 struct mbuf *m = *mp; 158 159 KMOD_IPSTAT_INC(ips_noproto); 160 m_freem(m); 161 return (IPPROTO_DONE); 162 } 163 164 static bool 165 div_port_match(const struct inpcb *inp, void *v) 166 { 167 uint16_t nport = *(uint16_t *)v; 168 169 return (inp->inp_lport == nport); 170 } 171 172 /* 173 * Divert a packet by passing it up to the divert socket at port 'port'. 174 * 175 * Setup generic address and protocol structures for div_input routine, 176 * then pass them along with mbuf chain. 177 */ 178 static void 179 divert_packet(struct mbuf *m, bool incoming) 180 { 181 struct ip *ip; 182 struct inpcb *inp; 183 struct socket *sa; 184 u_int16_t nport; 185 struct sockaddr_in divsrc; 186 struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo, 187 INPLOOKUP_RLOCKPCB, div_port_match, &nport); 188 struct m_tag *mtag; 189 190 NET_EPOCH_ASSERT(); 191 192 mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 193 if (mtag == NULL) { 194 m_freem(m); 195 return; 196 } 197 /* Assure header */ 198 if (m->m_len < sizeof(struct ip) && 199 (m = m_pullup(m, sizeof(struct ip))) == NULL) 200 return; 201 ip = mtod(m, struct ip *); 202 203 /* Delayed checksums are currently not compatible with divert. */ 204 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 205 in_delayed_cksum(m); 206 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 207 } 208 #if defined(SCTP) || defined(SCTP_SUPPORT) 209 if (m->m_pkthdr.csum_flags & CSUM_SCTP) { 210 sctp_delayed_cksum(m, (uint32_t)(ip->ip_hl << 2)); 211 m->m_pkthdr.csum_flags &= ~CSUM_SCTP; 212 } 213 #endif 214 #ifdef INET6 215 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA_IPV6) { 216 in6_delayed_cksum(m, m->m_pkthdr.len - 217 sizeof(struct ip6_hdr), sizeof(struct ip6_hdr)); 218 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA_IPV6; 219 } 220 #if defined(SCTP) || defined(SCTP_SUPPORT) 221 if (m->m_pkthdr.csum_flags & CSUM_SCTP_IPV6) { 222 sctp_delayed_cksum(m, sizeof(struct ip6_hdr)); 223 m->m_pkthdr.csum_flags &= ~CSUM_SCTP_IPV6; 224 } 225 #endif 226 #endif /* INET6 */ 227 bzero(&divsrc, sizeof(divsrc)); 228 divsrc.sin_len = sizeof(divsrc); 229 divsrc.sin_family = AF_INET; 230 /* record matching rule, in host format */ 231 divsrc.sin_port = ((struct ipfw_rule_ref *)(mtag+1))->rulenum; 232 /* 233 * Record receive interface address, if any. 234 * But only for incoming packets. 235 */ 236 if (incoming) { 237 struct ifaddr *ifa; 238 struct ifnet *ifp; 239 240 /* Sanity check */ 241 M_ASSERTPKTHDR(m); 242 243 /* Find IP address for receive interface */ 244 ifp = m->m_pkthdr.rcvif; 245 CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) { 246 if (ifa->ifa_addr->sa_family != AF_INET) 247 continue; 248 divsrc.sin_addr = 249 ((struct sockaddr_in *) ifa->ifa_addr)->sin_addr; 250 break; 251 } 252 } 253 /* 254 * Record the incoming interface name whenever we have one. 255 */ 256 if (m->m_pkthdr.rcvif) { 257 /* 258 * Hide the actual interface name in there in the 259 * sin_zero array. XXX This needs to be moved to a 260 * different sockaddr type for divert, e.g. 261 * sockaddr_div with multiple fields like 262 * sockaddr_dl. Presently we have only 7 bytes 263 * but that will do for now as most interfaces 264 * are 4 or less + 2 or less bytes for unit. 265 * There is probably a faster way of doing this, 266 * possibly taking it from the sockaddr_dl on the iface. 267 * This solves the problem of a P2P link and a LAN interface 268 * having the same address, which can result in the wrong 269 * interface being assigned to the packet when fed back 270 * into the divert socket. Theoretically if the daemon saves 271 * and re-uses the sockaddr_in as suggested in the man pages, 272 * this iface name will come along for the ride. 273 * (see div_output for the other half of this.) 274 */ 275 strlcpy(divsrc.sin_zero, m->m_pkthdr.rcvif->if_xname, 276 sizeof(divsrc.sin_zero)); 277 } 278 279 /* Put packet on socket queue, if any */ 280 sa = NULL; 281 /* nport is inp_next's context. */ 282 nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); 283 while ((inp = inp_next(&inpi)) != NULL) { 284 sa = inp->inp_socket; 285 SOCKBUF_LOCK(&sa->so_rcv); 286 if (sbappendaddr_locked(&sa->so_rcv, 287 (struct sockaddr *)&divsrc, m, NULL) == 0) { 288 soroverflow_locked(sa); 289 sa = NULL; /* force mbuf reclaim below */ 290 } else 291 sorwakeup_locked(sa); 292 /* XXX why does only one socket match? */ 293 INP_RUNLOCK(inp); 294 break; 295 } 296 if (sa == NULL) { 297 m_freem(m); 298 KMOD_IPSTAT_INC(ips_noproto); 299 KMOD_IPSTAT_DEC(ips_delivered); 300 } 301 } 302 303 /* 304 * Deliver packet back into the IP processing machinery. 305 * 306 * If no address specified, or address is 0.0.0.0, send to ip_output(); 307 * otherwise, send to ip_input() and mark as having been received on 308 * the interface with that address. 309 */ 310 static int 311 div_output(struct socket *so, struct mbuf *m, struct sockaddr_in *sin, 312 struct mbuf *control) 313 { 314 struct epoch_tracker et; 315 const struct ip *ip; 316 struct m_tag *mtag; 317 struct ipfw_rule_ref *dt; 318 int error, family; 319 320 if (control) { 321 m_freem(control); /* XXX */ 322 control = NULL; 323 } 324 325 if (sin != NULL) { 326 if (sin->sin_family != AF_INET) { 327 m_freem(m); 328 return (EAFNOSUPPORT); 329 } 330 if (sin->sin_len != sizeof(*sin)) { 331 m_freem(m); 332 return (EINVAL); 333 } 334 } 335 336 /* 337 * An mbuf may hasn't come from userland, but we pretend 338 * that it has. 339 */ 340 m->m_pkthdr.rcvif = NULL; 341 m->m_nextpkt = NULL; 342 M_SETFIB(m, so->so_fibnum); 343 344 mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); 345 if (mtag == NULL) { 346 /* this should be normal */ 347 mtag = m_tag_alloc(MTAG_IPFW_RULE, 0, 348 sizeof(struct ipfw_rule_ref), M_NOWAIT | M_ZERO); 349 if (mtag == NULL) { 350 m_freem(m); 351 return (ENOBUFS); 352 } 353 m_tag_prepend(m, mtag); 354 } 355 dt = (struct ipfw_rule_ref *)(mtag+1); 356 357 /* Loopback avoidance and state recovery */ 358 if (sin) { 359 int i; 360 361 /* set the starting point. We provide a non-zero slot, 362 * but a non_matching chain_id to skip that info and use 363 * the rulenum/rule_id. 364 */ 365 dt->slot = 1; /* dummy, chain_id is invalid */ 366 dt->chain_id = 0; 367 dt->rulenum = sin->sin_port+1; /* host format ? */ 368 dt->rule_id = 0; 369 /* XXX: broken for IPv6 */ 370 /* 371 * Find receive interface with the given name, stuffed 372 * (if it exists) in the sin_zero[] field. 373 * The name is user supplied data so don't trust its size 374 * or that it is zero terminated. 375 */ 376 for (i = 0; i < sizeof(sin->sin_zero) && sin->sin_zero[i]; i++) 377 ; 378 if ( i > 0 && i < sizeof(sin->sin_zero)) 379 m->m_pkthdr.rcvif = ifunit(sin->sin_zero); 380 } 381 382 ip = mtod(m, struct ip *); 383 switch (ip->ip_v) { 384 case IPVERSION: 385 family = AF_INET; 386 break; 387 #ifdef INET6 388 case IPV6_VERSION >> 4: 389 family = AF_INET6; 390 break; 391 #endif 392 default: 393 m_freem(m); 394 return (EAFNOSUPPORT); 395 } 396 397 /* Reinject packet into the system as incoming or outgoing */ 398 NET_EPOCH_ENTER(et); 399 if (!sin || sin->sin_addr.s_addr == 0) { 400 dt->info |= IPFW_IS_DIVERT | IPFW_INFO_OUT; 401 error = div_output_outbound(family, so, m); 402 } else { 403 dt->info |= IPFW_IS_DIVERT | IPFW_INFO_IN; 404 error = div_output_inbound(family, so, m, sin); 405 } 406 NET_EPOCH_EXIT(et); 407 408 return (error); 409 } 410 411 /* 412 * Sends mbuf @m to the wire via ip[6]_output(). 413 * 414 * Returns 0 on success or an errno value on failure. @m is always consumed. 415 */ 416 static int 417 div_output_outbound(int family, struct socket *so, struct mbuf *m) 418 { 419 struct ip *const ip = mtod(m, struct ip *); 420 struct mbuf *options; 421 struct inpcb *inp; 422 int error; 423 424 inp = sotoinpcb(so); 425 INP_RLOCK(inp); 426 switch (family) { 427 case AF_INET: 428 /* 429 * Don't allow both user specified and setsockopt 430 * options, and don't allow packet length sizes that 431 * will crash. 432 */ 433 if ((((ip->ip_hl << 2) != sizeof(struct ip)) && 434 inp->inp_options != NULL) || 435 ((u_short)ntohs(ip->ip_len) > m->m_pkthdr.len)) { 436 INP_RUNLOCK(inp); 437 m_freem(m); 438 return (EINVAL); 439 } 440 break; 441 #ifdef INET6 442 case AF_INET6: 443 { 444 struct ip6_hdr *const ip6 = mtod(m, struct ip6_hdr *); 445 446 /* Don't allow packet length sizes that will crash */ 447 if (((u_short)ntohs(ip6->ip6_plen) > m->m_pkthdr.len)) { 448 INP_RUNLOCK(inp); 449 m_freem(m); 450 return (EINVAL); 451 } 452 break; 453 } 454 #endif 455 } 456 457 /* Send packet to output processing */ 458 KMOD_IPSTAT_INC(ips_rawout); /* XXX */ 459 460 #ifdef MAC 461 mac_inpcb_create_mbuf(inp, m); 462 #endif 463 /* 464 * Get ready to inject the packet into ip_output(). 465 * Just in case socket options were specified on the 466 * divert socket, we duplicate them. This is done 467 * to avoid having to hold the PCB locks over the call 468 * to ip_output(), as doing this results in a number of 469 * lock ordering complexities. 470 * 471 * Note that we set the multicast options argument for 472 * ip_output() to NULL since it should be invariant that 473 * they are not present. 474 */ 475 KASSERT(inp->inp_moptions == NULL, 476 ("multicast options set on a divert socket")); 477 /* 478 * XXXCSJP: It is unclear to me whether or not it makes 479 * sense for divert sockets to have options. However, 480 * for now we will duplicate them with the INP locks 481 * held so we can use them in ip_output() without 482 * requring a reference to the pcb. 483 */ 484 options = NULL; 485 if (inp->inp_options != NULL) { 486 options = m_dup(inp->inp_options, M_NOWAIT); 487 if (options == NULL) { 488 INP_RUNLOCK(inp); 489 m_freem(m); 490 return (ENOBUFS); 491 } 492 } 493 INP_RUNLOCK(inp); 494 495 error = 0; 496 switch (family) { 497 case AF_INET: 498 error = ip_output(m, options, NULL, 499 ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0) 500 | IP_ALLOWBROADCAST | IP_RAWOUTPUT, NULL, NULL); 501 break; 502 #ifdef INET6 503 case AF_INET6: 504 error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); 505 break; 506 #endif 507 } 508 if (options != NULL) 509 m_freem(options); 510 511 return (error); 512 } 513 514 /* 515 * Schedules mbuf @m for local processing via IPv4/IPv6 netisr queue. 516 * 517 * Returns 0 on success or an errno value on failure. @m is always consumed. 518 */ 519 static int 520 div_output_inbound(int family, struct socket *so, struct mbuf *m, 521 struct sockaddr_in *sin) 522 { 523 const struct ip *ip; 524 struct ifaddr *ifa; 525 526 if (m->m_pkthdr.rcvif == NULL) { 527 /* 528 * No luck with the name, check by IP address. 529 * Clear the port and the ifname to make sure 530 * there are no distractions for ifa_ifwithaddr. 531 */ 532 533 /* XXX: broken for IPv6 */ 534 bzero(sin->sin_zero, sizeof(sin->sin_zero)); 535 sin->sin_port = 0; 536 ifa = ifa_ifwithaddr((struct sockaddr *) sin); 537 if (ifa == NULL) { 538 m_freem(m); 539 return (EADDRNOTAVAIL); 540 } 541 m->m_pkthdr.rcvif = ifa->ifa_ifp; 542 } 543 #ifdef MAC 544 mac_socket_create_mbuf(so, m); 545 #endif 546 /* Send packet to input processing via netisr */ 547 switch (family) { 548 case AF_INET: 549 ip = mtod(m, struct ip *); 550 /* 551 * Restore M_BCAST flag when destination address is 552 * broadcast. It is expected by ip_tryforward(). 553 */ 554 if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) 555 m->m_flags |= M_MCAST; 556 else if (in_broadcast(ip->ip_dst, m->m_pkthdr.rcvif)) 557 m->m_flags |= M_BCAST; 558 netisr_queue_src(NETISR_IP, (uintptr_t)so, m); 559 break; 560 #ifdef INET6 561 case AF_INET6: 562 netisr_queue_src(NETISR_IPV6, (uintptr_t)so, m); 563 break; 564 #endif 565 default: 566 m_freem(m); 567 return (EINVAL); 568 } 569 570 return (0); 571 } 572 573 static int 574 div_attach(struct socket *so, int proto, struct thread *td) 575 { 576 struct inpcb *inp; 577 int error; 578 579 inp = sotoinpcb(so); 580 KASSERT(inp == NULL, ("div_attach: inp != NULL")); 581 if (td != NULL) { 582 error = priv_check(td, PRIV_NETINET_DIVERT); 583 if (error) 584 return (error); 585 } 586 error = soreserve(so, div_sendspace, div_recvspace); 587 if (error) 588 return error; 589 error = in_pcballoc(so, &V_divcbinfo); 590 if (error) 591 return error; 592 inp = (struct inpcb *)so->so_pcb; 593 inp->inp_ip_p = proto; 594 inp->inp_vflag |= INP_IPV4; 595 inp->inp_flags |= INP_HDRINCL; 596 INP_WUNLOCK(inp); 597 return 0; 598 } 599 600 static void 601 div_detach(struct socket *so) 602 { 603 struct inpcb *inp; 604 605 inp = sotoinpcb(so); 606 KASSERT(inp != NULL, ("div_detach: inp == NULL")); 607 INP_WLOCK(inp); 608 in_pcbdetach(inp); 609 in_pcbfree(inp); 610 } 611 612 static int 613 div_bind(struct socket *so, struct sockaddr *nam, struct thread *td) 614 { 615 struct inpcb *inp; 616 int error; 617 618 inp = sotoinpcb(so); 619 KASSERT(inp != NULL, ("div_bind: inp == NULL")); 620 /* in_pcbbind assumes that nam is a sockaddr_in 621 * and in_pcbbind requires a valid address. Since divert 622 * sockets don't we need to make sure the address is 623 * filled in properly. 624 * XXX -- divert should not be abusing in_pcbind 625 * and should probably have its own family. 626 */ 627 if (nam->sa_family != AF_INET) 628 return EAFNOSUPPORT; 629 if (nam->sa_len != sizeof(struct sockaddr_in)) 630 return EINVAL; 631 ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; 632 INP_WLOCK(inp); 633 INP_HASH_WLOCK(&V_divcbinfo); 634 error = in_pcbbind(inp, nam, td->td_ucred); 635 INP_HASH_WUNLOCK(&V_divcbinfo); 636 INP_WUNLOCK(inp); 637 return error; 638 } 639 640 static int 641 div_shutdown(struct socket *so) 642 { 643 struct inpcb *inp; 644 645 inp = sotoinpcb(so); 646 KASSERT(inp != NULL, ("div_shutdown: inp == NULL")); 647 INP_WLOCK(inp); 648 socantsendmore(so); 649 INP_WUNLOCK(inp); 650 return 0; 651 } 652 653 static int 654 div_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, 655 struct mbuf *control, struct thread *td) 656 { 657 658 /* Packet must have a header (but that's about it) */ 659 if (m->m_len < sizeof (struct ip) && 660 (m = m_pullup(m, sizeof (struct ip))) == NULL) { 661 KMOD_IPSTAT_INC(ips_toosmall); 662 if (control != NULL) 663 m_freem(control); 664 m_freem(m); 665 return EINVAL; 666 } 667 668 /* Send packet */ 669 return div_output(so, m, (struct sockaddr_in *)nam, control); 670 } 671 672 static int 673 div_pcblist(SYSCTL_HANDLER_ARGS) 674 { 675 struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo, 676 INPLOOKUP_RLOCKPCB); 677 struct xinpgen xig; 678 struct inpcb *inp; 679 int error; 680 681 if (req->newptr != 0) 682 return EPERM; 683 684 if (req->oldptr == 0) { 685 int n; 686 687 n = V_divcbinfo.ipi_count; 688 n += imax(n / 8, 10); 689 req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); 690 return 0; 691 } 692 693 if ((error = sysctl_wire_old_buffer(req, 0)) != 0) 694 return (error); 695 696 bzero(&xig, sizeof(xig)); 697 xig.xig_len = sizeof xig; 698 xig.xig_count = V_divcbinfo.ipi_count; 699 xig.xig_gen = V_divcbinfo.ipi_gencnt; 700 xig.xig_sogen = so_gencnt; 701 error = SYSCTL_OUT(req, &xig, sizeof xig); 702 if (error) 703 return error; 704 705 while ((inp = inp_next(&inpi)) != NULL) { 706 if (inp->inp_gencnt <= xig.xig_gen) { 707 struct xinpcb xi; 708 709 in_pcbtoxinpcb(inp, &xi); 710 error = SYSCTL_OUT(req, &xi, sizeof xi); 711 if (error) { 712 INP_RUNLOCK(inp); 713 break; 714 } 715 } 716 } 717 718 if (!error) { 719 /* 720 * Give the user an updated idea of our state. 721 * If the generation differs from what we told 722 * her before, she knows that something happened 723 * while we were processing this request, and it 724 * might be necessary to retry. 725 */ 726 xig.xig_gen = V_divcbinfo.ipi_gencnt; 727 xig.xig_sogen = so_gencnt; 728 xig.xig_count = V_divcbinfo.ipi_count; 729 error = SYSCTL_OUT(req, &xig, sizeof xig); 730 } 731 732 return (error); 733 } 734 735 #ifdef SYSCTL_NODE 736 static SYSCTL_NODE(_net_inet, IPPROTO_DIVERT, divert, 737 CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 738 "IPDIVERT"); 739 SYSCTL_PROC(_net_inet_divert, OID_AUTO, pcblist, 740 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, 741 NULL, 0, div_pcblist, "S,xinpcb", 742 "List of active divert sockets"); 743 #endif 744 745 struct pr_usrreqs div_usrreqs = { 746 .pru_attach = div_attach, 747 .pru_bind = div_bind, 748 .pru_control = in_control, 749 .pru_detach = div_detach, 750 .pru_peeraddr = in_getpeeraddr, 751 .pru_send = div_send, 752 .pru_shutdown = div_shutdown, 753 .pru_sockaddr = in_getsockaddr, 754 .pru_sosetlabel = in_pcbsosetlabel 755 }; 756 757 struct protosw div_protosw = { 758 .pr_type = SOCK_RAW, 759 .pr_protocol = IPPROTO_DIVERT, 760 .pr_flags = PR_ATOMIC|PR_ADDR, 761 .pr_input = div_input, 762 .pr_usrreqs = &div_usrreqs 763 }; 764 765 static int 766 div_modevent(module_t mod, int type, void *unused) 767 { 768 int err = 0; 769 770 switch (type) { 771 case MOD_LOAD: 772 /* 773 * Protocol will be initialized by pf_proto_register(). 774 * We don't have to register ip_protox because we are not 775 * a true IP protocol that goes over the wire. 776 */ 777 err = pf_proto_register(PF_INET, &div_protosw); 778 if (err != 0) 779 return (err); 780 ip_divert_ptr = divert_packet; 781 break; 782 case MOD_QUIESCE: 783 /* 784 * IPDIVERT may normally not be unloaded because of the 785 * potential race conditions. Tell kldunload we can't be 786 * unloaded unless the unload is forced. 787 */ 788 err = EPERM; 789 break; 790 case MOD_UNLOAD: 791 /* 792 * Forced unload. 793 * 794 * Module ipdivert can only be unloaded if no sockets are 795 * connected. Maybe this can be changed later to forcefully 796 * disconnect any open sockets. 797 * 798 * XXXRW: Note that there is a slight race here, as a new 799 * socket open request could be spinning on the lock and then 800 * we destroy the lock. 801 */ 802 INP_INFO_WLOCK(&V_divcbinfo); 803 if (V_divcbinfo.ipi_count != 0) { 804 err = EBUSY; 805 INP_INFO_WUNLOCK(&V_divcbinfo); 806 break; 807 } 808 ip_divert_ptr = NULL; 809 err = pf_proto_unregister(PF_INET, IPPROTO_DIVERT, SOCK_RAW); 810 INP_INFO_WUNLOCK(&V_divcbinfo); 811 #ifndef VIMAGE 812 div_destroy(NULL); 813 #endif 814 break; 815 default: 816 err = EOPNOTSUPP; 817 break; 818 } 819 return err; 820 } 821 822 static moduledata_t ipdivertmod = { 823 "ipdivert", 824 div_modevent, 825 0 826 }; 827 828 DECLARE_MODULE(ipdivert, ipdivertmod, SI_SUB_PROTO_FIREWALL, SI_ORDER_ANY); 829 MODULE_DEPEND(ipdivert, ipfw, 3, 3, 3); 830 MODULE_VERSION(ipdivert, 1); 831