1 /*- 2 * Copyright (c) 1989 Stephen Deering 3 * Copyright (c) 1992, 1993 4 * The Regents of the University of California. All rights reserved. 5 * 6 * This code is derived from software contributed to Berkeley by 7 * Stephen Deering of Stanford University. 8 * 9 * Redistribution and use in source and binary forms, with or without 10 * modification, are permitted provided that the following conditions 11 * are met: 12 * 1. Redistributions of source code must retain the above copyright 13 * notice, this list of conditions and the following disclaimer. 14 * 2. Redistributions in binary form must reproduce the above copyright 15 * notice, this list of conditions and the following disclaimer in the 16 * documentation and/or other materials provided with the distribution. 17 * 4. Neither the name of the University nor the names of its contributors 18 * may be used to endorse or promote products derived from this software 19 * without specific prior written permission. 20 * 21 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 22 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 23 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 24 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 25 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 26 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 27 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 28 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 29 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 30 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 31 * SUCH DAMAGE. 32 * 33 * @(#)ip_mroute.c 8.2 (Berkeley) 11/15/93 34 */ 35 36 /* 37 * IP multicast forwarding procedures 38 * 39 * Written by David Waitzman, BBN Labs, August 1988. 40 * Modified by Steve Deering, Stanford, February 1989. 
41 * Modified by Mark J. Steiglitz, Stanford, May, 1991 42 * Modified by Van Jacobson, LBL, January 1993 43 * Modified by Ajit Thyagarajan, PARC, August 1993 44 * Modified by Bill Fenner, PARC, April 1995 45 * Modified by Ahmed Helmy, SGI, June 1996 46 * Modified by George Edmond Eddy (Rusty), ISI, February 1998 47 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000 48 * Modified by Hitoshi Asaeda, WIDE, August 2000 49 * Modified by Pavlin Radoslavov, ICSI, October 2002 50 * 51 * MROUTING Revision: 3.5 52 * and PIM-SMv2 and PIM-DM support, advanced API support, 53 * bandwidth metering and signaling 54 */ 55 56 /* 57 * TODO: Prefix functions with ipmf_. 58 * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol 59 * domain attachment (if_afdata) so we can track consumers of that service. 60 * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT, 61 * move it to socket options. 62 * TODO: Cleanup LSRR removal further. 63 * TODO: Push RSVP stubs into raw_ip.c. 64 * TODO: Use bitstring.h for vif set. 65 * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded. 66 * TODO: Sync ip6_mroute.c with this file. 
67 */ 68 69 #include <sys/cdefs.h> 70 __FBSDID("$FreeBSD$"); 71 72 #include "opt_inet.h" 73 #include "opt_mrouting.h" 74 75 #define _PIM_VT 1 76 77 #include <sys/param.h> 78 #include <sys/kernel.h> 79 #include <sys/stddef.h> 80 #include <sys/lock.h> 81 #include <sys/ktr.h> 82 #include <sys/malloc.h> 83 #include <sys/mbuf.h> 84 #include <sys/module.h> 85 #include <sys/priv.h> 86 #include <sys/protosw.h> 87 #include <sys/signalvar.h> 88 #include <sys/socket.h> 89 #include <sys/socketvar.h> 90 #include <sys/sockio.h> 91 #include <sys/sx.h> 92 #include <sys/sysctl.h> 93 #include <sys/syslog.h> 94 #include <sys/systm.h> 95 #include <sys/time.h> 96 97 #include <net/if.h> 98 #include <net/netisr.h> 99 #include <net/route.h> 100 #include <net/vnet.h> 101 102 #include <netinet/in.h> 103 #include <netinet/igmp.h> 104 #include <netinet/in_systm.h> 105 #include <netinet/in_var.h> 106 #include <netinet/ip.h> 107 #include <netinet/ip_encap.h> 108 #include <netinet/ip_mroute.h> 109 #include <netinet/ip_var.h> 110 #include <netinet/ip_options.h> 111 #include <netinet/pim.h> 112 #include <netinet/pim_var.h> 113 #include <netinet/udp.h> 114 115 #include <machine/in_cksum.h> 116 117 #ifndef KTR_IPMF 118 #define KTR_IPMF KTR_INET 119 #endif 120 121 #define VIFI_INVALID ((vifi_t) -1) 122 #define M_HASCL(m) ((m)->m_flags & M_EXT) 123 124 static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache"); 125 126 /* 127 * Locking. We use two locks: one for the virtual interface table and 128 * one for the forwarding table. These locks may be nested in which case 129 * the VIF lock must always be taken first. Note that each lock is used 130 * to cover not only the specific data structure but also related data 131 * structures. 
*/

/*
 * Mutex taken by ip_mrouter_init(), X_ip_mrouter_done() and
 * if_detached_event() around enabling/disabling the forwarder.
 * NOTE(review): the locking comment above mentions two locks, but three
 * mutexes are defined in this file (mrouter, mfc, vif) -- confirm intent.
 */
static struct mtx mrouter_mtx;
#define MROUTER_LOCK()          mtx_lock(&mrouter_mtx)
#define MROUTER_UNLOCK()        mtx_unlock(&mrouter_mtx)
#define MROUTER_LOCK_ASSERT()   mtx_assert(&mrouter_mtx, MA_OWNED)
#define MROUTER_LOCK_INIT()                                             \
        mtx_init(&mrouter_mtx, "IPv4 multicast forwarding", NULL, MTX_DEF)
#define MROUTER_LOCK_DESTROY()  mtx_destroy(&mrouter_mtx)

/* Forwarding statistics, exported read/write as sysctl net.inet.ip.mrtstat. */
static struct mrtstat   mrtstat;
SYSCTL_STRUCT(_net_inet_ip, OID_AUTO, mrtstat, CTLFLAG_RW,
    &mrtstat, mrtstat,
    "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
    "netinet/ip_mroute.h)");

/*
 * Multicast forwarding cache hash table.  `mfchash' is the bucket mask
 * filled in by hashinit_flags() and applied by MFCHASH() to a mix of the
 * origin and group addresses.
 */
static u_long                    mfchash;
#define MFCHASH(a, g)                                                   \
        ((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^        \
          ((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & mfchash)
#define MFCHASHSIZE     256

/* Per-bucket counters, indexed by the same hash value as mfchashtbl. */
static u_char                   *nexpire;       /* 0..mfchashsize-1 */
static u_long                    mfchashsize;   /* Hash size */
LIST_HEAD(mfchashhdr, mfc)      *mfchashtbl;

/* Mutex for the forwarding cache (mfchashtbl and its entries). */
static struct mtx mfc_mtx;
#define MFC_LOCK()              mtx_lock(&mfc_mtx)
#define MFC_UNLOCK()            mtx_unlock(&mfc_mtx)
#define MFC_LOCK_ASSERT()       mtx_assert(&mfc_mtx, MA_OWNED)
#define MFC_LOCK_INIT()                                                 \
        mtx_init(&mfc_mtx, "IPv4 multicast forwarding cache", NULL, MTX_DEF)
#define MFC_LOCK_DESTROY()      mtx_destroy(&mfc_mtx)

/*
 * Virtual interface table.  `numvifs' is one more than the highest vif
 * index in use; the table is exported read-only as net.inet.ip.viftable.
 */
static vifi_t                   numvifs;
static struct vif               viftable[MAXVIFS];
SYSCTL_OPAQUE(_net_inet_ip, OID_AUTO, viftable, CTLFLAG_RD,
    &viftable, sizeof(viftable), "S,vif[MAXVIFS]",
    "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");

/* Mutex for the vif table; taken before mfc_mtx when both are needed. */
static struct mtx vif_mtx;
#define VIF_LOCK()              mtx_lock(&vif_mtx)
#define VIF_UNLOCK()            mtx_unlock(&vif_mtx)
#define VIF_LOCK_ASSERT()       mtx_assert(&vif_mtx, MA_OWNED)
#define VIF_LOCK_INIT()                                                 \
        mtx_init(&vif_mtx, "IPv4 multicast interfaces", NULL, MTX_DEF)
#define VIF_LOCK_DESTROY()      mtx_destroy(&vif_mtx)

/* Tag of the ifnet_departure_event handler registered by ip_mrouter_init(). */
static eventhandler_tag if_detach_event_tag = NULL;

/* Callout that periodically runs expire_upcalls(). */
static struct callout expire_upcalls_ch;
#define EXPIRE_TIMEOUT  (hz / 4)        /* 4x / second          */
#define UPCALL_EXPIRE   6               /* number of timeouts   */

/*
 * Bandwidth meter variables and constants
 */
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");
/*
 * Pending timeouts are stored in a hash table, the key being the
 * expiration time. Periodically, the entries are analysed and processed.
 */
#define BW_METER_BUCKETS        1024
static struct bw_meter *bw_meter_timers[BW_METER_BUCKETS];
static struct callout bw_meter_ch;
#define BW_METER_PERIOD (hz)            /* periodical handling of bw meters */

/*
 * Pending upcalls are stored in a vector which is flushed when
 * full, or periodically
 */
static struct bw_upcall bw_upcalls[BW_UPCALLS_MAX];
static u_int    bw_upcalls_n; /* # of pending upcalls */
static struct callout bw_upcalls_ch;
#define BW_UPCALLS_PERIOD (hz)          /* periodical flush of bw upcalls */

/* PIM statistics, exported read-only under the net.inet.pim sysctl node. */
static struct pimstat pimstat;

SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW, 0, "PIM");
SYSCTL_STRUCT(_net_inet_pim, PIMCTL_STATS, stats, CTLFLAG_RD,
    &pimstat, pimstat,
    "PIM Statistics (struct pimstat, netinet/pim_var.h)");

static u_long   pim_squelch_wholepkt = 0;
SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RW,
    &pim_squelch_wholepkt, 0,
    "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");

/* Raw-socket protocol switch entry for IPPROTO_PIM; input goes to pim_input. */
extern struct domain inetdomain;
static const struct protosw in_pim_protosw = {
        .pr_type =              SOCK_RAW,
        .pr_domain =            &inetdomain,
        .pr_protocol =          IPPROTO_PIM,
        .pr_flags =             PR_ATOMIC|PR_ADDR|PR_LASTHDR,
        .pr_input =             pim_input,
        .pr_output =            (pr_output_t*)rip_output,
        .pr_ctloutput =         rip_ctloutput,
        .pr_usrreqs =           &rip_usrreqs
};
static const struct encaptab *pim_encap_cookie;

static int pim_encapcheck(const struct mbuf *, int, int, void *);

/*
 * Note: the PIM Register encapsulation adds the following in front of a
 * data packet:
 *
 * struct pim_encap_hdr {
 *    struct ip ip;
 *    struct pim_encap_pimhdr  pim;
 * }
 *
 */

struct pim_encap_pimhdr {
        struct pim pim;
        uint32_t   flags;
};
#define PIM_ENCAP_TTL   64

/*
 * Template outer IP header for PIM Register encapsulation.  The
 * initializer is positional, so the first two values (header length and
 * version) are swapped according to BYTE_ORDER.
 */
static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
        sizeof(struct ip) >> 2,
        IPVERSION,
#else
        IPVERSION,
        sizeof(struct ip) >> 2,
#endif
        0,                      /* tos */
        sizeof(struct ip),      /* total length */
        0,                      /* id */
        0,                      /* frag offset */
        PIM_ENCAP_TTL,
        IPPROTO_PIM,
        0,                      /* checksum */
};

/* Template PIM Register header that follows pim_encap_iphdr. */
static struct pim_encap_pimhdr pim_encap_pimhdr = {
    {
        PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
        0,                      /* reserved */
        0,                      /* checksum */
    },
    0                           /* flags */
};

/* Pseudo-interface backing the PIM register vif, and that vif's index. */
static struct ifnet multicast_register_if;
static vifi_t reg_vif_num = VIFI_INVALID;

/*
 * Forward declarations of the functions private to this file.
 */

static u_long   X_ip_mcast_src(int);
static int      X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
                    struct ip_moptions *);
static int      X_ip_mrouter_done(void);
static int      X_ip_mrouter_get(struct socket *, struct sockopt *);
static int      X_ip_mrouter_set(struct socket *, struct sockopt *);
static int      X_legal_vif_num(int);
static int      X_mrt_ioctl(u_long, caddr_t, int);

static int      add_bw_upcall(struct bw_upcall *);
static int      add_mfc(struct mfcctl2 *);
static int      add_vif(struct vifctl *);
static void     bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
static void     bw_meter_process(void);
static void     bw_meter_receive_packet(struct bw_meter *, int,
                    struct timeval *);
static void     bw_upcalls_send(void);
static int      del_bw_upcall(struct bw_upcall *);
static int      del_mfc(struct mfcctl2 *);
static int      del_vif(vifi_t);
static int      del_vif_locked(vifi_t);
static void     expire_bw_meter_process(void *);
static void     expire_bw_upcalls_send(void *);
static void     expire_mfc(struct mfc *);
static void     expire_upcalls(void *);
static void     free_bw_list(struct bw_meter *);
static int      get_sg_cnt(struct sioc_sg_req *);
static int      get_vif_cnt(struct sioc_vif_req *);
static void     if_detached_event(void *, struct ifnet *);
static int      ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static int      ip_mrouter_init(struct socket *, int);
static __inline struct mfc *
                mfc_find(struct in_addr *, struct in_addr *);
static void     phyint_send(struct ip *, struct vif *, struct mbuf *);
static struct mbuf *
                pim_register_prepare(struct ip *, struct mbuf *);
static int      pim_register_send(struct ip *, struct vif *,
                    struct mbuf *, struct mfc *);
static int      pim_register_send_rp(struct ip *, struct vif *,
                    struct mbuf *, struct mfc *);
static int      pim_register_send_upcall(struct ip *, struct vif *,
                    struct mbuf *, struct mfc *);
static void     schedule_bw_meter(struct bw_meter *, struct timeval *);
static void     send_packet(struct vif *, struct mbuf *);
static int      set_api_config(uint32_t *);
static int      set_assert(int);
static int      socket_send(struct socket *, struct mbuf *,
                    struct sockaddr_in *);
static void     unschedule_bw_meter(struct bw_meter *);

/*
 * Kernel multicast forwarding API capabilities and setup.
 * If more API capabilities are added to the kernel, they should be
 * recorded in `mrt_api_support'.
339 */ 340 #define MRT_API_VERSION 0x0305 341 342 static const int mrt_api_version = MRT_API_VERSION; 343 static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF | 344 MRT_MFC_FLAGS_BORDER_VIF | 345 MRT_MFC_RP | 346 MRT_MFC_BW_UPCALL); 347 static uint32_t mrt_api_config = 0; 348 349 static int pim_assert_enabled; 350 static struct timeval pim_assert_interval = { 3, 0 }; /* Rate limit */ 351 352 /* 353 * Find a route for a given origin IP address and multicast group address. 354 * Statistics must be updated by the caller. 355 */ 356 static __inline struct mfc * 357 mfc_find(struct in_addr *o, struct in_addr *g) 358 { 359 struct mfc *rt; 360 361 MFC_LOCK_ASSERT(); 362 363 LIST_FOREACH(rt, &mfchashtbl[MFCHASH(*o, *g)], mfc_hash) { 364 if (in_hosteq(rt->mfc_origin, *o) && 365 in_hosteq(rt->mfc_mcastgrp, *g) && 366 TAILQ_EMPTY(&rt->mfc_stall)) 367 break; 368 } 369 370 return (rt); 371 } 372 373 /* 374 * Handle MRT setsockopt commands to modify the multicast forwarding tables. 
375 */ 376 static int 377 X_ip_mrouter_set(struct socket *so, struct sockopt *sopt) 378 { 379 int error, optval; 380 vifi_t vifi; 381 struct vifctl vifc; 382 struct mfcctl2 mfc; 383 struct bw_upcall bw_upcall; 384 uint32_t i; 385 386 if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT) 387 return EPERM; 388 389 error = 0; 390 switch (sopt->sopt_name) { 391 case MRT_INIT: 392 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 393 if (error) 394 break; 395 error = ip_mrouter_init(so, optval); 396 break; 397 398 case MRT_DONE: 399 error = ip_mrouter_done(); 400 break; 401 402 case MRT_ADD_VIF: 403 error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc); 404 if (error) 405 break; 406 error = add_vif(&vifc); 407 break; 408 409 case MRT_DEL_VIF: 410 error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi); 411 if (error) 412 break; 413 error = del_vif(vifi); 414 break; 415 416 case MRT_ADD_MFC: 417 case MRT_DEL_MFC: 418 /* 419 * select data size depending on API version. 
420 */ 421 if (sopt->sopt_name == MRT_ADD_MFC && 422 mrt_api_config & MRT_API_FLAGS_ALL) { 423 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2), 424 sizeof(struct mfcctl2)); 425 } else { 426 error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl), 427 sizeof(struct mfcctl)); 428 bzero((caddr_t)&mfc + sizeof(struct mfcctl), 429 sizeof(mfc) - sizeof(struct mfcctl)); 430 } 431 if (error) 432 break; 433 if (sopt->sopt_name == MRT_ADD_MFC) 434 error = add_mfc(&mfc); 435 else 436 error = del_mfc(&mfc); 437 break; 438 439 case MRT_ASSERT: 440 error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); 441 if (error) 442 break; 443 set_assert(optval); 444 break; 445 446 case MRT_API_CONFIG: 447 error = sooptcopyin(sopt, &i, sizeof i, sizeof i); 448 if (!error) 449 error = set_api_config(&i); 450 if (!error) 451 error = sooptcopyout(sopt, &i, sizeof i); 452 break; 453 454 case MRT_ADD_BW_UPCALL: 455 case MRT_DEL_BW_UPCALL: 456 error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall, 457 sizeof bw_upcall); 458 if (error) 459 break; 460 if (sopt->sopt_name == MRT_ADD_BW_UPCALL) 461 error = add_bw_upcall(&bw_upcall); 462 else 463 error = del_bw_upcall(&bw_upcall); 464 break; 465 466 default: 467 error = EOPNOTSUPP; 468 break; 469 } 470 return error; 471 } 472 473 /* 474 * Handle MRT getsockopt commands 475 */ 476 static int 477 X_ip_mrouter_get(struct socket *so, struct sockopt *sopt) 478 { 479 int error; 480 481 switch (sopt->sopt_name) { 482 case MRT_VERSION: 483 error = sooptcopyout(sopt, &mrt_api_version, sizeof mrt_api_version); 484 break; 485 486 case MRT_ASSERT: 487 error = sooptcopyout(sopt, &pim_assert_enabled, 488 sizeof pim_assert_enabled); 489 break; 490 491 case MRT_API_SUPPORT: 492 error = sooptcopyout(sopt, &mrt_api_support, sizeof mrt_api_support); 493 break; 494 495 case MRT_API_CONFIG: 496 error = sooptcopyout(sopt, &mrt_api_config, sizeof mrt_api_config); 497 break; 498 499 default: 500 error = EOPNOTSUPP; 501 break; 502 } 503 return error; 504 } 
505 506 /* 507 * Handle ioctl commands to obtain information from the cache 508 */ 509 static int 510 X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused) 511 { 512 int error = 0; 513 514 /* 515 * Currently the only function calling this ioctl routine is rtioctl(). 516 * Typically, only root can create the raw socket in order to execute 517 * this ioctl method, however the request might be coming from a prison 518 */ 519 error = priv_check(curthread, PRIV_NETINET_MROUTE); 520 if (error) 521 return (error); 522 switch (cmd) { 523 case (SIOCGETVIFCNT): 524 error = get_vif_cnt((struct sioc_vif_req *)data); 525 break; 526 527 case (SIOCGETSGCNT): 528 error = get_sg_cnt((struct sioc_sg_req *)data); 529 break; 530 531 default: 532 error = EINVAL; 533 break; 534 } 535 return error; 536 } 537 538 /* 539 * returns the packet, byte, rpf-failure count for the source group provided 540 */ 541 static int 542 get_sg_cnt(struct sioc_sg_req *req) 543 { 544 struct mfc *rt; 545 546 MFC_LOCK(); 547 rt = mfc_find(&req->src, &req->grp); 548 if (rt == NULL) { 549 MFC_UNLOCK(); 550 req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff; 551 return EADDRNOTAVAIL; 552 } 553 req->pktcnt = rt->mfc_pkt_cnt; 554 req->bytecnt = rt->mfc_byte_cnt; 555 req->wrong_if = rt->mfc_wrong_if; 556 MFC_UNLOCK(); 557 return 0; 558 } 559 560 /* 561 * returns the input and output packet and byte counts on the vif provided 562 */ 563 static int 564 get_vif_cnt(struct sioc_vif_req *req) 565 { 566 vifi_t vifi = req->vifi; 567 568 VIF_LOCK(); 569 if (vifi >= numvifs) { 570 VIF_UNLOCK(); 571 return EINVAL; 572 } 573 574 req->icount = viftable[vifi].v_pkt_in; 575 req->ocount = viftable[vifi].v_pkt_out; 576 req->ibytes = viftable[vifi].v_bytes_in; 577 req->obytes = viftable[vifi].v_bytes_out; 578 VIF_UNLOCK(); 579 580 return 0; 581 } 582 583 static void 584 ip_mrouter_reset(void) 585 { 586 587 pim_assert_enabled = 0; 588 mrt_api_config = 0; 589 590 callout_init(&expire_upcalls_ch, CALLOUT_MPSAFE); 591 592 
bw_upcalls_n = 0; 593 bzero((caddr_t)bw_meter_timers, sizeof(bw_meter_timers)); 594 callout_init(&bw_upcalls_ch, CALLOUT_MPSAFE); 595 callout_init(&bw_meter_ch, CALLOUT_MPSAFE); 596 } 597 598 static void 599 if_detached_event(void *arg __unused, struct ifnet *ifp) 600 { 601 vifi_t vifi; 602 int i; 603 604 MROUTER_LOCK(); 605 606 if (V_ip_mrouter == NULL) { 607 MROUTER_UNLOCK(); 608 return; 609 } 610 611 VIF_LOCK(); 612 MFC_LOCK(); 613 614 /* 615 * Tear down multicast forwarder state associated with this ifnet. 616 * 1. Walk the vif list, matching vifs against this ifnet. 617 * 2. Walk the multicast forwarding cache (mfc) looking for 618 * inner matches with this vif's index. 619 * 3. Expire any matching multicast forwarding cache entries. 620 * 4. Free vif state. This should disable ALLMULTI on the interface. 621 */ 622 for (vifi = 0; vifi < numvifs; vifi++) { 623 if (viftable[vifi].v_ifp != ifp) 624 continue; 625 for (i = 0; i < mfchashsize; i++) { 626 struct mfc *rt, *nrt; 627 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 628 nrt = LIST_NEXT(rt, mfc_hash); 629 if (rt->mfc_parent == vifi) { 630 expire_mfc(rt); 631 } 632 } 633 } 634 del_vif_locked(vifi); 635 } 636 637 MFC_UNLOCK(); 638 VIF_UNLOCK(); 639 640 MROUTER_UNLOCK(); 641 } 642 643 /* 644 * Enable multicast forwarding. 
645 */ 646 static int 647 ip_mrouter_init(struct socket *so, int version) 648 { 649 650 CTR3(KTR_IPMF, "%s: so_type %d, pr_protocol %d", __func__, 651 so->so_type, so->so_proto->pr_protocol); 652 653 if (so->so_type != SOCK_RAW || so->so_proto->pr_protocol != IPPROTO_IGMP) 654 return EOPNOTSUPP; 655 656 if (version != 1) 657 return ENOPROTOOPT; 658 659 MROUTER_LOCK(); 660 661 if (V_ip_mrouter != NULL) { 662 MROUTER_UNLOCK(); 663 return EADDRINUSE; 664 } 665 666 if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event, 667 if_detached_event, NULL, EVENTHANDLER_PRI_ANY); 668 if (if_detach_event_tag == NULL) { 669 MROUTER_UNLOCK(); 670 return (ENOMEM); 671 } 672 673 mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &mfchash, HASH_NOWAIT); 674 675 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, NULL); 676 677 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 678 expire_bw_upcalls_send, NULL); 679 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 680 681 V_ip_mrouter = so; 682 683 MROUTER_UNLOCK(); 684 685 CTR1(KTR_IPMF, "%s: done", __func__); 686 687 return 0; 688 } 689 690 /* 691 * Disable multicast forwarding. 692 */ 693 static int 694 X_ip_mrouter_done(void) 695 { 696 vifi_t vifi; 697 int i; 698 struct ifnet *ifp; 699 struct ifreq ifr; 700 701 MROUTER_LOCK(); 702 703 if (V_ip_mrouter == NULL) { 704 MROUTER_UNLOCK(); 705 return EINVAL; 706 } 707 708 /* 709 * Detach/disable hooks to the reset of the system. 710 */ 711 V_ip_mrouter = NULL; 712 mrt_api_config = 0; 713 714 VIF_LOCK(); 715 716 /* 717 * For each phyint in use, disable promiscuous reception of all IP 718 * multicasts. 
719 */ 720 for (vifi = 0; vifi < numvifs; vifi++) { 721 if (!in_nullhost(viftable[vifi].v_lcl_addr) && 722 !(viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) { 723 struct sockaddr_in *so = (struct sockaddr_in *)&(ifr.ifr_addr); 724 725 so->sin_len = sizeof(struct sockaddr_in); 726 so->sin_family = AF_INET; 727 so->sin_addr.s_addr = INADDR_ANY; 728 ifp = viftable[vifi].v_ifp; 729 if_allmulti(ifp, 0); 730 } 731 } 732 bzero((caddr_t)viftable, sizeof(viftable)); 733 numvifs = 0; 734 pim_assert_enabled = 0; 735 736 VIF_UNLOCK(); 737 738 EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag); 739 740 callout_stop(&expire_upcalls_ch); 741 callout_stop(&bw_upcalls_ch); 742 callout_stop(&bw_meter_ch); 743 744 MFC_LOCK(); 745 746 /* 747 * Free all multicast forwarding cache entries. 748 * Do not use hashdestroy(), as we must perform other cleanup. 749 */ 750 for (i = 0; i < mfchashsize; i++) { 751 struct mfc *rt, *nrt; 752 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 753 nrt = LIST_NEXT(rt, mfc_hash); 754 expire_mfc(rt); 755 } 756 } 757 free(mfchashtbl, M_MRTABLE); 758 mfchashtbl = NULL; 759 760 bzero(nexpire, sizeof(nexpire[0]) * mfchashsize); 761 762 bw_upcalls_n = 0; 763 bzero(bw_meter_timers, sizeof(bw_meter_timers)); 764 765 MFC_UNLOCK(); 766 767 reg_vif_num = VIFI_INVALID; 768 769 MROUTER_UNLOCK(); 770 771 CTR1(KTR_IPMF, "%s: done", __func__); 772 773 return 0; 774 } 775 776 /* 777 * Set PIM assert processing global 778 */ 779 static int 780 set_assert(int i) 781 { 782 if ((i != 1) && (i != 0)) 783 return EINVAL; 784 785 pim_assert_enabled = i; 786 787 return 0; 788 } 789 790 /* 791 * Configure API capabilities 792 */ 793 int 794 set_api_config(uint32_t *apival) 795 { 796 int i; 797 798 /* 799 * We can set the API capabilities only if it is the first operation 800 * after MRT_INIT. 
I.e.: 801 * - there are no vifs installed 802 * - pim_assert is not enabled 803 * - the MFC table is empty 804 */ 805 if (numvifs > 0) { 806 *apival = 0; 807 return EPERM; 808 } 809 if (pim_assert_enabled) { 810 *apival = 0; 811 return EPERM; 812 } 813 814 MFC_LOCK(); 815 816 for (i = 0; i < mfchashsize; i++) { 817 if (LIST_FIRST(&mfchashtbl[i]) != NULL) { 818 *apival = 0; 819 return EPERM; 820 } 821 } 822 823 MFC_UNLOCK(); 824 825 mrt_api_config = *apival & mrt_api_support; 826 *apival = mrt_api_config; 827 828 return 0; 829 } 830 831 /* 832 * Add a vif to the vif table 833 */ 834 static int 835 add_vif(struct vifctl *vifcp) 836 { 837 struct vif *vifp = viftable + vifcp->vifc_vifi; 838 struct sockaddr_in sin = {sizeof sin, AF_INET}; 839 struct ifaddr *ifa; 840 struct ifnet *ifp; 841 int error; 842 843 VIF_LOCK(); 844 if (vifcp->vifc_vifi >= MAXVIFS) { 845 VIF_UNLOCK(); 846 return EINVAL; 847 } 848 /* rate limiting is no longer supported by this code */ 849 if (vifcp->vifc_rate_limit != 0) { 850 log(LOG_ERR, "rate limiting is no longer supported\n"); 851 VIF_UNLOCK(); 852 return EINVAL; 853 } 854 if (!in_nullhost(vifp->v_lcl_addr)) { 855 VIF_UNLOCK(); 856 return EADDRINUSE; 857 } 858 if (in_nullhost(vifcp->vifc_lcl_addr)) { 859 VIF_UNLOCK(); 860 return EADDRNOTAVAIL; 861 } 862 863 /* Find the interface with an address in AF_INET family */ 864 if (vifcp->vifc_flags & VIFF_REGISTER) { 865 /* 866 * XXX: Because VIFF_REGISTER does not really need a valid 867 * local interface (e.g. it could be 127.0.0.2), we don't 868 * check its address. 
869 */ 870 ifp = NULL; 871 } else { 872 sin.sin_addr = vifcp->vifc_lcl_addr; 873 ifa = ifa_ifwithaddr((struct sockaddr *)&sin); 874 if (ifa == NULL) { 875 VIF_UNLOCK(); 876 return EADDRNOTAVAIL; 877 } 878 ifp = ifa->ifa_ifp; 879 ifa_free(ifa); 880 } 881 882 if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) { 883 CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__); 884 VIF_UNLOCK(); 885 return EOPNOTSUPP; 886 } else if (vifcp->vifc_flags & VIFF_REGISTER) { 887 ifp = &multicast_register_if; 888 CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp); 889 if (reg_vif_num == VIFI_INVALID) { 890 if_initname(&multicast_register_if, "register_vif", 0); 891 multicast_register_if.if_flags = IFF_LOOPBACK; 892 reg_vif_num = vifcp->vifc_vifi; 893 } 894 } else { /* Make sure the interface supports multicast */ 895 if ((ifp->if_flags & IFF_MULTICAST) == 0) { 896 VIF_UNLOCK(); 897 return EOPNOTSUPP; 898 } 899 900 /* Enable promiscuous reception of all IP multicasts from the if */ 901 error = if_allmulti(ifp, 1); 902 if (error) { 903 VIF_UNLOCK(); 904 return error; 905 } 906 } 907 908 vifp->v_flags = vifcp->vifc_flags; 909 vifp->v_threshold = vifcp->vifc_threshold; 910 vifp->v_lcl_addr = vifcp->vifc_lcl_addr; 911 vifp->v_rmt_addr = vifcp->vifc_rmt_addr; 912 vifp->v_ifp = ifp; 913 /* initialize per vif pkt counters */ 914 vifp->v_pkt_in = 0; 915 vifp->v_pkt_out = 0; 916 vifp->v_bytes_in = 0; 917 vifp->v_bytes_out = 0; 918 bzero(&vifp->v_route, sizeof(vifp->v_route)); 919 920 /* Adjust numvifs up if the vifi is higher than numvifs */ 921 if (numvifs <= vifcp->vifc_vifi) 922 numvifs = vifcp->vifc_vifi + 1; 923 924 VIF_UNLOCK(); 925 926 CTR4(KTR_IPMF, "%s: add vif %d laddr %s thresh %x", __func__, 927 (int)vifcp->vifc_vifi, inet_ntoa(vifcp->vifc_lcl_addr), 928 (int)vifcp->vifc_threshold); 929 930 return 0; 931 } 932 933 /* 934 * Delete a vif from the vif table 935 */ 936 static int 937 del_vif_locked(vifi_t vifi) 938 { 939 struct vif *vifp; 940 941 VIF_LOCK_ASSERT(); 
942 943 if (vifi >= numvifs) { 944 return EINVAL; 945 } 946 vifp = &viftable[vifi]; 947 if (in_nullhost(vifp->v_lcl_addr)) { 948 return EADDRNOTAVAIL; 949 } 950 951 if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) 952 if_allmulti(vifp->v_ifp, 0); 953 954 if (vifp->v_flags & VIFF_REGISTER) 955 reg_vif_num = VIFI_INVALID; 956 957 bzero((caddr_t)vifp, sizeof (*vifp)); 958 959 CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi); 960 961 /* Adjust numvifs down */ 962 for (vifi = numvifs; vifi > 0; vifi--) 963 if (!in_nullhost(viftable[vifi-1].v_lcl_addr)) 964 break; 965 numvifs = vifi; 966 967 return 0; 968 } 969 970 static int 971 del_vif(vifi_t vifi) 972 { 973 int cc; 974 975 VIF_LOCK(); 976 cc = del_vif_locked(vifi); 977 VIF_UNLOCK(); 978 979 return cc; 980 } 981 982 /* 983 * update an mfc entry without resetting counters and S,G addresses. 984 */ 985 static void 986 update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 987 { 988 int i; 989 990 rt->mfc_parent = mfccp->mfcc_parent; 991 for (i = 0; i < numvifs; i++) { 992 rt->mfc_ttls[i] = mfccp->mfcc_ttls[i]; 993 rt->mfc_flags[i] = mfccp->mfcc_flags[i] & mrt_api_config & 994 MRT_MFC_FLAGS_ALL; 995 } 996 /* set the RP address */ 997 if (mrt_api_config & MRT_MFC_RP) 998 rt->mfc_rp = mfccp->mfcc_rp; 999 else 1000 rt->mfc_rp.s_addr = INADDR_ANY; 1001 } 1002 1003 /* 1004 * fully initialize an mfc entry from the parameter. 
1005 */ 1006 static void 1007 init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp) 1008 { 1009 rt->mfc_origin = mfccp->mfcc_origin; 1010 rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp; 1011 1012 update_mfc_params(rt, mfccp); 1013 1014 /* initialize pkt counters per src-grp */ 1015 rt->mfc_pkt_cnt = 0; 1016 rt->mfc_byte_cnt = 0; 1017 rt->mfc_wrong_if = 0; 1018 timevalclear(&rt->mfc_last_assert); 1019 } 1020 1021 static void 1022 expire_mfc(struct mfc *rt) 1023 { 1024 struct rtdetq *rte, *nrte; 1025 1026 MFC_LOCK_ASSERT(); 1027 1028 free_bw_list(rt->mfc_bw_meter); 1029 1030 TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { 1031 m_freem(rte->m); 1032 TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); 1033 free(rte, M_MRTABLE); 1034 } 1035 1036 LIST_REMOVE(rt, mfc_hash); 1037 free(rt, M_MRTABLE); 1038 } 1039 1040 /* 1041 * Add an mfc entry 1042 */ 1043 static int 1044 add_mfc(struct mfcctl2 *mfccp) 1045 { 1046 struct mfc *rt; 1047 struct rtdetq *rte, *nrte; 1048 u_long hash = 0; 1049 u_short nstl; 1050 1051 VIF_LOCK(); 1052 MFC_LOCK(); 1053 1054 rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp); 1055 1056 /* If an entry already exists, just update the fields */ 1057 if (rt) { 1058 CTR4(KTR_IPMF, "%s: update mfc orig %s group %lx parent %x", 1059 __func__, inet_ntoa(mfccp->mfcc_origin), 1060 (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1061 mfccp->mfcc_parent); 1062 update_mfc_params(rt, mfccp); 1063 MFC_UNLOCK(); 1064 VIF_UNLOCK(); 1065 return (0); 1066 } 1067 1068 /* 1069 * Find the entry for which the upcall was made and update 1070 */ 1071 nstl = 0; 1072 hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp); 1073 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1074 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1075 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) && 1076 !TAILQ_EMPTY(&rt->mfc_stall)) { 1077 CTR5(KTR_IPMF, 1078 "%s: add mfc orig %s group %lx parent %x qh %p", 1079 __func__, inet_ntoa(mfccp->mfcc_origin), 1080 
(u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr), 1081 mfccp->mfcc_parent, 1082 TAILQ_FIRST(&rt->mfc_stall)); 1083 if (nstl++) 1084 CTR1(KTR_IPMF, "%s: multiple matches", __func__); 1085 1086 init_mfc_params(rt, mfccp); 1087 rt->mfc_expire = 0; /* Don't clean this guy up */ 1088 nexpire[hash]--; 1089 1090 /* Free queued packets, but attempt to forward them first. */ 1091 TAILQ_FOREACH_SAFE(rte, &rt->mfc_stall, rte_link, nrte) { 1092 if (rte->ifp != NULL) 1093 ip_mdq(rte->m, rte->ifp, rt, -1); 1094 m_freem(rte->m); 1095 TAILQ_REMOVE(&rt->mfc_stall, rte, rte_link); 1096 rt->mfc_nstall--; 1097 free(rte, M_MRTABLE); 1098 } 1099 } 1100 } 1101 1102 /* 1103 * It is possible that an entry is being inserted without an upcall 1104 */ 1105 if (nstl == 0) { 1106 CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__); 1107 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1108 if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) && 1109 in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) { 1110 init_mfc_params(rt, mfccp); 1111 if (rt->mfc_expire) 1112 nexpire[hash]--; 1113 rt->mfc_expire = 0; 1114 break; /* XXX */ 1115 } 1116 } 1117 1118 if (rt == NULL) { /* no upcall, so make a new entry */ 1119 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1120 if (rt == NULL) { 1121 MFC_UNLOCK(); 1122 VIF_UNLOCK(); 1123 return (ENOBUFS); 1124 } 1125 1126 init_mfc_params(rt, mfccp); 1127 TAILQ_INIT(&rt->mfc_stall); 1128 rt->mfc_nstall = 0; 1129 1130 rt->mfc_expire = 0; 1131 rt->mfc_bw_meter = NULL; 1132 1133 /* insert new entry at head of hash chain */ 1134 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1135 } 1136 } 1137 1138 MFC_UNLOCK(); 1139 VIF_UNLOCK(); 1140 1141 return (0); 1142 } 1143 1144 /* 1145 * Delete an mfc entry 1146 */ 1147 static int 1148 del_mfc(struct mfcctl2 *mfccp) 1149 { 1150 struct in_addr origin; 1151 struct in_addr mcastgrp; 1152 struct mfc *rt; 1153 1154 origin = mfccp->mfcc_origin; 1155 mcastgrp = mfccp->mfcc_mcastgrp; 1156 1157 CTR3(KTR_IPMF, "%s: delete mfc 
orig %s group %lx", __func__, 1158 inet_ntoa(origin), (u_long)ntohl(mcastgrp.s_addr)); 1159 1160 MFC_LOCK(); 1161 1162 rt = mfc_find(&origin, &mcastgrp); 1163 if (rt == NULL) { 1164 MFC_UNLOCK(); 1165 return EADDRNOTAVAIL; 1166 } 1167 1168 /* 1169 * free the bw_meter entries 1170 */ 1171 free_bw_list(rt->mfc_bw_meter); 1172 rt->mfc_bw_meter = NULL; 1173 1174 LIST_REMOVE(rt, mfc_hash); 1175 free(rt, M_MRTABLE); 1176 1177 MFC_UNLOCK(); 1178 1179 return (0); 1180 } 1181 1182 /* 1183 * Send a message to the routing daemon on the multicast routing socket. 1184 */ 1185 static int 1186 socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src) 1187 { 1188 if (s) { 1189 SOCKBUF_LOCK(&s->so_rcv); 1190 if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm, 1191 NULL) != 0) { 1192 sorwakeup_locked(s); 1193 return 0; 1194 } 1195 SOCKBUF_UNLOCK(&s->so_rcv); 1196 } 1197 m_freem(mm); 1198 return -1; 1199 } 1200 1201 /* 1202 * IP multicast forwarding function. This function assumes that the packet 1203 * pointed to by "ip" has arrived on (or is about to be sent to) the interface 1204 * pointed to by "ifp", and the packet is to be relayed to other networks 1205 * that have members of the packet's destination IP multicast group. 1206 * 1207 * The packet is returned unscathed to the caller, unless it is 1208 * erroneous, in which case a non-zero return value tells the caller to 1209 * discard it. 
1210 */ 1211 1212 #define TUNNEL_LEN 12 /* # bytes of IP option for tunnel encapsulation */ 1213 1214 static int 1215 X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m, 1216 struct ip_moptions *imo) 1217 { 1218 struct mfc *rt; 1219 int error; 1220 vifi_t vifi; 1221 1222 CTR3(KTR_IPMF, "ip_mforward: delete mfc orig %s group %lx ifp %p", 1223 inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr), ifp); 1224 1225 if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 || 1226 ((u_char *)(ip + 1))[1] != IPOPT_LSRR ) { 1227 /* 1228 * Packet arrived via a physical interface or 1229 * an encapsulated tunnel or a register_vif. 1230 */ 1231 } else { 1232 /* 1233 * Packet arrived through a source-route tunnel. 1234 * Source-route tunnels are no longer supported. 1235 */ 1236 return (1); 1237 } 1238 1239 VIF_LOCK(); 1240 MFC_LOCK(); 1241 if (imo && ((vifi = imo->imo_multicast_vif) < numvifs)) { 1242 if (ip->ip_ttl < MAXTTL) 1243 ip->ip_ttl++; /* compensate for -1 in *_send routines */ 1244 error = ip_mdq(m, ifp, NULL, vifi); 1245 MFC_UNLOCK(); 1246 VIF_UNLOCK(); 1247 return error; 1248 } 1249 1250 /* 1251 * Don't forward a packet with time-to-live of zero or one, 1252 * or a packet destined to a local-only group. 
1253 */ 1254 if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) { 1255 MFC_UNLOCK(); 1256 VIF_UNLOCK(); 1257 return 0; 1258 } 1259 1260 /* 1261 * Determine forwarding vifs from the forwarding cache table 1262 */ 1263 MRTSTAT_INC(mrts_mfc_lookups); 1264 rt = mfc_find(&ip->ip_src, &ip->ip_dst); 1265 1266 /* Entry exists, so forward if necessary */ 1267 if (rt != NULL) { 1268 error = ip_mdq(m, ifp, rt, -1); 1269 MFC_UNLOCK(); 1270 VIF_UNLOCK(); 1271 return error; 1272 } else { 1273 /* 1274 * If we don't have a route for packet's origin, 1275 * Make a copy of the packet & send message to routing daemon 1276 */ 1277 1278 struct mbuf *mb0; 1279 struct rtdetq *rte; 1280 u_long hash; 1281 int hlen = ip->ip_hl << 2; 1282 1283 MRTSTAT_INC(mrts_mfc_misses); 1284 MRTSTAT_INC(mrts_no_route); 1285 CTR2(KTR_IPMF, "ip_mforward: no mfc for (%s,%lx)", 1286 inet_ntoa(ip->ip_src), (u_long)ntohl(ip->ip_dst.s_addr)); 1287 1288 /* 1289 * Allocate mbufs early so that we don't do extra work if we are 1290 * just going to fail anyway. Make sure to pullup the header so 1291 * that other people can't step on it. 1292 */ 1293 rte = (struct rtdetq *)malloc((sizeof *rte), M_MRTABLE, 1294 M_NOWAIT|M_ZERO); 1295 if (rte == NULL) { 1296 MFC_UNLOCK(); 1297 VIF_UNLOCK(); 1298 return ENOBUFS; 1299 } 1300 1301 mb0 = m_copypacket(m, M_DONTWAIT); 1302 if (mb0 && (M_HASCL(mb0) || mb0->m_len < hlen)) 1303 mb0 = m_pullup(mb0, hlen); 1304 if (mb0 == NULL) { 1305 free(rte, M_MRTABLE); 1306 MFC_UNLOCK(); 1307 VIF_UNLOCK(); 1308 return ENOBUFS; 1309 } 1310 1311 /* is there an upcall waiting for this flow ? 
*/ 1312 hash = MFCHASH(ip->ip_src, ip->ip_dst); 1313 LIST_FOREACH(rt, &mfchashtbl[hash], mfc_hash) { 1314 if (in_hosteq(ip->ip_src, rt->mfc_origin) && 1315 in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) && 1316 !TAILQ_EMPTY(&rt->mfc_stall)) 1317 break; 1318 } 1319 1320 if (rt == NULL) { 1321 int i; 1322 struct igmpmsg *im; 1323 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 1324 struct mbuf *mm; 1325 1326 /* 1327 * Locate the vifi for the incoming interface for this packet. 1328 * If none found, drop packet. 1329 */ 1330 for (vifi = 0; vifi < numvifs && 1331 viftable[vifi].v_ifp != ifp; vifi++) 1332 ; 1333 if (vifi >= numvifs) /* vif not found, drop packet */ 1334 goto non_fatal; 1335 1336 /* no upcall, so make a new entry */ 1337 rt = (struct mfc *)malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT); 1338 if (rt == NULL) 1339 goto fail; 1340 1341 /* Make a copy of the header to send to the user level process */ 1342 mm = m_copy(mb0, 0, hlen); 1343 if (mm == NULL) 1344 goto fail1; 1345 1346 /* 1347 * Send message to routing daemon to install 1348 * a route into the kernel table 1349 */ 1350 1351 im = mtod(mm, struct igmpmsg *); 1352 im->im_msgtype = IGMPMSG_NOCACHE; 1353 im->im_mbz = 0; 1354 im->im_vif = vifi; 1355 1356 MRTSTAT_INC(mrts_upcalls); 1357 1358 k_igmpsrc.sin_addr = ip->ip_src; 1359 if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { 1360 CTR0(KTR_IPMF, "ip_mforward: socket queue full"); 1361 MRTSTAT_INC(mrts_upq_sockfull); 1362 fail1: 1363 free(rt, M_MRTABLE); 1364 fail: 1365 free(rte, M_MRTABLE); 1366 m_freem(mb0); 1367 MFC_UNLOCK(); 1368 VIF_UNLOCK(); 1369 return ENOBUFS; 1370 } 1371 1372 /* insert new entry at head of hash chain */ 1373 rt->mfc_origin.s_addr = ip->ip_src.s_addr; 1374 rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr; 1375 rt->mfc_expire = UPCALL_EXPIRE; 1376 nexpire[hash]++; 1377 for (i = 0; i < numvifs; i++) { 1378 rt->mfc_ttls[i] = 0; 1379 rt->mfc_flags[i] = 0; 1380 } 1381 rt->mfc_parent = -1; 1382 1383 /* clear the RP address */ 1384 
rt->mfc_rp.s_addr = INADDR_ANY; 1385 rt->mfc_bw_meter = NULL; 1386 1387 /* initialize pkt counters per src-grp */ 1388 rt->mfc_pkt_cnt = 0; 1389 rt->mfc_byte_cnt = 0; 1390 rt->mfc_wrong_if = 0; 1391 timevalclear(&rt->mfc_last_assert); 1392 1393 TAILQ_INIT(&rt->mfc_stall); 1394 rt->mfc_nstall = 0; 1395 1396 /* link into table */ 1397 LIST_INSERT_HEAD(&mfchashtbl[hash], rt, mfc_hash); 1398 TAILQ_INSERT_HEAD(&rt->mfc_stall, rte, rte_link); 1399 rt->mfc_nstall++; 1400 1401 } else { 1402 /* determine if queue has overflowed */ 1403 if (rt->mfc_nstall > MAX_UPQ) { 1404 MRTSTAT_INC(mrts_upq_ovflw); 1405 non_fatal: 1406 free(rte, M_MRTABLE); 1407 m_freem(mb0); 1408 MFC_UNLOCK(); 1409 VIF_UNLOCK(); 1410 return (0); 1411 } 1412 TAILQ_INSERT_TAIL(&rt->mfc_stall, rte, rte_link); 1413 rt->mfc_nstall++; 1414 } 1415 1416 rte->m = mb0; 1417 rte->ifp = ifp; 1418 1419 MFC_UNLOCK(); 1420 VIF_UNLOCK(); 1421 1422 return 0; 1423 } 1424 } 1425 1426 /* 1427 * Clean up the cache entry if upcall is not serviced 1428 */ 1429 static void 1430 expire_upcalls(void *unused) 1431 { 1432 int i; 1433 1434 MFC_LOCK(); 1435 1436 for (i = 0; i < mfchashsize; i++) { 1437 struct mfc *rt, *nrt; 1438 1439 if (nexpire[i] == 0) 1440 continue; 1441 1442 for (rt = LIST_FIRST(&mfchashtbl[i]); rt; rt = nrt) { 1443 nrt = LIST_NEXT(rt, mfc_hash); 1444 1445 if (TAILQ_EMPTY(&rt->mfc_stall)) 1446 continue; 1447 1448 if (rt->mfc_expire == 0 || --rt->mfc_expire > 0) 1449 continue; 1450 1451 /* 1452 * free the bw_meter entries 1453 */ 1454 while (rt->mfc_bw_meter != NULL) { 1455 struct bw_meter *x = rt->mfc_bw_meter; 1456 1457 rt->mfc_bw_meter = x->bm_mfc_next; 1458 free(x, M_BWMETER); 1459 } 1460 1461 MRTSTAT_INC(mrts_cache_cleanups); 1462 CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__, 1463 (u_long)ntohl(rt->mfc_origin.s_addr), 1464 (u_long)ntohl(rt->mfc_mcastgrp.s_addr)); 1465 1466 expire_mfc(rt); 1467 } 1468 } 1469 1470 MFC_UNLOCK(); 1471 1472 callout_reset(&expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls, 
NULL); 1473 } 1474 1475 /* 1476 * Packet forwarding routine once entry in the cache is made 1477 */ 1478 static int 1479 ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif) 1480 { 1481 struct ip *ip = mtod(m, struct ip *); 1482 vifi_t vifi; 1483 int plen = ip->ip_len; 1484 1485 VIF_LOCK_ASSERT(); 1486 1487 /* 1488 * If xmt_vif is not -1, send on only the requested vif. 1489 * 1490 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which > numvifs.) 1491 */ 1492 if (xmt_vif < numvifs) { 1493 if (viftable[xmt_vif].v_flags & VIFF_REGISTER) 1494 pim_register_send(ip, viftable + xmt_vif, m, rt); 1495 else 1496 phyint_send(ip, viftable + xmt_vif, m); 1497 return 1; 1498 } 1499 1500 /* 1501 * Don't forward if it didn't arrive from the parent vif for its origin. 1502 */ 1503 vifi = rt->mfc_parent; 1504 if ((vifi >= numvifs) || (viftable[vifi].v_ifp != ifp)) { 1505 CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)", 1506 __func__, ifp, (int)vifi, viftable[vifi].v_ifp); 1507 MRTSTAT_INC(mrts_wrong_if); 1508 ++rt->mfc_wrong_if; 1509 /* 1510 * If we are doing PIM assert processing, send a message 1511 * to the routing daemon. 1512 * 1513 * XXX: A PIM-SM router needs the WRONGVIF detection so it 1514 * can complete the SPT switch, regardless of the type 1515 * of the iif (broadcast media, GRE tunnel, etc). 1516 */ 1517 if (pim_assert_enabled && (vifi < numvifs) && viftable[vifi].v_ifp) { 1518 1519 if (ifp == &multicast_register_if) 1520 PIMSTAT_INC(pims_rcv_registers_wrongiif); 1521 1522 /* Get vifi for the incoming packet */ 1523 for (vifi=0; vifi < numvifs && viftable[vifi].v_ifp != ifp; vifi++) 1524 ; 1525 if (vifi >= numvifs) 1526 return 0; /* The iif is not found: ignore the packet. 
*/ 1527 1528 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF) 1529 return 0; /* WRONGVIF disabled: ignore the packet */ 1530 1531 if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) { 1532 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 1533 struct igmpmsg *im; 1534 int hlen = ip->ip_hl << 2; 1535 struct mbuf *mm = m_copy(m, 0, hlen); 1536 1537 if (mm && (M_HASCL(mm) || mm->m_len < hlen)) 1538 mm = m_pullup(mm, hlen); 1539 if (mm == NULL) 1540 return ENOBUFS; 1541 1542 im = mtod(mm, struct igmpmsg *); 1543 im->im_msgtype = IGMPMSG_WRONGVIF; 1544 im->im_mbz = 0; 1545 im->im_vif = vifi; 1546 1547 MRTSTAT_INC(mrts_upcalls); 1548 1549 k_igmpsrc.sin_addr = im->im_src; 1550 if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) { 1551 CTR1(KTR_IPMF, "%s: socket queue full", __func__); 1552 MRTSTAT_INC(mrts_upq_sockfull); 1553 return ENOBUFS; 1554 } 1555 } 1556 } 1557 return 0; 1558 } 1559 1560 1561 /* If I sourced this packet, it counts as output, else it was input. */ 1562 if (in_hosteq(ip->ip_src, viftable[vifi].v_lcl_addr)) { 1563 viftable[vifi].v_pkt_out++; 1564 viftable[vifi].v_bytes_out += plen; 1565 } else { 1566 viftable[vifi].v_pkt_in++; 1567 viftable[vifi].v_bytes_in += plen; 1568 } 1569 rt->mfc_pkt_cnt++; 1570 rt->mfc_byte_cnt += plen; 1571 1572 /* 1573 * For each vif, decide if a copy of the packet should be forwarded. 1574 * Forward if: 1575 * - the ttl exceeds the vif's threshold 1576 * - there are group members downstream on interface 1577 */ 1578 for (vifi = 0; vifi < numvifs; vifi++) 1579 if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) { 1580 viftable[vifi].v_pkt_out++; 1581 viftable[vifi].v_bytes_out += plen; 1582 if (viftable[vifi].v_flags & VIFF_REGISTER) 1583 pim_register_send(ip, viftable + vifi, m, rt); 1584 else 1585 phyint_send(ip, viftable + vifi, m); 1586 } 1587 1588 /* 1589 * Perform upcall-related bw measuring. 
1590 */ 1591 if (rt->mfc_bw_meter != NULL) { 1592 struct bw_meter *x; 1593 struct timeval now; 1594 1595 microtime(&now); 1596 MFC_LOCK_ASSERT(); 1597 for (x = rt->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) 1598 bw_meter_receive_packet(x, plen, &now); 1599 } 1600 1601 return 0; 1602 } 1603 1604 /* 1605 * Check if a vif number is legal/ok. This is used by in_mcast.c. 1606 */ 1607 static int 1608 X_legal_vif_num(int vif) 1609 { 1610 int ret; 1611 1612 ret = 0; 1613 if (vif < 0) 1614 return (ret); 1615 1616 VIF_LOCK(); 1617 if (vif < numvifs) 1618 ret = 1; 1619 VIF_UNLOCK(); 1620 1621 return (ret); 1622 } 1623 1624 /* 1625 * Return the local address used by this vif 1626 */ 1627 static u_long 1628 X_ip_mcast_src(int vifi) 1629 { 1630 in_addr_t addr; 1631 1632 addr = INADDR_ANY; 1633 if (vifi < 0) 1634 return (addr); 1635 1636 VIF_LOCK(); 1637 if (vifi < numvifs) 1638 addr = viftable[vifi].v_lcl_addr.s_addr; 1639 VIF_UNLOCK(); 1640 1641 return (addr); 1642 } 1643 1644 static void 1645 phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m) 1646 { 1647 struct mbuf *mb_copy; 1648 int hlen = ip->ip_hl << 2; 1649 1650 VIF_LOCK_ASSERT(); 1651 1652 /* 1653 * Make a new reference to the packet; make sure that 1654 * the IP header is actually copied, not just referenced, 1655 * so that ip_output() only scribbles on the copy. 
1656 */ 1657 mb_copy = m_copypacket(m, M_DONTWAIT); 1658 if (mb_copy && (M_HASCL(mb_copy) || mb_copy->m_len < hlen)) 1659 mb_copy = m_pullup(mb_copy, hlen); 1660 if (mb_copy == NULL) 1661 return; 1662 1663 send_packet(vifp, mb_copy); 1664 } 1665 1666 static void 1667 send_packet(struct vif *vifp, struct mbuf *m) 1668 { 1669 struct ip_moptions imo; 1670 struct in_multi *imm[2]; 1671 int error; 1672 1673 VIF_LOCK_ASSERT(); 1674 1675 imo.imo_multicast_ifp = vifp->v_ifp; 1676 imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1; 1677 imo.imo_multicast_loop = 1; 1678 imo.imo_multicast_vif = -1; 1679 imo.imo_num_memberships = 0; 1680 imo.imo_max_memberships = 2; 1681 imo.imo_membership = &imm[0]; 1682 1683 /* 1684 * Re-entrancy should not be a problem here, because 1685 * the packets that we send out and are looped back at us 1686 * should get rejected because they appear to come from 1687 * the loopback interface, thus preventing looping. 1688 */ 1689 error = ip_output(m, NULL, &vifp->v_route, IP_FORWARDING, &imo, NULL); 1690 CTR3(KTR_IPMF, "%s: vif %td err %d", __func__, 1691 (ptrdiff_t)(vifp - viftable), error); 1692 } 1693 1694 /* 1695 * Stubs for old RSVP socket shim implementation. 
1696 */ 1697 1698 static int 1699 X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused) 1700 { 1701 1702 return (EOPNOTSUPP); 1703 } 1704 1705 static void 1706 X_ip_rsvp_force_done(struct socket *so __unused) 1707 { 1708 1709 } 1710 1711 static void 1712 X_rsvp_input(struct mbuf *m, int off __unused) 1713 { 1714 1715 if (!V_rsvp_on) 1716 m_freem(m); 1717 } 1718 1719 /* 1720 * Code for bandwidth monitors 1721 */ 1722 1723 /* 1724 * Define common interface for timeval-related methods 1725 */ 1726 #define BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp) 1727 #define BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp)) 1728 #define BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp)) 1729 1730 static uint32_t 1731 compute_bw_meter_flags(struct bw_upcall *req) 1732 { 1733 uint32_t flags = 0; 1734 1735 if (req->bu_flags & BW_UPCALL_UNIT_PACKETS) 1736 flags |= BW_METER_UNIT_PACKETS; 1737 if (req->bu_flags & BW_UPCALL_UNIT_BYTES) 1738 flags |= BW_METER_UNIT_BYTES; 1739 if (req->bu_flags & BW_UPCALL_GEQ) 1740 flags |= BW_METER_GEQ; 1741 if (req->bu_flags & BW_UPCALL_LEQ) 1742 flags |= BW_METER_LEQ; 1743 1744 return flags; 1745 } 1746 1747 /* 1748 * Add a bw_meter entry 1749 */ 1750 static int 1751 add_bw_upcall(struct bw_upcall *req) 1752 { 1753 struct mfc *mfc; 1754 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC, 1755 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC }; 1756 struct timeval now; 1757 struct bw_meter *x; 1758 uint32_t flags; 1759 1760 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 1761 return EOPNOTSUPP; 1762 1763 /* Test if the flags are valid */ 1764 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES))) 1765 return EINVAL; 1766 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ))) 1767 return EINVAL; 1768 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 1769 == (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) 1770 return EINVAL; 1771 1772 /* Test if the threshold time interval is valid */ 1773 if 
(BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <)) 1774 return EINVAL; 1775 1776 flags = compute_bw_meter_flags(req); 1777 1778 /* 1779 * Find if we have already same bw_meter entry 1780 */ 1781 MFC_LOCK(); 1782 mfc = mfc_find(&req->bu_src, &req->bu_dst); 1783 if (mfc == NULL) { 1784 MFC_UNLOCK(); 1785 return EADDRNOTAVAIL; 1786 } 1787 for (x = mfc->mfc_bw_meter; x != NULL; x = x->bm_mfc_next) { 1788 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 1789 &req->bu_threshold.b_time, ==)) && 1790 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 1791 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 1792 (x->bm_flags & BW_METER_USER_FLAGS) == flags) { 1793 MFC_UNLOCK(); 1794 return 0; /* XXX Already installed */ 1795 } 1796 } 1797 1798 /* Allocate the new bw_meter entry */ 1799 x = (struct bw_meter *)malloc(sizeof(*x), M_BWMETER, M_NOWAIT); 1800 if (x == NULL) { 1801 MFC_UNLOCK(); 1802 return ENOBUFS; 1803 } 1804 1805 /* Set the new bw_meter entry */ 1806 x->bm_threshold.b_time = req->bu_threshold.b_time; 1807 microtime(&now); 1808 x->bm_start_time = now; 1809 x->bm_threshold.b_packets = req->bu_threshold.b_packets; 1810 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes; 1811 x->bm_measured.b_packets = 0; 1812 x->bm_measured.b_bytes = 0; 1813 x->bm_flags = flags; 1814 x->bm_time_next = NULL; 1815 x->bm_time_hash = BW_METER_BUCKETS; 1816 1817 /* Add the new bw_meter entry to the front of entries for this MFC */ 1818 x->bm_mfc = mfc; 1819 x->bm_mfc_next = mfc->mfc_bw_meter; 1820 mfc->mfc_bw_meter = x; 1821 schedule_bw_meter(x, &now); 1822 MFC_UNLOCK(); 1823 1824 return 0; 1825 } 1826 1827 static void 1828 free_bw_list(struct bw_meter *list) 1829 { 1830 while (list != NULL) { 1831 struct bw_meter *x = list; 1832 1833 list = list->bm_mfc_next; 1834 unschedule_bw_meter(x); 1835 free(x, M_BWMETER); 1836 } 1837 } 1838 1839 /* 1840 * Delete one or multiple bw_meter entries 1841 */ 1842 static int 1843 del_bw_upcall(struct bw_upcall *req) 1844 { 1845 struct 
mfc *mfc; 1846 struct bw_meter *x; 1847 1848 if (!(mrt_api_config & MRT_MFC_BW_UPCALL)) 1849 return EOPNOTSUPP; 1850 1851 MFC_LOCK(); 1852 1853 /* Find the corresponding MFC entry */ 1854 mfc = mfc_find(&req->bu_src, &req->bu_dst); 1855 if (mfc == NULL) { 1856 MFC_UNLOCK(); 1857 return EADDRNOTAVAIL; 1858 } else if (req->bu_flags & BW_UPCALL_DELETE_ALL) { 1859 /* 1860 * Delete all bw_meter entries for this mfc 1861 */ 1862 struct bw_meter *list; 1863 1864 list = mfc->mfc_bw_meter; 1865 mfc->mfc_bw_meter = NULL; 1866 free_bw_list(list); 1867 MFC_UNLOCK(); 1868 return 0; 1869 } else { /* Delete a single bw_meter entry */ 1870 struct bw_meter *prev; 1871 uint32_t flags = 0; 1872 1873 flags = compute_bw_meter_flags(req); 1874 1875 /* Find the bw_meter entry to delete */ 1876 for (prev = NULL, x = mfc->mfc_bw_meter; x != NULL; 1877 prev = x, x = x->bm_mfc_next) { 1878 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time, 1879 &req->bu_threshold.b_time, ==)) && 1880 (x->bm_threshold.b_packets == req->bu_threshold.b_packets) && 1881 (x->bm_threshold.b_bytes == req->bu_threshold.b_bytes) && 1882 (x->bm_flags & BW_METER_USER_FLAGS) == flags) 1883 break; 1884 } 1885 if (x != NULL) { /* Delete entry from the list for this MFC */ 1886 if (prev != NULL) 1887 prev->bm_mfc_next = x->bm_mfc_next; /* remove from middle*/ 1888 else 1889 x->bm_mfc->mfc_bw_meter = x->bm_mfc_next;/* new head of list */ 1890 1891 unschedule_bw_meter(x); 1892 MFC_UNLOCK(); 1893 /* Free the bw_meter entry */ 1894 free(x, M_BWMETER); 1895 return 0; 1896 } else { 1897 MFC_UNLOCK(); 1898 return EINVAL; 1899 } 1900 } 1901 /* NOTREACHED */ 1902 } 1903 1904 /* 1905 * Perform bandwidth measurement processing that may result in an upcall 1906 */ 1907 static void 1908 bw_meter_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp) 1909 { 1910 struct timeval delta; 1911 1912 MFC_LOCK_ASSERT(); 1913 1914 delta = *nowp; 1915 BW_TIMEVALDECR(&delta, &x->bm_start_time); 1916 1917 if (x->bm_flags & BW_METER_GEQ) { 1918 
/* 1919 * Processing for ">=" type of bw_meter entry 1920 */ 1921 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 1922 /* Reset the bw_meter entry */ 1923 x->bm_start_time = *nowp; 1924 x->bm_measured.b_packets = 0; 1925 x->bm_measured.b_bytes = 0; 1926 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 1927 } 1928 1929 /* Record that a packet is received */ 1930 x->bm_measured.b_packets++; 1931 x->bm_measured.b_bytes += plen; 1932 1933 /* 1934 * Test if we should deliver an upcall 1935 */ 1936 if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) { 1937 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 1938 (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) || 1939 ((x->bm_flags & BW_METER_UNIT_BYTES) && 1940 (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) { 1941 /* Prepare an upcall for delivery */ 1942 bw_meter_prepare_upcall(x, nowp); 1943 x->bm_flags |= BW_METER_UPCALL_DELIVERED; 1944 } 1945 } 1946 } else if (x->bm_flags & BW_METER_LEQ) { 1947 /* 1948 * Processing for "<=" type of bw_meter entry 1949 */ 1950 if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) { 1951 /* 1952 * We are behind time with the multicast forwarding table 1953 * scanning for "<=" type of bw_meter entries, so test now 1954 * if we should deliver an upcall. 
1955 */ 1956 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 1957 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 1958 ((x->bm_flags & BW_METER_UNIT_BYTES) && 1959 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 1960 /* Prepare an upcall for delivery */ 1961 bw_meter_prepare_upcall(x, nowp); 1962 } 1963 /* Reschedule the bw_meter entry */ 1964 unschedule_bw_meter(x); 1965 schedule_bw_meter(x, nowp); 1966 } 1967 1968 /* Record that a packet is received */ 1969 x->bm_measured.b_packets++; 1970 x->bm_measured.b_bytes += plen; 1971 1972 /* 1973 * Test if we should restart the measuring interval 1974 */ 1975 if ((x->bm_flags & BW_METER_UNIT_PACKETS && 1976 x->bm_measured.b_packets <= x->bm_threshold.b_packets) || 1977 (x->bm_flags & BW_METER_UNIT_BYTES && 1978 x->bm_measured.b_bytes <= x->bm_threshold.b_bytes)) { 1979 /* Don't restart the measuring interval */ 1980 } else { 1981 /* Do restart the measuring interval */ 1982 /* 1983 * XXX: note that we don't unschedule and schedule, because this 1984 * might be too much overhead per packet. Instead, when we process 1985 * all entries for a given timer hash bin, we check whether it is 1986 * really a timeout. If not, we reschedule at that time. 
1987 */ 1988 x->bm_start_time = *nowp; 1989 x->bm_measured.b_packets = 0; 1990 x->bm_measured.b_bytes = 0; 1991 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 1992 } 1993 } 1994 } 1995 1996 /* 1997 * Prepare a bandwidth-related upcall 1998 */ 1999 static void 2000 bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp) 2001 { 2002 struct timeval delta; 2003 struct bw_upcall *u; 2004 2005 MFC_LOCK_ASSERT(); 2006 2007 /* 2008 * Compute the measured time interval 2009 */ 2010 delta = *nowp; 2011 BW_TIMEVALDECR(&delta, &x->bm_start_time); 2012 2013 /* 2014 * If there are too many pending upcalls, deliver them now 2015 */ 2016 if (bw_upcalls_n >= BW_UPCALLS_MAX) 2017 bw_upcalls_send(); 2018 2019 /* 2020 * Set the bw_upcall entry 2021 */ 2022 u = &bw_upcalls[bw_upcalls_n++]; 2023 u->bu_src = x->bm_mfc->mfc_origin; 2024 u->bu_dst = x->bm_mfc->mfc_mcastgrp; 2025 u->bu_threshold.b_time = x->bm_threshold.b_time; 2026 u->bu_threshold.b_packets = x->bm_threshold.b_packets; 2027 u->bu_threshold.b_bytes = x->bm_threshold.b_bytes; 2028 u->bu_measured.b_time = delta; 2029 u->bu_measured.b_packets = x->bm_measured.b_packets; 2030 u->bu_measured.b_bytes = x->bm_measured.b_bytes; 2031 u->bu_flags = 0; 2032 if (x->bm_flags & BW_METER_UNIT_PACKETS) 2033 u->bu_flags |= BW_UPCALL_UNIT_PACKETS; 2034 if (x->bm_flags & BW_METER_UNIT_BYTES) 2035 u->bu_flags |= BW_UPCALL_UNIT_BYTES; 2036 if (x->bm_flags & BW_METER_GEQ) 2037 u->bu_flags |= BW_UPCALL_GEQ; 2038 if (x->bm_flags & BW_METER_LEQ) 2039 u->bu_flags |= BW_UPCALL_LEQ; 2040 } 2041 2042 /* 2043 * Send the pending bandwidth-related upcalls 2044 */ 2045 static void 2046 bw_upcalls_send(void) 2047 { 2048 struct mbuf *m; 2049 int len = bw_upcalls_n * sizeof(bw_upcalls[0]); 2050 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2051 static struct igmpmsg igmpmsg = { 0, /* unused1 */ 2052 0, /* unused2 */ 2053 IGMPMSG_BW_UPCALL,/* im_msgtype */ 2054 0, /* im_mbz */ 2055 0, /* im_vif */ 2056 0, /* unused3 */ 2057 { 0 }, /* 
im_src */ 2058 { 0 } }; /* im_dst */ 2059 2060 MFC_LOCK_ASSERT(); 2061 2062 if (bw_upcalls_n == 0) 2063 return; /* No pending upcalls */ 2064 2065 bw_upcalls_n = 0; 2066 2067 /* 2068 * Allocate a new mbuf, initialize it with the header and 2069 * the payload for the pending calls. 2070 */ 2071 MGETHDR(m, M_DONTWAIT, MT_DATA); 2072 if (m == NULL) { 2073 log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n"); 2074 return; 2075 } 2076 2077 m->m_len = m->m_pkthdr.len = 0; 2078 m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg); 2079 m_copyback(m, sizeof(struct igmpmsg), len, (caddr_t)&bw_upcalls[0]); 2080 2081 /* 2082 * Send the upcalls 2083 * XXX do we need to set the address in k_igmpsrc ? 2084 */ 2085 MRTSTAT_INC(mrts_upcalls); 2086 if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) { 2087 log(LOG_WARNING, "bw_upcalls_send: ip_mrouter socket queue full\n"); 2088 MRTSTAT_INC(mrts_upq_sockfull); 2089 } 2090 } 2091 2092 /* 2093 * Compute the timeout hash value for the bw_meter entries 2094 */ 2095 #define BW_METER_TIMEHASH(bw_meter, hash) \ 2096 do { \ 2097 struct timeval next_timeval = (bw_meter)->bm_start_time; \ 2098 \ 2099 BW_TIMEVALADD(&next_timeval, &(bw_meter)->bm_threshold.b_time); \ 2100 (hash) = next_timeval.tv_sec; \ 2101 if (next_timeval.tv_usec) \ 2102 (hash)++; /* XXX: make sure we don't timeout early */ \ 2103 (hash) %= BW_METER_BUCKETS; \ 2104 } while (0) 2105 2106 /* 2107 * Schedule a timer to process periodically bw_meter entry of type "<=" 2108 * by linking the entry in the proper hash bucket. 
2109 */ 2110 static void 2111 schedule_bw_meter(struct bw_meter *x, struct timeval *nowp) 2112 { 2113 int time_hash; 2114 2115 MFC_LOCK_ASSERT(); 2116 2117 if (!(x->bm_flags & BW_METER_LEQ)) 2118 return; /* XXX: we schedule timers only for "<=" entries */ 2119 2120 /* 2121 * Reset the bw_meter entry 2122 */ 2123 x->bm_start_time = *nowp; 2124 x->bm_measured.b_packets = 0; 2125 x->bm_measured.b_bytes = 0; 2126 x->bm_flags &= ~BW_METER_UPCALL_DELIVERED; 2127 2128 /* 2129 * Compute the timeout hash value and insert the entry 2130 */ 2131 BW_METER_TIMEHASH(x, time_hash); 2132 x->bm_time_next = bw_meter_timers[time_hash]; 2133 bw_meter_timers[time_hash] = x; 2134 x->bm_time_hash = time_hash; 2135 } 2136 2137 /* 2138 * Unschedule the periodic timer that processes bw_meter entry of type "<=" 2139 * by removing the entry from the proper hash bucket. 2140 */ 2141 static void 2142 unschedule_bw_meter(struct bw_meter *x) 2143 { 2144 int time_hash; 2145 struct bw_meter *prev, *tmp; 2146 2147 MFC_LOCK_ASSERT(); 2148 2149 if (!(x->bm_flags & BW_METER_LEQ)) 2150 return; /* XXX: we schedule timers only for "<=" entries */ 2151 2152 /* 2153 * Compute the timeout hash value and delete the entry 2154 */ 2155 time_hash = x->bm_time_hash; 2156 if (time_hash >= BW_METER_BUCKETS) 2157 return; /* Entry was not scheduled */ 2158 2159 for (prev = NULL, tmp = bw_meter_timers[time_hash]; 2160 tmp != NULL; prev = tmp, tmp = tmp->bm_time_next) 2161 if (tmp == x) 2162 break; 2163 2164 if (tmp == NULL) 2165 panic("unschedule_bw_meter: bw_meter entry not found"); 2166 2167 if (prev != NULL) 2168 prev->bm_time_next = x->bm_time_next; 2169 else 2170 bw_meter_timers[time_hash] = x->bm_time_next; 2171 2172 x->bm_time_next = NULL; 2173 x->bm_time_hash = BW_METER_BUCKETS; 2174 } 2175 2176 2177 /* 2178 * Process all "<=" type of bw_meter that should be processed now, 2179 * and for each entry prepare an upcall if necessary. Each processed 2180 * entry is rescheduled again for the (periodic) processing. 
2181 * 2182 * This is run periodically (once per second normally). On each round, 2183 * all the potentially matching entries are in the hash slot that we are 2184 * looking at. 2185 */ 2186 static void 2187 bw_meter_process() 2188 { 2189 static uint32_t last_tv_sec; /* last time we processed this */ 2190 2191 uint32_t loops; 2192 int i; 2193 struct timeval now, process_endtime; 2194 2195 microtime(&now); 2196 if (last_tv_sec == now.tv_sec) 2197 return; /* nothing to do */ 2198 2199 loops = now.tv_sec - last_tv_sec; 2200 last_tv_sec = now.tv_sec; 2201 if (loops > BW_METER_BUCKETS) 2202 loops = BW_METER_BUCKETS; 2203 2204 MFC_LOCK(); 2205 /* 2206 * Process all bins of bw_meter entries from the one after the last 2207 * processed to the current one. On entry, i points to the last bucket 2208 * visited, so we need to increment i at the beginning of the loop. 2209 */ 2210 for (i = (now.tv_sec - loops) % BW_METER_BUCKETS; loops > 0; loops--) { 2211 struct bw_meter *x, *tmp_list; 2212 2213 if (++i >= BW_METER_BUCKETS) 2214 i = 0; 2215 2216 /* Disconnect the list of bw_meter entries from the bin */ 2217 tmp_list = bw_meter_timers[i]; 2218 bw_meter_timers[i] = NULL; 2219 2220 /* Process the list of bw_meter entries */ 2221 while (tmp_list != NULL) { 2222 x = tmp_list; 2223 tmp_list = tmp_list->bm_time_next; 2224 2225 /* Test if the time interval is over */ 2226 process_endtime = x->bm_start_time; 2227 BW_TIMEVALADD(&process_endtime, &x->bm_threshold.b_time); 2228 if (BW_TIMEVALCMP(&process_endtime, &now, >)) { 2229 /* Not yet: reschedule, but don't reset */ 2230 int time_hash; 2231 2232 BW_METER_TIMEHASH(x, time_hash); 2233 if (time_hash == i && process_endtime.tv_sec == now.tv_sec) { 2234 /* 2235 * XXX: somehow the bin processing is a bit ahead of time. 2236 * Put the entry in the next bin. 
2237 */ 2238 if (++time_hash >= BW_METER_BUCKETS) 2239 time_hash = 0; 2240 } 2241 x->bm_time_next = bw_meter_timers[time_hash]; 2242 bw_meter_timers[time_hash] = x; 2243 x->bm_time_hash = time_hash; 2244 2245 continue; 2246 } 2247 2248 /* 2249 * Test if we should deliver an upcall 2250 */ 2251 if (((x->bm_flags & BW_METER_UNIT_PACKETS) && 2252 (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) || 2253 ((x->bm_flags & BW_METER_UNIT_BYTES) && 2254 (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) { 2255 /* Prepare an upcall for delivery */ 2256 bw_meter_prepare_upcall(x, &now); 2257 } 2258 2259 /* 2260 * Reschedule for next processing 2261 */ 2262 schedule_bw_meter(x, &now); 2263 } 2264 } 2265 2266 /* Send all upcalls that are pending delivery */ 2267 bw_upcalls_send(); 2268 2269 MFC_UNLOCK(); 2270 } 2271 2272 /* 2273 * A periodic function for sending all upcalls that are pending delivery 2274 */ 2275 static void 2276 expire_bw_upcalls_send(void *unused) 2277 { 2278 MFC_LOCK(); 2279 bw_upcalls_send(); 2280 MFC_UNLOCK(); 2281 2282 callout_reset(&bw_upcalls_ch, BW_UPCALLS_PERIOD, 2283 expire_bw_upcalls_send, NULL); 2284 } 2285 2286 /* 2287 * A periodic function for periodic scanning of the multicast forwarding 2288 * table for processing all "<=" bw_meter entries. 2289 */ 2290 static void 2291 expire_bw_meter_process(void *unused) 2292 { 2293 if (mrt_api_config & MRT_MFC_BW_UPCALL) 2294 bw_meter_process(); 2295 2296 callout_reset(&bw_meter_ch, BW_METER_PERIOD, expire_bw_meter_process, NULL); 2297 } 2298 2299 /* 2300 * End of bandwidth monitoring code 2301 */ 2302 2303 /* 2304 * Send the packet up to the user daemon, or eventually do kernel encapsulation 2305 * 2306 */ 2307 static int 2308 pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m, 2309 struct mfc *rt) 2310 { 2311 struct mbuf *mb_copy, *mm; 2312 2313 /* 2314 * Do not send IGMP_WHOLEPKT notifications to userland, if the 2315 * rendezvous point was unspecified, and we were told not to. 
2316 */ 2317 if (pim_squelch_wholepkt != 0 && (mrt_api_config & MRT_MFC_RP) && 2318 in_nullhost(rt->mfc_rp)) 2319 return 0; 2320 2321 mb_copy = pim_register_prepare(ip, m); 2322 if (mb_copy == NULL) 2323 return ENOBUFS; 2324 2325 /* 2326 * Send all the fragments. Note that the mbuf for each fragment 2327 * is freed by the sending machinery. 2328 */ 2329 for (mm = mb_copy; mm; mm = mb_copy) { 2330 mb_copy = mm->m_nextpkt; 2331 mm->m_nextpkt = 0; 2332 mm = m_pullup(mm, sizeof(struct ip)); 2333 if (mm != NULL) { 2334 ip = mtod(mm, struct ip *); 2335 if ((mrt_api_config & MRT_MFC_RP) && !in_nullhost(rt->mfc_rp)) { 2336 pim_register_send_rp(ip, vifp, mm, rt); 2337 } else { 2338 pim_register_send_upcall(ip, vifp, mm, rt); 2339 } 2340 } 2341 } 2342 2343 return 0; 2344 } 2345 2346 /* 2347 * Return a copy of the data packet that is ready for PIM Register 2348 * encapsulation. 2349 * XXX: Note that in the returned copy the IP header is a valid one. 2350 */ 2351 static struct mbuf * 2352 pim_register_prepare(struct ip *ip, struct mbuf *m) 2353 { 2354 struct mbuf *mb_copy = NULL; 2355 int mtu; 2356 2357 /* Take care of delayed checksums */ 2358 if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) { 2359 in_delayed_cksum(m); 2360 m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA; 2361 } 2362 2363 /* 2364 * Copy the old packet & pullup its IP header into the 2365 * new mbuf so we can modify it. 
2366 */ 2367 mb_copy = m_copypacket(m, M_DONTWAIT); 2368 if (mb_copy == NULL) 2369 return NULL; 2370 mb_copy = m_pullup(mb_copy, ip->ip_hl << 2); 2371 if (mb_copy == NULL) 2372 return NULL; 2373 2374 /* take care of the TTL */ 2375 ip = mtod(mb_copy, struct ip *); 2376 --ip->ip_ttl; 2377 2378 /* Compute the MTU after the PIM Register encapsulation */ 2379 mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr); 2380 2381 if (ip->ip_len <= mtu) { 2382 /* Turn the IP header into a valid one */ 2383 ip->ip_len = htons(ip->ip_len); 2384 ip->ip_off = htons(ip->ip_off); 2385 ip->ip_sum = 0; 2386 ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2); 2387 } else { 2388 /* Fragment the packet */ 2389 if (ip_fragment(ip, &mb_copy, mtu, 0, CSUM_DELAY_IP) != 0) { 2390 m_freem(mb_copy); 2391 return NULL; 2392 } 2393 } 2394 return mb_copy; 2395 } 2396 2397 /* 2398 * Send an upcall with the data packet to the user-level process. 2399 */ 2400 static int 2401 pim_register_send_upcall(struct ip *ip, struct vif *vifp, 2402 struct mbuf *mb_copy, struct mfc *rt) 2403 { 2404 struct mbuf *mb_first; 2405 int len = ntohs(ip->ip_len); 2406 struct igmpmsg *im; 2407 struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET }; 2408 2409 VIF_LOCK_ASSERT(); 2410 2411 /* 2412 * Add a new mbuf with an upcall header 2413 */ 2414 MGETHDR(mb_first, M_DONTWAIT, MT_DATA); 2415 if (mb_first == NULL) { 2416 m_freem(mb_copy); 2417 return ENOBUFS; 2418 } 2419 mb_first->m_data += max_linkhdr; 2420 mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg); 2421 mb_first->m_len = sizeof(struct igmpmsg); 2422 mb_first->m_next = mb_copy; 2423 2424 /* Send message to routing daemon */ 2425 im = mtod(mb_first, struct igmpmsg *); 2426 im->im_msgtype = IGMPMSG_WHOLEPKT; 2427 im->im_mbz = 0; 2428 im->im_vif = vifp - viftable; 2429 im->im_src = ip->ip_src; 2430 im->im_dst = ip->ip_dst; 2431 2432 k_igmpsrc.sin_addr = ip->ip_src; 2433 2434 MRTSTAT_INC(mrts_upcalls); 2435 2436 if (socket_send(V_ip_mrouter, mb_first, 
&k_igmpsrc) < 0) { 2437 CTR1(KTR_IPMF, "%s: socket queue full", __func__); 2438 MRTSTAT_INC(mrts_upq_sockfull); 2439 return ENOBUFS; 2440 } 2441 2442 /* Keep statistics */ 2443 PIMSTAT_INC(pims_snd_registers_msgs); 2444 PIMSTAT_ADD(pims_snd_registers_bytes, len); 2445 2446 return 0; 2447 } 2448 2449 /* 2450 * Encapsulate the data packet in PIM Register message and send it to the RP. 2451 */ 2452 static int 2453 pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy, 2454 struct mfc *rt) 2455 { 2456 struct mbuf *mb_first; 2457 struct ip *ip_outer; 2458 struct pim_encap_pimhdr *pimhdr; 2459 int len = ntohs(ip->ip_len); 2460 vifi_t vifi = rt->mfc_parent; 2461 2462 VIF_LOCK_ASSERT(); 2463 2464 if ((vifi >= numvifs) || in_nullhost(viftable[vifi].v_lcl_addr)) { 2465 m_freem(mb_copy); 2466 return EADDRNOTAVAIL; /* The iif vif is invalid */ 2467 } 2468 2469 /* 2470 * Add a new mbuf with the encapsulating header 2471 */ 2472 MGETHDR(mb_first, M_DONTWAIT, MT_DATA); 2473 if (mb_first == NULL) { 2474 m_freem(mb_copy); 2475 return ENOBUFS; 2476 } 2477 mb_first->m_data += max_linkhdr; 2478 mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 2479 mb_first->m_next = mb_copy; 2480 2481 mb_first->m_pkthdr.len = len + mb_first->m_len; 2482 2483 /* 2484 * Fill in the encapsulating IP and PIM header 2485 */ 2486 ip_outer = mtod(mb_first, struct ip *); 2487 *ip_outer = pim_encap_iphdr; 2488 ip_outer->ip_id = ip_newid(); 2489 ip_outer->ip_len = len + sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr); 2490 ip_outer->ip_src = viftable[vifi].v_lcl_addr; 2491 ip_outer->ip_dst = rt->mfc_rp; 2492 /* 2493 * Copy the inner header TOS to the outer header, and take care of the 2494 * IP_DF bit. 
2495 */ 2496 ip_outer->ip_tos = ip->ip_tos; 2497 if (ntohs(ip->ip_off) & IP_DF) 2498 ip_outer->ip_off |= IP_DF; 2499 pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer 2500 + sizeof(pim_encap_iphdr)); 2501 *pimhdr = pim_encap_pimhdr; 2502 /* If the iif crosses a border, set the Border-bit */ 2503 if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & mrt_api_config) 2504 pimhdr->flags |= htonl(PIM_BORDER_REGISTER); 2505 2506 mb_first->m_data += sizeof(pim_encap_iphdr); 2507 pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr)); 2508 mb_first->m_data -= sizeof(pim_encap_iphdr); 2509 2510 send_packet(vifp, mb_first); 2511 2512 /* Keep statistics */ 2513 PIMSTAT_INC(pims_snd_registers_msgs); 2514 PIMSTAT_ADD(pims_snd_registers_bytes, len); 2515 2516 return 0; 2517 } 2518 2519 /* 2520 * pim_encapcheck() is called by the encap4_input() path at runtime to 2521 * determine if a packet is for PIM; allowing PIM to be dynamically loaded 2522 * into the kernel. 2523 */ 2524 static int 2525 pim_encapcheck(const struct mbuf *m, int off, int proto, void *arg) 2526 { 2527 2528 #ifdef DIAGNOSTIC 2529 KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM")); 2530 #endif 2531 if (proto != IPPROTO_PIM) 2532 return 0; /* not for us; reject the datagram. */ 2533 2534 return 64; /* claim the datagram. */ 2535 } 2536 2537 /* 2538 * PIM-SMv2 and PIM-DM messages processing. 2539 * Receives and verifies the PIM control messages, and passes them 2540 * up to the listening socket, using rip_input(). 2541 * The only message with special processing is the PIM_REGISTER message 2542 * (used by PIM-SM): the PIM header is stripped off, and the inner packet 2543 * is passed to if_simloop(). 
2544 */ 2545 void 2546 pim_input(struct mbuf *m, int off) 2547 { 2548 struct ip *ip = mtod(m, struct ip *); 2549 struct pim *pim; 2550 int minlen; 2551 int datalen = ip->ip_len; 2552 int ip_tos; 2553 int iphlen = off; 2554 2555 /* Keep statistics */ 2556 PIMSTAT_INC(pims_rcv_total_msgs); 2557 PIMSTAT_ADD(pims_rcv_total_bytes, datalen); 2558 2559 /* 2560 * Validate lengths 2561 */ 2562 if (datalen < PIM_MINLEN) { 2563 PIMSTAT_INC(pims_rcv_tooshort); 2564 CTR3(KTR_IPMF, "%s: short packet (%d) from %s", 2565 __func__, datalen, inet_ntoa(ip->ip_src)); 2566 m_freem(m); 2567 return; 2568 } 2569 2570 /* 2571 * If the packet is at least as big as a REGISTER, go agead 2572 * and grab the PIM REGISTER header size, to avoid another 2573 * possible m_pullup() later. 2574 * 2575 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8 2576 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28 2577 */ 2578 minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN); 2579 /* 2580 * Get the IP and PIM headers in contiguous memory, and 2581 * possibly the PIM REGISTER header. 2582 */ 2583 if ((m->m_flags & M_EXT || m->m_len < minlen) && 2584 (m = m_pullup(m, minlen)) == 0) { 2585 CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__); 2586 return; 2587 } 2588 2589 /* m_pullup() may have given us a new mbuf so reset ip. */ 2590 ip = mtod(m, struct ip *); 2591 ip_tos = ip->ip_tos; 2592 2593 /* adjust mbuf to point to the PIM header */ 2594 m->m_data += iphlen; 2595 m->m_len -= iphlen; 2596 pim = mtod(m, struct pim *); 2597 2598 /* 2599 * Validate checksum. If PIM REGISTER, exclude the data packet. 2600 * 2601 * XXX: some older PIMv2 implementations don't make this distinction, 2602 * so for compatibility reason perform the checksum over part of the 2603 * message, and if error, then over the whole message. 
2604 */ 2605 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER && in_cksum(m, PIM_MINLEN) == 0) { 2606 /* do nothing, checksum okay */ 2607 } else if (in_cksum(m, datalen)) { 2608 PIMSTAT_INC(pims_rcv_badsum); 2609 CTR1(KTR_IPMF, "%s: invalid checksum", __func__); 2610 m_freem(m); 2611 return; 2612 } 2613 2614 /* PIM version check */ 2615 if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) { 2616 PIMSTAT_INC(pims_rcv_badversion); 2617 CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__, 2618 (int)PIM_VT_V(pim->pim_vt), PIM_VERSION); 2619 m_freem(m); 2620 return; 2621 } 2622 2623 /* restore mbuf back to the outer IP */ 2624 m->m_data -= iphlen; 2625 m->m_len += iphlen; 2626 2627 if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) { 2628 /* 2629 * Since this is a REGISTER, we'll make a copy of the register 2630 * headers ip + pim + u_int32 + encap_ip, to be passed up to the 2631 * routing daemon. 2632 */ 2633 struct sockaddr_in dst = { sizeof(dst), AF_INET }; 2634 struct mbuf *mcp; 2635 struct ip *encap_ip; 2636 u_int32_t *reghdr; 2637 struct ifnet *vifp; 2638 2639 VIF_LOCK(); 2640 if ((reg_vif_num >= numvifs) || (reg_vif_num == VIFI_INVALID)) { 2641 VIF_UNLOCK(); 2642 CTR2(KTR_IPMF, "%s: register vif not set: %d", __func__, 2643 (int)reg_vif_num); 2644 m_freem(m); 2645 return; 2646 } 2647 /* XXX need refcnt? 
*/ 2648 vifp = viftable[reg_vif_num].v_ifp; 2649 VIF_UNLOCK(); 2650 2651 /* 2652 * Validate length 2653 */ 2654 if (datalen < PIM_REG_MINLEN) { 2655 PIMSTAT_INC(pims_rcv_tooshort); 2656 PIMSTAT_INC(pims_rcv_badregisters); 2657 CTR1(KTR_IPMF, "%s: register packet size too small", __func__); 2658 m_freem(m); 2659 return; 2660 } 2661 2662 reghdr = (u_int32_t *)(pim + 1); 2663 encap_ip = (struct ip *)(reghdr + 1); 2664 2665 CTR3(KTR_IPMF, "%s: register: encap ip src %s len %d", 2666 __func__, inet_ntoa(encap_ip->ip_src), ntohs(encap_ip->ip_len)); 2667 2668 /* verify the version number of the inner packet */ 2669 if (encap_ip->ip_v != IPVERSION) { 2670 PIMSTAT_INC(pims_rcv_badregisters); 2671 CTR1(KTR_IPMF, "%s: bad encap ip version", __func__); 2672 m_freem(m); 2673 return; 2674 } 2675 2676 /* verify the inner packet is destined to a mcast group */ 2677 if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) { 2678 PIMSTAT_INC(pims_rcv_badregisters); 2679 CTR2(KTR_IPMF, "%s: bad encap ip dest %s", __func__, 2680 inet_ntoa(encap_ip->ip_dst)); 2681 m_freem(m); 2682 return; 2683 } 2684 2685 /* If a NULL_REGISTER, pass it to the daemon */ 2686 if ((ntohl(*reghdr) & PIM_NULL_REGISTER)) 2687 goto pim_input_to_daemon; 2688 2689 /* 2690 * Copy the TOS from the outer IP header to the inner IP header. 2691 */ 2692 if (encap_ip->ip_tos != ip_tos) { 2693 /* Outer TOS -> inner TOS */ 2694 encap_ip->ip_tos = ip_tos; 2695 /* Recompute the inner header checksum. Sigh... */ 2696 2697 /* adjust mbuf to point to the inner IP header */ 2698 m->m_data += (iphlen + PIM_MINLEN); 2699 m->m_len -= (iphlen + PIM_MINLEN); 2700 2701 encap_ip->ip_sum = 0; 2702 encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2); 2703 2704 /* restore mbuf to point back to the outer IP header */ 2705 m->m_data -= (iphlen + PIM_MINLEN); 2706 m->m_len += (iphlen + PIM_MINLEN); 2707 } 2708 2709 /* 2710 * Decapsulate the inner IP packet and loopback to forward it 2711 * as a normal multicast packet. 
Also, make a copy of the 2712 * outer_iphdr + pimhdr + reghdr + encap_iphdr 2713 * to pass to the daemon later, so it can take the appropriate 2714 * actions (e.g., send back PIM_REGISTER_STOP). 2715 * XXX: here m->m_data points to the outer IP header. 2716 */ 2717 mcp = m_copy(m, 0, iphlen + PIM_REG_MINLEN); 2718 if (mcp == NULL) { 2719 CTR1(KTR_IPMF, "%s: m_copy() failed", __func__); 2720 m_freem(m); 2721 return; 2722 } 2723 2724 /* Keep statistics */ 2725 /* XXX: registers_bytes include only the encap. mcast pkt */ 2726 PIMSTAT_INC(pims_rcv_registers_msgs); 2727 PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len)); 2728 2729 /* 2730 * forward the inner ip packet; point m_data at the inner ip. 2731 */ 2732 m_adj(m, iphlen + PIM_MINLEN); 2733 2734 CTR4(KTR_IPMF, 2735 "%s: forward decap'd REGISTER: src %lx dst %lx vif %d", 2736 __func__, 2737 (u_long)ntohl(encap_ip->ip_src.s_addr), 2738 (u_long)ntohl(encap_ip->ip_dst.s_addr), 2739 (int)reg_vif_num); 2740 2741 /* NB: vifp was collected above; can it change on us? */ 2742 if_simloop(vifp, m, dst.sin_family, 0); 2743 2744 /* prepare the register head to send to the mrouting daemon */ 2745 m = mcp; 2746 } 2747 2748 pim_input_to_daemon: 2749 /* 2750 * Pass the PIM message up to the daemon; if it is a Register message, 2751 * pass the 'head' only up to the daemon. This includes the 2752 * outer IP header, PIM header, PIM-Register header and the 2753 * inner IP header. 2754 * XXX: the outer IP header pkt size of a Register is not adjust to 2755 * reflect the fact that the inner multicast data is truncated. 
2756 */ 2757 rip_input(m, iphlen); 2758 2759 return; 2760 } 2761 2762 static int 2763 sysctl_mfctable(SYSCTL_HANDLER_ARGS) 2764 { 2765 struct mfc *rt; 2766 int error, i; 2767 2768 if (req->newptr) 2769 return (EPERM); 2770 if (mfchashtbl == NULL) /* XXX unlocked */ 2771 return (0); 2772 error = sysctl_wire_old_buffer(req, 0); 2773 if (error) 2774 return (error); 2775 2776 MFC_LOCK(); 2777 for (i = 0; i < mfchashsize; i++) { 2778 LIST_FOREACH(rt, &mfchashtbl[i], mfc_hash) { 2779 error = SYSCTL_OUT(req, rt, sizeof(struct mfc)); 2780 if (error) 2781 goto out_locked; 2782 } 2783 } 2784 out_locked: 2785 MFC_UNLOCK(); 2786 return (error); 2787 } 2788 2789 SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable, CTLFLAG_RD, sysctl_mfctable, 2790 "IPv4 Multicast Forwarding Table (struct *mfc[mfchashsize], " 2791 "netinet/ip_mroute.h)"); 2792 2793 static int 2794 ip_mroute_modevent(module_t mod, int type, void *unused) 2795 { 2796 2797 switch (type) { 2798 case MOD_LOAD: 2799 MROUTER_LOCK_INIT(); 2800 MFC_LOCK_INIT(); 2801 VIF_LOCK_INIT(); 2802 2803 mfchashsize = MFCHASHSIZE; 2804 if (TUNABLE_ULONG_FETCH("net.inet.ip.mfchashsize", &mfchashsize) && 2805 !powerof2(mfchashsize)) { 2806 printf("WARNING: %s not a power of 2; using default\n", 2807 "net.inet.ip.mfchashsize"); 2808 mfchashsize = MFCHASHSIZE; 2809 } 2810 MALLOC(nexpire, u_char *, mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO); 2811 2812 pim_squelch_wholepkt = 0; 2813 TUNABLE_ULONG_FETCH("net.inet.pim.squelch_wholepkt", 2814 &pim_squelch_wholepkt); 2815 ip_mrouter_reset(); 2816 2817 pim_encap_cookie = encap_attach_func(AF_INET, IPPROTO_PIM, 2818 pim_encapcheck, &in_pim_protosw, NULL); 2819 if (pim_encap_cookie == NULL) { 2820 printf("ip_mroute: unable to attach pim encap\n"); 2821 VIF_LOCK_DESTROY(); 2822 MFC_LOCK_DESTROY(); 2823 MROUTER_LOCK_DESTROY(); 2824 return (EINVAL); 2825 } 2826 2827 ip_mcast_src = X_ip_mcast_src; 2828 ip_mforward = X_ip_mforward; 2829 ip_mrouter_done = X_ip_mrouter_done; 2830 ip_mrouter_get = 
X_ip_mrouter_get; 2831 ip_mrouter_set = X_ip_mrouter_set; 2832 2833 ip_rsvp_force_done = X_ip_rsvp_force_done; 2834 ip_rsvp_vif = X_ip_rsvp_vif; 2835 2836 legal_vif_num = X_legal_vif_num; 2837 mrt_ioctl = X_mrt_ioctl; 2838 rsvp_input_p = X_rsvp_input; 2839 break; 2840 2841 case MOD_UNLOAD: 2842 /* 2843 * Typically module unload happens after the user-level 2844 * process has shutdown the kernel services (the check 2845 * below insures someone can't just yank the module out 2846 * from under a running process). But if the module is 2847 * just loaded and then unloaded w/o starting up a user 2848 * process we still need to cleanup. 2849 */ 2850 if (V_ip_mrouter != NULL) 2851 return (EINVAL); 2852 2853 if (pim_encap_cookie) { 2854 encap_detach(pim_encap_cookie); 2855 pim_encap_cookie = NULL; 2856 } 2857 X_ip_mrouter_done(); 2858 2859 FREE(nexpire, M_MRTABLE); 2860 nexpire = NULL; 2861 2862 ip_mcast_src = NULL; 2863 ip_mforward = NULL; 2864 ip_mrouter_done = NULL; 2865 ip_mrouter_get = NULL; 2866 ip_mrouter_set = NULL; 2867 2868 ip_rsvp_force_done = NULL; 2869 ip_rsvp_vif = NULL; 2870 2871 legal_vif_num = NULL; 2872 mrt_ioctl = NULL; 2873 rsvp_input_p = NULL; 2874 2875 VIF_LOCK_DESTROY(); 2876 MFC_LOCK_DESTROY(); 2877 MROUTER_LOCK_DESTROY(); 2878 break; 2879 2880 default: 2881 return EOPNOTSUPP; 2882 } 2883 return 0; 2884 } 2885 2886 static moduledata_t ip_mroutemod = { 2887 "ip_mroute", 2888 ip_mroute_modevent, 2889 0 2890 }; 2891 2892 DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PSEUDO, SI_ORDER_ANY); 2893