/*-
 * SPDX-License-Identifier: BSD-3-Clause
 *
 * Copyright (c) 1989 Stephen Deering
 * Copyright (c) 1992, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * Stephen Deering of Stanford University.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * IP multicast forwarding procedures
 *
 * Written by David Waitzman, BBN Labs, August 1988.
 * Modified by Steve Deering, Stanford, February 1989.
 * Modified by Mark J. Steiglitz, Stanford, May, 1991
 * Modified by Van Jacobson, LBL, January 1993
 * Modified by Ajit Thyagarajan, PARC, August 1993
 * Modified by Bill Fenner, PARC, April 1995
 * Modified by Ahmed Helmy, SGI, June 1996
 * Modified by George Edmond Eddy (Rusty), ISI, February 1998
 * Modified by Pavlin Radoslavov, USC/ISI, May 1998, August 1999, October 2000
 * Modified by Hitoshi Asaeda, WIDE, August 2000
 * Modified by Pavlin Radoslavov, ICSI, October 2002
 * Modified by Wojciech Macek, Semihalf, May 2021
 *
 * MROUTING Revision: 3.5
 * and PIM-SMv2 and PIM-DM support, advanced API support,
 * bandwidth metering and signaling
 */

/*
 * TODO: Prefix functions with ipmf_.
 * TODO: Maintain a refcount on if_allmulti() in ifnet or in the protocol
 *	 domain attachment (if_afdata) so we can track consumers of that service.
 * TODO: Deprecate routing socket path for SIOCGETSGCNT and SIOCGETVIFCNT,
 *	 move it to socket options.
 * TODO: Cleanup LSRR removal further.
 * TODO: Push RSVP stubs into raw_ip.c.
 * TODO: Use bitstring.h for vif set.
 * TODO: Fix mrt6_ioctl dangling ref when dynamically loaded.
 * TODO: Sync ip6_mroute.c with this file.
 */

#include <sys/cdefs.h>
#include "opt_inet.h"
#include "opt_mrouting.h"

#define _PIM_VT 1

#include <sys/types.h>
#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/stddef.h>
#include <sys/condvar.h>
#include <sys/eventhandler.h>
#include <sys/lock.h>
#include <sys/kthread.h>
#include <sys/ktr.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/module.h>
#include <sys/priv.h>
#include <sys/protosw.h>
#include <sys/signalvar.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sockio.h>
#include <sys/sx.h>
#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
#include <sys/time.h>
#include <sys/counter.h>
#include <machine/atomic.h>

#include <net/if.h>
#include <net/if_var.h>
#include <net/if_private.h>
#include <net/if_types.h>
#include <net/netisr.h>
#include <net/route.h>
#include <net/vnet.h>

#include <netinet/in.h>
#include <netinet/igmp.h>
#include <netinet/in_systm.h>
#include <netinet/in_var.h>
#include <netinet/ip.h>
#include <netinet/ip_encap.h>
#include <netinet/ip_mroute.h>
#include <netinet/ip_var.h>
#include <netinet/ip_options.h>
#include <netinet/pim.h>
#include <netinet/pim_var.h>
#include <netinet/udp.h>

#include <machine/in_cksum.h>

#ifndef KTR_IPMF
#define KTR_IPMF KTR_INET
#endif

#define VIFI_INVALID	((vifi_t) -1)

static MALLOC_DEFINE(M_MRTABLE, "mroutetbl", "multicast forwarding cache");

/*
 * Locking.  A single rwlock (mrouter_lock) protects both the virtual
 * interface table and the multicast forwarding cache, along with their
 * related data structures.  In addition, an sx lock (mrouter_teardown)
 * serializes enabling the multicast router against tearing it down.
 * When both locks are needed, the sx lock must always be taken first.
 */

static struct sx __exclusive_cache_line mrouter_teardown;
#define	MRW_TEARDOWN_WLOCK()	sx_xlock(&mrouter_teardown)
#define	MRW_TEARDOWN_WUNLOCK()	sx_xunlock(&mrouter_teardown)
#define	MRW_TEARDOWN_LOCK_INIT()					\
	sx_init(&mrouter_teardown, "IPv4 multicast forwarding teardown")
#define	MRW_TEARDOWN_LOCK_DESTROY()	sx_destroy(&mrouter_teardown)

static struct rwlock mrouter_lock;
#define	MRW_RLOCK()		rw_rlock(&mrouter_lock)
#define	MRW_WLOCK()		rw_wlock(&mrouter_lock)
#define	MRW_RUNLOCK()		rw_runlock(&mrouter_lock)
#define	MRW_WUNLOCK()		rw_wunlock(&mrouter_lock)
#define	MRW_UNLOCK()		rw_unlock(&mrouter_lock)
#define	MRW_LOCK_ASSERT()	rw_assert(&mrouter_lock, RA_LOCKED)
#define	MRW_WLOCK_ASSERT()	rw_assert(&mrouter_lock, RA_WLOCKED)
#define	MRW_LOCK_TRY_UPGRADE()	rw_try_upgrade(&mrouter_lock)
#define	MRW_WOWNED()		rw_wowned(&mrouter_lock)
#define	MRW_LOCK_INIT()						\
	rw_init(&mrouter_lock, "IPv4 multicast forwarding")
#define	MRW_LOCK_DESTROY()	rw_destroy(&mrouter_lock)
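
/*
 * Illustrative sketch (not a code path of this file): a writer that
 * needs both locks takes them in the order documented above, teardown
 * sx lock first, then the rwlock:
 *
 *	MRW_TEARDOWN_WLOCK();
 *	MRW_WLOCK();
 *	... modify vif and MFC state ...
 *	MRW_WUNLOCK();
 *	MRW_TEARDOWN_WUNLOCK();
 *
 * ip_mrouter_init() and X_ip_mrouter_done() below follow this pattern.
 */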

static int ip_mrouter_cnt;	/* # of vnets with active mrouters */
static int ip_mrouter_unloading; /* Allow no more V_ip_mrouter sockets */

VNET_PCPUSTAT_DEFINE_STATIC(struct mrtstat, mrtstat);
VNET_PCPUSTAT_SYSINIT(mrtstat);
VNET_PCPUSTAT_SYSUNINIT(mrtstat);
SYSCTL_VNET_PCPUSTAT(_net_inet_ip, OID_AUTO, mrtstat, struct mrtstat,
    mrtstat, "IPv4 Multicast Forwarding Statistics (struct mrtstat, "
    "netinet/ip_mroute.h)");

VNET_DEFINE_STATIC(u_long, mfchash);
#define	V_mfchash		VNET(mfchash)
#define	MFCHASH(a, g)							\
	((((a).s_addr >> 20) ^ ((a).s_addr >> 10) ^ (a).s_addr ^	\
	((g).s_addr >> 20) ^ ((g).s_addr >> 10) ^ (g).s_addr) & V_mfchash)
#define	MFCHASHSIZE	256
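
/*
 * Example: a flow's bucket is selected from both addresses, e.g.
 * LIST_FIRST(&V_mfchashtbl[MFCHASH(ip->ip_src, ip->ip_dst)]).  Since
 * the XOR fold is masked with V_mfchash (the mask filled in by
 * hashinit_flags(), i.e. the table size minus one), mfchashsize is
 * assumed to be a power of two.
 */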

static u_long mfchashsize = MFCHASHSIZE;	/* Hash size */
SYSCTL_ULONG(_net_inet_ip, OID_AUTO, mfchashsize, CTLFLAG_RDTUN,
    &mfchashsize, 0, "IPv4 Multicast Forwarding Table hash size");
VNET_DEFINE_STATIC(u_char *, nexpire);		/* 0..mfchashsize-1 */
#define	V_nexpire		VNET(nexpire)
VNET_DEFINE_STATIC(LIST_HEAD(mfchashhdr, mfc)*, mfchashtbl);
#define	V_mfchashtbl		VNET(mfchashtbl)
VNET_DEFINE_STATIC(struct taskqueue *, task_queue);
#define	V_task_queue		VNET(task_queue)
VNET_DEFINE_STATIC(struct task, task);
#define	V_task			VNET(task)

VNET_DEFINE_STATIC(vifi_t, numvifs);
#define	V_numvifs		VNET(numvifs)
VNET_DEFINE_STATIC(struct vif *, viftable);
#define	V_viftable		VNET(viftable)

static eventhandler_tag if_detach_event_tag = NULL;

VNET_DEFINE_STATIC(struct callout, expire_upcalls_ch);
#define	V_expire_upcalls_ch	VNET(expire_upcalls_ch)

VNET_DEFINE_STATIC(struct mtx, buf_ring_mtx);
#define	V_buf_ring_mtx		VNET(buf_ring_mtx)

#define	EXPIRE_TIMEOUT		(hz / 4)	/* 4x / second */
#define	UPCALL_EXPIRE		6		/* number of timeouts */
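
/*
 * Taken together, an unserviced upcall entry is torn down after
 * UPCALL_EXPIRE * EXPIRE_TIMEOUT ticks, i.e. 6 * (hz / 4), or about
 * 1.5 seconds.
 */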

/*
 * Bandwidth meter variables and constants
 */
static MALLOC_DEFINE(M_BWMETER, "bwmeter", "multicast upcall bw meters");

/*
 * Pending upcalls are stored in a ring which is flushed when
 * full, or periodically
 */
VNET_DEFINE_STATIC(struct callout, bw_upcalls_ch);
#define	V_bw_upcalls_ch		VNET(bw_upcalls_ch)
VNET_DEFINE_STATIC(struct buf_ring *, bw_upcalls_ring);
#define	V_bw_upcalls_ring	VNET(bw_upcalls_ring)
VNET_DEFINE_STATIC(struct mtx, bw_upcalls_ring_mtx);
#define	V_bw_upcalls_ring_mtx	VNET(bw_upcalls_ring_mtx)

#define	BW_UPCALLS_PERIOD	(hz)	/* periodical flush of bw upcalls */

VNET_PCPUSTAT_DEFINE_STATIC(struct pimstat, pimstat);
VNET_PCPUSTAT_SYSINIT(pimstat);
VNET_PCPUSTAT_SYSUNINIT(pimstat);

SYSCTL_NODE(_net_inet, IPPROTO_PIM, pim, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "PIM");
SYSCTL_VNET_PCPUSTAT(_net_inet_pim, PIMCTL_STATS, stats, struct pimstat,
    pimstat, "PIM Statistics (struct pimstat, netinet/pim_var.h)");

static u_long pim_squelch_wholepkt = 0;
SYSCTL_ULONG(_net_inet_pim, OID_AUTO, squelch_wholepkt, CTLFLAG_RWTUN,
    &pim_squelch_wholepkt, 0,
    "Disable IGMP_WHOLEPKT notifications if rendezvous point is unspecified");

static const struct encaptab *pim_encap_cookie;
static int pim_encapcheck(const struct mbuf *, int, int, void *);
static int pim_input(struct mbuf *, int, int, void *);

extern int in_mcast_loop;

static const struct encap_config ipv4_encap_cfg = {
	.proto = IPPROTO_PIM,
	.min_length = sizeof(struct ip) + PIM_MINLEN,
	.exact_match = 8,
	.check = pim_encapcheck,
	.input = pim_input
};

/*
 * Note: the PIM Register encapsulation adds the following in front of a
 * data packet:
 *
 * struct pim_encap_hdr {
 *	struct ip ip;
 *	struct pim_encap_pimhdr pim;
 * }
 *
 */
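
/*
 * A sketch of the resulting wire layout (assuming the outer header
 * carries no IP options): 20 bytes of struct ip, then 8 bytes of
 * struct pim_encap_pimhdr (a 4-byte struct pim plus a 32-bit flags
 * word), i.e. 28 bytes prepended to the original data packet.
 */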

struct pim_encap_pimhdr {
	struct pim pim;
	uint32_t flags;
};
#define	PIM_ENCAP_TTL	64

static struct ip pim_encap_iphdr = {
#if BYTE_ORDER == LITTLE_ENDIAN
	sizeof(struct ip) >> 2,
	IPVERSION,
#else
	IPVERSION,
	sizeof(struct ip) >> 2,
#endif
	0,			/* tos */
	sizeof(struct ip),	/* total length */
	0,			/* id */
	0,			/* frag offset */
	PIM_ENCAP_TTL,
	IPPROTO_PIM,
	0,			/* checksum */
};

static struct pim_encap_pimhdr pim_encap_pimhdr = {
	{
		PIM_MAKE_VT(PIM_VERSION, PIM_REGISTER), /* PIM vers and message type */
		0,			/* reserved */
		0,			/* checksum */
	},
	0				/* flags */
};

VNET_DEFINE_STATIC(vifi_t, reg_vif_num) = VIFI_INVALID;
#define	V_reg_vif_num		VNET(reg_vif_num)
VNET_DEFINE_STATIC(struct ifnet *, multicast_register_if);
#define	V_multicast_register_if	VNET(multicast_register_if)

/*
 * Private variables.
 */

static u_long	X_ip_mcast_src(int);
static int	X_ip_mforward(struct ip *, struct ifnet *, struct mbuf *,
		    struct ip_moptions *);
static int	X_ip_mrouter_done(void);
static int	X_ip_mrouter_get(struct socket *, struct sockopt *);
static int	X_ip_mrouter_set(struct socket *, struct sockopt *);
static int	X_legal_vif_num(int);
static int	X_mrt_ioctl(u_long, caddr_t, int);

static int	add_bw_upcall(struct bw_upcall *);
static int	add_mfc(struct mfcctl2 *);
static int	add_vif(struct vifctl *);
static void	bw_meter_prepare_upcall(struct bw_meter *, struct timeval *);
static void	bw_meter_geq_receive_packet(struct bw_meter *, int,
		    struct timeval *);
static void	bw_upcalls_send(void);
static int	del_bw_upcall(struct bw_upcall *);
static int	del_mfc(struct mfcctl2 *);
static int	del_vif(vifi_t);
static int	del_vif_locked(vifi_t, struct ifnet **, struct ifnet **);
static void	expire_bw_upcalls_send(void *);
static void	expire_mfc(struct mfc *);
static void	expire_upcalls(void *);
static void	free_bw_list(struct bw_meter *);
static int	get_sg_cnt(struct sioc_sg_req *);
static int	get_vif_cnt(struct sioc_vif_req *);
static void	if_detached_event(void *, struct ifnet *);
static int	ip_mdq(struct mbuf *, struct ifnet *, struct mfc *, vifi_t);
static int	ip_mrouter_init(struct socket *, int);
static __inline struct mfc *
		mfc_find(struct in_addr *, struct in_addr *);
static void	phyint_send(struct ip *, struct vif *, struct mbuf *);
static struct mbuf *
		pim_register_prepare(struct ip *, struct mbuf *);
static int	pim_register_send(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static int	pim_register_send_rp(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static int	pim_register_send_upcall(struct ip *, struct vif *,
		    struct mbuf *, struct mfc *);
static void	send_packet(struct vif *, struct mbuf *);
static int	set_api_config(uint32_t *);
static int	set_assert(int);
static int	socket_send(struct socket *, struct mbuf *,
		    struct sockaddr_in *);

/*
 * Kernel multicast forwarding API capabilities and setup.
 * If more API capabilities are added to the kernel, they should be
 * recorded in `mrt_api_support'.
 */
#define	MRT_API_VERSION		0x0305

static const int mrt_api_version = MRT_API_VERSION;
static const uint32_t mrt_api_support = (MRT_MFC_FLAGS_DISABLE_WRONGVIF |
    MRT_MFC_FLAGS_BORDER_VIF |
    MRT_MFC_RP |
    MRT_MFC_BW_UPCALL);
VNET_DEFINE_STATIC(uint32_t, mrt_api_config);
#define	V_mrt_api_config	VNET(mrt_api_config)
VNET_DEFINE_STATIC(int, pim_assert_enabled);
#define	V_pim_assert_enabled	VNET(pim_assert_enabled)
static struct timeval pim_assert_interval = { 3, 0 };	/* Rate limit */
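
/*
 * With the 3 second interval above, the ratecheck() in ip_mdq() lets
 * at most one IGMPMSG_WRONGVIF upcall through per MFC entry every
 * three seconds.
 */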

/*
 * Find a route for a given origin IP address and multicast group address.
 * Statistics must be updated by the caller.
 */
static __inline struct mfc *
mfc_find(struct in_addr *o, struct in_addr *g)
{
	struct mfc *rt;

	/*
	 * May be called with either the read or the write lock held;
	 * it is the caller's responsibility to acquire the correct one.
	 */
	MRW_LOCK_ASSERT();

	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(*o, *g)], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, *o) &&
		    in_hosteq(rt->mfc_mcastgrp, *g) &&
		    buf_ring_empty(rt->mfc_stall_ring))
			break;
	}

	return (rt);
}

static __inline struct mfc *
mfc_alloc(void)
{
	struct mfc *rt;

	rt = malloc(sizeof(*rt), M_MRTABLE, M_NOWAIT | M_ZERO);
	if (rt == NULL)
		return rt;

	rt->mfc_stall_ring = buf_ring_alloc(MAX_UPQ, M_MRTABLE,
	    M_NOWAIT, &V_buf_ring_mtx);
	if (rt->mfc_stall_ring == NULL) {
		free(rt, M_MRTABLE);
		return NULL;
	}

	return rt;
}

/*
 * Handle MRT setsockopt commands to modify the multicast forwarding tables.
 */
static int
X_ip_mrouter_set(struct socket *so, struct sockopt *sopt)
{
	int error, optval;
	vifi_t vifi;
	struct vifctl vifc;
	struct mfcctl2 mfc;
	struct bw_upcall bw_upcall;
	uint32_t i;

	if (so != V_ip_mrouter && sopt->sopt_name != MRT_INIT)
		return EPERM;

	error = 0;
	switch (sopt->sopt_name) {
	case MRT_INIT:
		error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
		if (error)
			break;
		error = ip_mrouter_init(so, optval);
		break;
	case MRT_DONE:
		error = ip_mrouter_done();
		break;
	case MRT_ADD_VIF:
		error = sooptcopyin(sopt, &vifc, sizeof vifc, sizeof vifc);
		if (error)
			break;
		error = add_vif(&vifc);
		break;
	case MRT_DEL_VIF:
		error = sooptcopyin(sopt, &vifi, sizeof vifi, sizeof vifi);
		if (error)
			break;
		error = del_vif(vifi);
		break;
	case MRT_ADD_MFC:
	case MRT_DEL_MFC:
		/*
		 * select data size depending on API version.
		 */
		if (sopt->sopt_name == MRT_ADD_MFC &&
		    V_mrt_api_config & MRT_API_FLAGS_ALL) {
			error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl2),
			    sizeof(struct mfcctl2));
		} else {
			error = sooptcopyin(sopt, &mfc, sizeof(struct mfcctl),
			    sizeof(struct mfcctl));
			bzero((caddr_t)&mfc + sizeof(struct mfcctl),
			    sizeof(mfc) - sizeof(struct mfcctl));
		}
		if (error)
			break;
		if (sopt->sopt_name == MRT_ADD_MFC)
			error = add_mfc(&mfc);
		else
			error = del_mfc(&mfc);
		break;

	case MRT_ASSERT:
		error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval);
		if (error)
			break;
		set_assert(optval);
		break;

	case MRT_API_CONFIG:
		error = sooptcopyin(sopt, &i, sizeof i, sizeof i);
		if (!error)
			error = set_api_config(&i);
		if (!error)
			error = sooptcopyout(sopt, &i, sizeof i);
		break;

	case MRT_ADD_BW_UPCALL:
	case MRT_DEL_BW_UPCALL:
		error = sooptcopyin(sopt, &bw_upcall, sizeof bw_upcall,
		    sizeof bw_upcall);
		if (error)
			break;
		if (sopt->sopt_name == MRT_ADD_BW_UPCALL)
			error = add_bw_upcall(&bw_upcall);
		else
			error = del_bw_upcall(&bw_upcall);
		break;

	default:
		error = EOPNOTSUPP;
		break;
	}
	return error;
}
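
/*
 * Illustrative userland counterpart (a sketch; the descriptor `s` and
 * the vifctl contents are placeholders): a routing daemon drives the
 * options handled above through a raw IGMP socket, roughly:
 *
 *	int s, v = 1;
 *	struct vifctl vc = { .vifc_vifi = 0 };	(remaining fields elided)
 *
 *	s = socket(AF_INET, SOCK_RAW, IPPROTO_IGMP);
 *	setsockopt(s, IPPROTO_IP, MRT_INIT, &v, sizeof(v));
 *	setsockopt(s, IPPROTO_IP, MRT_ADD_VIF, &vc, sizeof(vc));
 *	...
 *	setsockopt(s, IPPROTO_IP, MRT_DONE, NULL, 0);
 */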

/*
 * Handle MRT getsockopt commands
 */
static int
X_ip_mrouter_get(struct socket *so, struct sockopt *sopt)
{
	int error;

	switch (sopt->sopt_name) {
	case MRT_VERSION:
		error = sooptcopyout(sopt, &mrt_api_version,
		    sizeof mrt_api_version);
		break;
	case MRT_ASSERT:
		error = sooptcopyout(sopt, &V_pim_assert_enabled,
		    sizeof V_pim_assert_enabled);
		break;
	case MRT_API_SUPPORT:
		error = sooptcopyout(sopt, &mrt_api_support,
		    sizeof mrt_api_support);
		break;
	case MRT_API_CONFIG:
		error = sooptcopyout(sopt, &V_mrt_api_config,
		    sizeof V_mrt_api_config);
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
	return error;
}

/*
 * Handle ioctl commands to obtain information from the cache
 */
static int
X_mrt_ioctl(u_long cmd, caddr_t data, int fibnum __unused)
{
	int error;

	/*
	 * Currently the only function calling this ioctl routine is
	 * rtioctl_fib().  Typically, only root can create the raw socket
	 * in order to execute this ioctl method; however, the request
	 * might be coming from a prison.
	 */
	error = priv_check(curthread, PRIV_NETINET_MROUTE);
	if (error)
		return (error);
	switch (cmd) {
	case (SIOCGETVIFCNT):
		error = get_vif_cnt((struct sioc_vif_req *)data);
		break;

	case (SIOCGETSGCNT):
		error = get_sg_cnt((struct sioc_sg_req *)data);
		break;

	default:
		error = EINVAL;
		break;
	}
	return error;
}
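
/*
 * Illustrative userland counterpart (a sketch; the descriptor `s` and
 * the address values are placeholders): the per-flow counters are
 * fetched with the routing ioctls over the same raw socket, e.g.
 *
 *	struct sioc_sg_req sg;
 *
 *	sg.src.s_addr = inet_addr("10.0.0.1");
 *	sg.grp.s_addr = inet_addr("239.1.1.1");
 *	if (ioctl(s, SIOCGETSGCNT, &sg) == 0)
 *		printf("%lu pkts\n", sg.pktcnt);
 */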

/*
 * returns the packet, byte, rpf-failure count for the source group provided
 */
static int
get_sg_cnt(struct sioc_sg_req *req)
{
	struct mfc *rt;

	MRW_RLOCK();
	rt = mfc_find(&req->src, &req->grp);
	if (rt == NULL) {
		MRW_RUNLOCK();
		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffff;
		return EADDRNOTAVAIL;
	}
	req->pktcnt = rt->mfc_pkt_cnt;
	req->bytecnt = rt->mfc_byte_cnt;
	req->wrong_if = rt->mfc_wrong_if;
	MRW_RUNLOCK();
	return 0;
}

/*
 * returns the input and output packet and byte counts on the vif provided
 */
static int
get_vif_cnt(struct sioc_vif_req *req)
{
	vifi_t vifi = req->vifi;

	MRW_RLOCK();
	if (vifi >= V_numvifs) {
		MRW_RUNLOCK();
		return EINVAL;
	}

	mtx_lock_spin(&V_viftable[vifi].v_spin);
	req->icount = V_viftable[vifi].v_pkt_in;
	req->ocount = V_viftable[vifi].v_pkt_out;
	req->ibytes = V_viftable[vifi].v_bytes_in;
	req->obytes = V_viftable[vifi].v_bytes_out;
	mtx_unlock_spin(&V_viftable[vifi].v_spin);
	MRW_RUNLOCK();

	return 0;
}

static void
if_detached_event(void *arg __unused, struct ifnet *ifp)
{
	vifi_t vifi;
	u_long i, vifi_cnt = 0;
	struct ifnet *free_ptr, *multi_leave;

	MRW_WLOCK();

	if (V_ip_mrouter == NULL) {
		MRW_WUNLOCK();
		return;
	}

	/*
	 * Tear down multicast forwarder state associated with this ifnet.
	 * 1. Walk the vif list, matching vifs against this ifnet.
	 * 2. Walk the multicast forwarding cache (mfc) looking for
	 *    inner matches with this vif's index.
	 * 3. Expire any matching multicast forwarding cache entries.
	 * 4. Free vif state. This should disable ALLMULTI on the interface.
	 */
restart:
	for (vifi = 0; vifi < V_numvifs; vifi++) {
		if (V_viftable[vifi].v_ifp != ifp)
			continue;
		for (i = 0; i < mfchashsize; i++) {
			struct mfc *rt, *nrt;

			LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
				if (rt->mfc_parent == vifi) {
					expire_mfc(rt);
				}
			}
		}
		del_vif_locked(vifi, &multi_leave, &free_ptr);
		if (free_ptr != NULL)
			vifi_cnt++;
		if (multi_leave) {
			MRW_WUNLOCK();
			if_allmulti(multi_leave, 0);
			MRW_WLOCK();
			goto restart;
		}
	}

	MRW_WUNLOCK();

	/*
	 * Free the ifp.  We don't need to use free_ptr here, as it is the
	 * same as ifp.  Call if_free() as many times as required in case
	 * the refcount is greater than 1.
	 */
	for (i = 0; i < vifi_cnt; i++)
		if_free(ifp);
}

static void
ip_mrouter_upcall_thread(void *arg, int pending __unused)
{
	CURVNET_SET((struct vnet *) arg);

	MRW_WLOCK();
	bw_upcalls_send();
	MRW_WUNLOCK();

	CURVNET_RESTORE();
}

/*
 * Enable multicast forwarding.
 */
static int
ip_mrouter_init(struct socket *so, int version)
{

	CTR2(KTR_IPMF, "%s: so %p", __func__, so);

	if (version != 1)
		return ENOPROTOOPT;

	MRW_TEARDOWN_WLOCK();
	MRW_WLOCK();

	if (ip_mrouter_unloading) {
		MRW_WUNLOCK();
		MRW_TEARDOWN_WUNLOCK();
		return ENOPROTOOPT;
	}

	if (V_ip_mrouter != NULL) {
		MRW_WUNLOCK();
		MRW_TEARDOWN_WUNLOCK();
		return EADDRINUSE;
	}

	V_mfchashtbl = hashinit_flags(mfchashsize, M_MRTABLE, &V_mfchash,
	    HASH_NOWAIT);
	if (V_mfchashtbl == NULL) {
		MRW_WUNLOCK();
		MRW_TEARDOWN_WUNLOCK();
		return (ENOMEM);
	}

	/* Create upcall ring */
	mtx_init(&V_bw_upcalls_ring_mtx, "mroute upcall buf_ring mtx", NULL, MTX_DEF);
	V_bw_upcalls_ring = buf_ring_alloc(BW_UPCALLS_MAX, M_MRTABLE,
	    M_NOWAIT, &V_bw_upcalls_ring_mtx);
	if (!V_bw_upcalls_ring) {
		MRW_WUNLOCK();
		MRW_TEARDOWN_WUNLOCK();
		return (ENOMEM);
	}

	TASK_INIT(&V_task, 0, ip_mrouter_upcall_thread, curvnet);
	taskqueue_cancel(V_task_queue, &V_task, NULL);
	taskqueue_unblock(V_task_queue);

	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
	    curvnet);
	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
	    curvnet);

	V_ip_mrouter = so;
	atomic_add_int(&ip_mrouter_cnt, 1);

	/* This is a mutex required by buf_ring init, but not used internally */
	mtx_init(&V_buf_ring_mtx, "mroute buf_ring mtx", NULL, MTX_DEF);

	MRW_WUNLOCK();
	MRW_TEARDOWN_WUNLOCK();

	CTR1(KTR_IPMF, "%s: done", __func__);

	return 0;
}

/*
 * Disable multicast forwarding.
 */
static int
X_ip_mrouter_done(void)
{
	struct ifnet **ifps;
	int nifp;
	u_long i;
	vifi_t vifi;
	struct bw_upcall *bu;

	MRW_TEARDOWN_WLOCK();

	if (V_ip_mrouter == NULL) {
		MRW_TEARDOWN_WUNLOCK();
		return (EINVAL);
	}

	/*
	 * Detach/disable hooks to the rest of the system.
	 */
	V_ip_mrouter = NULL;
	atomic_subtract_int(&ip_mrouter_cnt, 1);
	V_mrt_api_config = 0;

	/*
	 * Wait for all epoch sections to complete to ensure
	 * V_ip_mrouter = NULL is visible to others.
	 */
	NET_EPOCH_WAIT();

	/* Stop and drain task queue */
	taskqueue_block(V_task_queue);
	while (taskqueue_cancel(V_task_queue, &V_task, NULL)) {
		taskqueue_drain(V_task_queue, &V_task);
	}

	ifps = malloc(MAXVIFS * sizeof(*ifps), M_TEMP, M_WAITOK);

	MRW_WLOCK();
	taskqueue_cancel(V_task_queue, &V_task, NULL);

	/* Destroy upcall ring */
	while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) {
		free(bu, M_MRTABLE);
	}
	buf_ring_free(V_bw_upcalls_ring, M_MRTABLE);
	mtx_destroy(&V_bw_upcalls_ring_mtx);

	/*
	 * For each phyint in use, prepare to disable promiscuous reception
	 * of all IP multicasts.  Defer the actual call until the lock is
	 * released; just record the list of interfaces while locked.  Some
	 * interfaces use sx locks in their ioctl routines, which is not
	 * allowed while holding a non-sleepable lock.
	 */
	KASSERT(V_numvifs <= MAXVIFS, ("More vifs than possible"));
	for (vifi = 0, nifp = 0; vifi < V_numvifs; vifi++) {
		if (!in_nullhost(V_viftable[vifi].v_lcl_addr) &&
		    !(V_viftable[vifi].v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
			ifps[nifp++] = V_viftable[vifi].v_ifp;
		}
	}
	bzero((caddr_t)V_viftable, sizeof(*V_viftable) * MAXVIFS);
	V_numvifs = 0;
	V_pim_assert_enabled = 0;

	callout_stop(&V_expire_upcalls_ch);
	callout_stop(&V_bw_upcalls_ch);

	/*
	 * Free all multicast forwarding cache entries.
	 * Do not use hashdestroy(), as we must perform other cleanup.
	 */
	for (i = 0; i < mfchashsize; i++) {
		struct mfc *rt, *nrt;

		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
			expire_mfc(rt);
		}
	}
	free(V_mfchashtbl, M_MRTABLE);
	V_mfchashtbl = NULL;

	bzero(V_nexpire, sizeof(V_nexpire[0]) * mfchashsize);

	V_reg_vif_num = VIFI_INVALID;

	mtx_destroy(&V_buf_ring_mtx);

	MRW_WUNLOCK();
	MRW_TEARDOWN_WUNLOCK();

	/*
	 * Now drop our claim on promiscuous multicast on the interfaces
	 * recorded above.  This is safe to do now because ALLMULTI is
	 * reference counted.
	 */
	for (vifi = 0; vifi < nifp; vifi++)
		if_allmulti(ifps[vifi], 0);
	free(ifps, M_TEMP);

	CTR1(KTR_IPMF, "%s: done", __func__);

	return 0;
}

/*
 * Set PIM assert processing global
 */
static int
set_assert(int i)
{
	if ((i != 1) && (i != 0))
		return EINVAL;

	V_pim_assert_enabled = i;

	return 0;
}

/*
 * Configure API capabilities
 */
static int
set_api_config(uint32_t *apival)
{
	u_long i;

	/*
	 * We can set the API capabilities only if it is the first operation
	 * after MRT_INIT. I.e.:
	 *  - there are no vifs installed
	 *  - pim_assert is not enabled
	 *  - the MFC table is empty
	 */
	if (V_numvifs > 0) {
		*apival = 0;
		return EPERM;
	}
	if (V_pim_assert_enabled) {
		*apival = 0;
		return EPERM;
	}

	MRW_RLOCK();

	for (i = 0; i < mfchashsize; i++) {
		if (LIST_FIRST(&V_mfchashtbl[i]) != NULL) {
			MRW_RUNLOCK();
			*apival = 0;
			return EPERM;
		}
	}

	MRW_RUNLOCK();

	V_mrt_api_config = *apival & mrt_api_support;
	*apival = V_mrt_api_config;

	return 0;
}

/*
 * Add a vif to the vif table
 */
static int
add_vif(struct vifctl *vifcp)
{
	struct vif *vifp = V_viftable + vifcp->vifc_vifi;
	struct sockaddr_in sin = {sizeof sin, AF_INET};
	struct ifaddr *ifa;
	struct ifnet *ifp;
	int error;

	if (vifcp->vifc_vifi >= MAXVIFS)
		return EINVAL;
	/* rate limiting is no longer supported by this code */
	if (vifcp->vifc_rate_limit != 0) {
		log(LOG_ERR, "rate limiting is no longer supported\n");
		return EINVAL;
	}

	if (in_nullhost(vifcp->vifc_lcl_addr))
		return EADDRNOTAVAIL;

	/* Find the interface with an address in AF_INET family */
	if (vifcp->vifc_flags & VIFF_REGISTER) {
		/*
		 * XXX: Because VIFF_REGISTER does not really need a valid
		 * local interface (e.g. it could be 127.0.0.2), we don't
		 * check its address.
		 */
		ifp = NULL;
	} else {
		struct epoch_tracker et;

		sin.sin_addr = vifcp->vifc_lcl_addr;
		NET_EPOCH_ENTER(et);
		ifa = ifa_ifwithaddr((struct sockaddr *)&sin);
		if (ifa == NULL) {
			NET_EPOCH_EXIT(et);
			return EADDRNOTAVAIL;
		}
		ifp = ifa->ifa_ifp;
		/* XXX FIXME we need to take a ref on ifp and cleanup properly! */
		NET_EPOCH_EXIT(et);
	}

	if ((vifcp->vifc_flags & VIFF_TUNNEL) != 0) {
		CTR1(KTR_IPMF, "%s: tunnels are no longer supported", __func__);
		return EOPNOTSUPP;
	} else if (vifcp->vifc_flags & VIFF_REGISTER) {
		ifp = V_multicast_register_if = if_alloc(IFT_LOOP);
		CTR2(KTR_IPMF, "%s: add register vif for ifp %p", __func__, ifp);
		if (V_reg_vif_num == VIFI_INVALID) {
			if_initname(V_multicast_register_if, "register_vif", 0);
			V_reg_vif_num = vifcp->vifc_vifi;
		}
	} else {		/* Make sure the interface supports multicast */
		if ((ifp->if_flags & IFF_MULTICAST) == 0)
			return EOPNOTSUPP;

		/* Enable promiscuous reception of all IP multicasts from the if */
		error = if_allmulti(ifp, 1);
		if (error)
			return error;
	}

	MRW_WLOCK();

	if (!in_nullhost(vifp->v_lcl_addr)) {
		if (ifp)
			V_multicast_register_if = NULL;
		MRW_WUNLOCK();
		if (ifp)
			if_free(ifp);
		return EADDRINUSE;
	}

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_rmt_addr = vifcp->vifc_rmt_addr;
	vifp->v_ifp = ifp;
	/* initialize per vif pkt counters */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;
	sprintf(vifp->v_spin_name, "BM[%d] spin", vifcp->vifc_vifi);
	mtx_init(&vifp->v_spin, vifp->v_spin_name, NULL, MTX_SPIN);

	/* Adjust numvifs up if the vifi is higher than numvifs */
	if (V_numvifs <= vifcp->vifc_vifi)
		V_numvifs = vifcp->vifc_vifi + 1;

	MRW_WUNLOCK();

	CTR4(KTR_IPMF, "%s: add vif %d laddr 0x%08x thresh %x", __func__,
	    (int)vifcp->vifc_vifi, ntohl(vifcp->vifc_lcl_addr.s_addr),
	    (int)vifcp->vifc_threshold);

	return 0;
}

/*
 * Delete a vif from the vif table
 */
static int
del_vif_locked(vifi_t vifi, struct ifnet **ifp_multi_leave,
    struct ifnet **ifp_free)
{
	struct vif *vifp;

	*ifp_free = NULL;
	*ifp_multi_leave = NULL;

	MRW_WLOCK_ASSERT();

	if (vifi >= V_numvifs) {
		return EINVAL;
	}
	vifp = &V_viftable[vifi];
	if (in_nullhost(vifp->v_lcl_addr)) {
		return EADDRNOTAVAIL;
	}

	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER)))
		*ifp_multi_leave = vifp->v_ifp;

	if (vifp->v_flags & VIFF_REGISTER) {
		V_reg_vif_num = VIFI_INVALID;
		if (vifp->v_ifp) {
			if (vifp->v_ifp == V_multicast_register_if)
				V_multicast_register_if = NULL;
			*ifp_free = vifp->v_ifp;
		}
	}

	mtx_destroy(&vifp->v_spin);

	bzero((caddr_t)vifp, sizeof (*vifp));

	CTR2(KTR_IPMF, "%s: delete vif %d", __func__, (int)vifi);

	/* Adjust numvifs down */
	for (vifi = V_numvifs; vifi > 0; vifi--)
		if (!in_nullhost(V_viftable[vifi-1].v_lcl_addr))
			break;
	V_numvifs = vifi;

	return 0;
}

static int
del_vif(vifi_t vifi)
{
	int cc;
	struct ifnet *free_ptr, *multi_leave;

	MRW_WLOCK();
	cc = del_vif_locked(vifi, &multi_leave, &free_ptr);
	MRW_WUNLOCK();

	if (multi_leave)
		if_allmulti(multi_leave, 0);
	if (free_ptr) {
		if_free(free_ptr);
	}

	return cc;
}

/*
 * update an mfc entry without resetting counters and S,G addresses.
 */
static void
update_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	int i;

	rt->mfc_parent = mfccp->mfcc_parent;
	for (i = 0; i < V_numvifs; i++) {
		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		rt->mfc_flags[i] = mfccp->mfcc_flags[i] & V_mrt_api_config &
		    MRT_MFC_FLAGS_ALL;
	}
	/* set the RP address */
	if (V_mrt_api_config & MRT_MFC_RP)
		rt->mfc_rp = mfccp->mfcc_rp;
	else
		rt->mfc_rp.s_addr = INADDR_ANY;
}

/*
 * fully initialize an mfc entry from the parameter.
 */
static void
init_mfc_params(struct mfc *rt, struct mfcctl2 *mfccp)
{
	rt->mfc_origin = mfccp->mfcc_origin;
	rt->mfc_mcastgrp = mfccp->mfcc_mcastgrp;

	update_mfc_params(rt, mfccp);

	/* initialize pkt counters per src-grp */
	rt->mfc_pkt_cnt = 0;
	rt->mfc_byte_cnt = 0;
	rt->mfc_wrong_if = 0;
	timevalclear(&rt->mfc_last_assert);
}

static void
expire_mfc(struct mfc *rt)
{
	struct rtdetq *rte;

	MRW_WLOCK_ASSERT();

	free_bw_list(rt->mfc_bw_meter_leq);
	free_bw_list(rt->mfc_bw_meter_geq);

	while (!buf_ring_empty(rt->mfc_stall_ring)) {
		rte = buf_ring_dequeue_mc(rt->mfc_stall_ring);
		if (rte) {
			m_freem(rte->m);
			free(rte, M_MRTABLE);
		}
	}
	buf_ring_free(rt->mfc_stall_ring, M_MRTABLE);

	LIST_REMOVE(rt, mfc_hash);
	free(rt, M_MRTABLE);
}

/*
 * Add an mfc entry
 */
static int
add_mfc(struct mfcctl2 *mfccp)
{
	struct mfc *rt;
	struct rtdetq *rte;
	u_long hash = 0;
	u_short nstl;
	struct epoch_tracker et;

	MRW_WLOCK();
	rt = mfc_find(&mfccp->mfcc_origin, &mfccp->mfcc_mcastgrp);

	/* If an entry already exists, just update the fields */
	if (rt) {
		CTR4(KTR_IPMF, "%s: update mfc orig 0x%08x group %lx parent %x",
		    __func__, ntohl(mfccp->mfcc_origin.s_addr),
		    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
		    mfccp->mfcc_parent);
		update_mfc_params(rt, mfccp);
		MRW_WUNLOCK();
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update
	 */
	nstl = 0;
	hash = MFCHASH(mfccp->mfcc_origin, mfccp->mfcc_mcastgrp);
	NET_EPOCH_ENTER(et);
	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp) &&
		    !buf_ring_empty(rt->mfc_stall_ring)) {
			CTR5(KTR_IPMF,
			    "%s: add mfc orig 0x%08x group %lx parent %x qh %p",
			    __func__, ntohl(mfccp->mfcc_origin.s_addr),
			    (u_long)ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent,
			    rt->mfc_stall_ring);
			if (nstl++)
				CTR1(KTR_IPMF, "%s: multiple matches", __func__);

			init_mfc_params(rt, mfccp);
			rt->mfc_expire = 0;	/* Don't clean this guy up */
			V_nexpire[hash]--;

			/* Free queued packets, but attempt to forward them first. */
			while (!buf_ring_empty(rt->mfc_stall_ring)) {
				rte = buf_ring_dequeue_mc(rt->mfc_stall_ring);
				if (rte->ifp != NULL)
					ip_mdq(rte->m, rte->ifp, rt, -1);
				m_freem(rte->m);
				free(rte, M_MRTABLE);
			}
		}
	}
	NET_EPOCH_EXIT(et);

	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		CTR1(KTR_IPMF, "%s: adding mfc w/o upcall", __func__);
		LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash) {
			if (in_hosteq(rt->mfc_origin, mfccp->mfcc_origin) &&
			    in_hosteq(rt->mfc_mcastgrp, mfccp->mfcc_mcastgrp)) {
				init_mfc_params(rt, mfccp);
				if (rt->mfc_expire)
					V_nexpire[hash]--;
				rt->mfc_expire = 0;
				break;	/* XXX */
			}
		}

		if (rt == NULL) {	/* no upcall, so make a new entry */
			rt = mfc_alloc();
			if (rt == NULL) {
				MRW_WUNLOCK();
				return (ENOBUFS);
			}

			init_mfc_params(rt, mfccp);

			rt->mfc_expire = 0;
			rt->mfc_bw_meter_leq = NULL;
			rt->mfc_bw_meter_geq = NULL;

			/* insert new entry at head of hash chain */
			LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
		}
	}

	MRW_WUNLOCK();

	return (0);
}

/*
 * Delete an mfc entry
 */
static int
del_mfc(struct mfcctl2 *mfccp)
{
	struct in_addr origin;
	struct in_addr mcastgrp;
	struct mfc *rt;

	origin = mfccp->mfcc_origin;
	mcastgrp = mfccp->mfcc_mcastgrp;

	CTR3(KTR_IPMF, "%s: delete mfc orig 0x%08x group %lx", __func__,
	    ntohl(origin.s_addr), (u_long)ntohl(mcastgrp.s_addr));

	MRW_WLOCK();

	LIST_FOREACH(rt, &V_mfchashtbl[MFCHASH(origin, mcastgrp)], mfc_hash) {
		if (in_hosteq(rt->mfc_origin, origin) &&
		    in_hosteq(rt->mfc_mcastgrp, mcastgrp))
			break;
	}
	if (rt == NULL) {
		MRW_WUNLOCK();
		return EADDRNOTAVAIL;
	}

	expire_mfc(rt);

	MRW_WUNLOCK();

	return (0);
}

/*
 * Send a message to the routing daemon on the multicast routing socket.
 */
static int
socket_send(struct socket *s, struct mbuf *mm, struct sockaddr_in *src)
{
	if (s) {
		SOCKBUF_LOCK(&s->so_rcv);
		if (sbappendaddr_locked(&s->so_rcv, (struct sockaddr *)src, mm,
		    NULL) != 0) {
			sorwakeup_locked(s);
			return 0;
		}
		soroverflow_locked(s);
	}
	m_freem(mm);
	return -1;
}

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by "ip" has arrived on (or is about to be sent to) the interface
 * pointed to by "ifp", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a non-zero return value tells the caller to
 * discard it.
 */

#define TUNNEL_LEN 12	/* # bytes of IP option for tunnel encapsulation */

static int
X_ip_mforward(struct ip *ip, struct ifnet *ifp, struct mbuf *m,
    struct ip_moptions *imo)
{
	struct mfc *rt;
	int error;
	vifi_t vifi;
	struct mbuf *mb0;
	struct rtdetq *rte;
	u_long hash;
	int hlen;

	M_ASSERTMAPPED(m);

	CTR3(KTR_IPMF, "ip_mforward: src 0x%08x group %lx ifp %p",
	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr), ifp);

	if (ip->ip_hl < (sizeof(struct ip) + TUNNEL_LEN) >> 2 ||
	    ((u_char *)(ip + 1))[1] != IPOPT_LSRR) {
		/*
		 * Packet arrived via a physical interface or
		 * an encapsulated tunnel or a register_vif.
		 */
	} else {
		/*
		 * Packet arrived through a source-route tunnel.
		 * Source-route tunnels are no longer supported.
		 */
		return (1);
	}

	/*
	 * BEGIN: MCAST ROUTING HOT PATH
	 */
	MRW_RLOCK();
	if (imo && ((vifi = imo->imo_multicast_vif) < V_numvifs)) {
		if (ip->ip_ttl < MAXTTL)
			ip->ip_ttl++;	/* compensate for -1 in *_send routines */
		error = ip_mdq(m, ifp, NULL, vifi);
		MRW_RUNLOCK();
		return error;
	}

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (ip->ip_ttl <= 1 || IN_LOCAL_GROUP(ntohl(ip->ip_dst.s_addr))) {
		MRW_RUNLOCK();
		return 0;
	}

mfc_find_retry:
	/*
	 * Determine forwarding vifs from the forwarding cache table
	 */
	MRTSTAT_INC(mrts_mfc_lookups);
	rt = mfc_find(&ip->ip_src, &ip->ip_dst);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		error = ip_mdq(m, ifp, rt, -1);
		/* Generic unlock here as we might release R or W lock */
		MRW_UNLOCK();
		return error;
	}

	/*
	 * END: MCAST ROUTING HOT PATH
	 */

	/* Further processing must be done with WLOCK taken */
	if ((MRW_WOWNED() == 0) && (MRW_LOCK_TRY_UPGRADE() == 0)) {
		MRW_RUNLOCK();
		MRW_WLOCK();
		goto mfc_find_retry;
	}

	/*
	 * If we don't have a route for the packet's origin, make a copy
	 * of the packet and send a message to the routing daemon.
	 */
	hlen = ip->ip_hl << 2;

	MRTSTAT_INC(mrts_mfc_misses);
	MRTSTAT_INC(mrts_no_route);
	CTR2(KTR_IPMF, "ip_mforward: no mfc for (0x%08x,%lx)",
	    ntohl(ip->ip_src.s_addr), (u_long)ntohl(ip->ip_dst.s_addr));

	/*
	 * Allocate mbufs early so that we don't do extra work if we are
	 * just going to fail anyway.  Make sure to pullup the header so
	 * that other people can't step on it.
	 */
	rte = malloc((sizeof *rte), M_MRTABLE, M_NOWAIT|M_ZERO);
	if (rte == NULL) {
		MRW_WUNLOCK();
		return ENOBUFS;
	}

	mb0 = m_copypacket(m, M_NOWAIT);
	if (mb0 && (!M_WRITABLE(mb0) || mb0->m_len < hlen))
		mb0 = m_pullup(mb0, hlen);
	if (mb0 == NULL) {
		free(rte, M_MRTABLE);
		MRW_WUNLOCK();
		return ENOBUFS;
	}

	/* is there an upcall waiting for this flow? */
	hash = MFCHASH(ip->ip_src, ip->ip_dst);
	LIST_FOREACH(rt, &V_mfchashtbl[hash], mfc_hash)
	{
		if (in_hosteq(ip->ip_src, rt->mfc_origin) &&
		    in_hosteq(ip->ip_dst, rt->mfc_mcastgrp) &&
		    !buf_ring_empty(rt->mfc_stall_ring))
			break;
	}

	if (rt == NULL) {
		int i;
		struct igmpmsg *im;
		struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
		struct mbuf *mm;

		/*
		 * Locate the vifi for the incoming interface for this packet.
		 * If none found, drop packet.
		 */
		for (vifi = 0; vifi < V_numvifs &&
		    V_viftable[vifi].v_ifp != ifp; vifi++)
			;
		if (vifi >= V_numvifs)	/* vif not found, drop packet */
			goto non_fatal;

		/* no upcall, so make a new entry */
		rt = mfc_alloc();
		if (rt == NULL)
			goto fail;

		/* Make a copy of the header to send to the user level process */
		mm = m_copym(mb0, 0, hlen, M_NOWAIT);
		if (mm == NULL)
			goto fail1;

		/*
		 * Send message to routing daemon to install
		 * a route into the kernel table
		 */

		im = mtod(mm, struct igmpmsg *);
		im->im_msgtype = IGMPMSG_NOCACHE;
		im->im_mbz = 0;
		im->im_vif = vifi;

		MRTSTAT_INC(mrts_upcalls);

		k_igmpsrc.sin_addr = ip->ip_src;
		if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
			CTR0(KTR_IPMF, "ip_mforward: socket queue full");
			MRTSTAT_INC(mrts_upq_sockfull);
fail1:			free(rt, M_MRTABLE);
fail:			free(rte, M_MRTABLE);
			m_freem(mb0);
			MRW_WUNLOCK();
			return ENOBUFS;
		}

		/* insert new entry at head of hash chain */
		rt->mfc_origin.s_addr = ip->ip_src.s_addr;
		rt->mfc_mcastgrp.s_addr = ip->ip_dst.s_addr;
		rt->mfc_expire = UPCALL_EXPIRE;
		V_nexpire[hash]++;
		for (i = 0; i < V_numvifs; i++) {
			rt->mfc_ttls[i] = 0;
			rt->mfc_flags[i] = 0;
		}
		rt->mfc_parent = -1;

		/* clear the RP address */
		rt->mfc_rp.s_addr = INADDR_ANY;
		rt->mfc_bw_meter_leq = NULL;
		rt->mfc_bw_meter_geq = NULL;

		/* initialize pkt counters per src-grp */
		rt->mfc_pkt_cnt = 0;
		rt->mfc_byte_cnt = 0;
		rt->mfc_wrong_if = 0;
		timevalclear(&rt->mfc_last_assert);

		buf_ring_enqueue(rt->mfc_stall_ring, rte);

		/* Add RT to hashtable as it didn't exist before */
		LIST_INSERT_HEAD(&V_mfchashtbl[hash], rt, mfc_hash);
	} else {
		/* determine if queue has overflowed */
		if (buf_ring_full(rt->mfc_stall_ring)) {
			MRTSTAT_INC(mrts_upq_ovflw);
non_fatal:		free(rte, M_MRTABLE);
			m_freem(mb0);
			MRW_WUNLOCK();
			return (0);
		}

		buf_ring_enqueue(rt->mfc_stall_ring, rte);
	}

	rte->m = mb0;
	rte->ifp = ifp;

	MRW_WUNLOCK();

	return 0;
}

/*
 * Clean up the cache entry if upcall is not serviced
 */
static void
expire_upcalls(void *arg)
{
	u_long i;

	CURVNET_SET((struct vnet *) arg);

	/* This callout is always run with MRW_WLOCK taken. */

	for (i = 0; i < mfchashsize; i++) {
		struct mfc *rt, *nrt;

		if (V_nexpire[i] == 0)
			continue;

		LIST_FOREACH_SAFE(rt, &V_mfchashtbl[i], mfc_hash, nrt) {
			if (buf_ring_empty(rt->mfc_stall_ring))
				continue;

			if (rt->mfc_expire == 0 || --rt->mfc_expire > 0)
				continue;

			MRTSTAT_INC(mrts_cache_cleanups);
			CTR3(KTR_IPMF, "%s: expire (%lx, %lx)", __func__,
			    (u_long)ntohl(rt->mfc_origin.s_addr),
			    (u_long)ntohl(rt->mfc_mcastgrp.s_addr));

			expire_mfc(rt);
		}
	}

	callout_reset(&V_expire_upcalls_ch, EXPIRE_TIMEOUT, expire_upcalls,
	    curvnet);

	CURVNET_RESTORE();
}

/*
 * Packet forwarding routine once entry in the cache is made
 */
static int
ip_mdq(struct mbuf *m, struct ifnet *ifp, struct mfc *rt, vifi_t xmt_vif)
{
	struct ip *ip = mtod(m, struct ip *);
	vifi_t vifi;
	int plen = ntohs(ip->ip_len);

	M_ASSERTMAPPED(m);
	MRW_LOCK_ASSERT();
	NET_EPOCH_ASSERT();

	/*
	 * If xmt_vif is not -1, send on only the requested vif.
	 *
	 * (since vifi_t is u_short, -1 becomes MAXUSHORT, which is > numvifs.)
	 */
	if (xmt_vif < V_numvifs) {
		if (V_viftable[xmt_vif].v_flags & VIFF_REGISTER)
			pim_register_send(ip, V_viftable + xmt_vif, m, rt);
		else
			phyint_send(ip, V_viftable + xmt_vif, m);
		return 1;
	}

	/*
	 * Don't forward if it didn't arrive from the parent vif for its origin.
	 */
	vifi = rt->mfc_parent;
	if ((vifi >= V_numvifs) || (V_viftable[vifi].v_ifp != ifp)) {
		CTR4(KTR_IPMF, "%s: rx on wrong ifp %p (vifi %d, v_ifp %p)",
		    __func__, ifp, (int)vifi, V_viftable[vifi].v_ifp);
		MRTSTAT_INC(mrts_wrong_if);
		++rt->mfc_wrong_if;
		/*
		 * If we are doing PIM assert processing, send a message
		 * to the routing daemon.
		 *
		 * XXX: A PIM-SM router needs the WRONGVIF detection so it
		 * can complete the SPT switch, regardless of the type
		 * of the iif (broadcast media, GRE tunnel, etc).
		 */
		if (V_pim_assert_enabled && (vifi < V_numvifs) &&
		    V_viftable[vifi].v_ifp) {
			if (ifp == V_multicast_register_if)
				PIMSTAT_INC(pims_rcv_registers_wrongiif);

			/* Get vifi for the incoming packet */
			for (vifi = 0; vifi < V_numvifs &&
			    V_viftable[vifi].v_ifp != ifp; vifi++)
				;
			if (vifi >= V_numvifs)
				return 0;	/* The iif is not found: ignore the packet. */

			if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_DISABLE_WRONGVIF)
				return 0;	/* WRONGVIF disabled: ignore the packet */

			if (ratecheck(&rt->mfc_last_assert, &pim_assert_interval)) {
				struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
				struct igmpmsg *im;
				int hlen = ip->ip_hl << 2;
				struct mbuf *mm = m_copym(m, 0, hlen, M_NOWAIT);

				if (mm && (!M_WRITABLE(mm) || mm->m_len < hlen))
					mm = m_pullup(mm, hlen);
				if (mm == NULL)
					return ENOBUFS;

				im = mtod(mm, struct igmpmsg *);
				im->im_msgtype = IGMPMSG_WRONGVIF;
				im->im_mbz = 0;
				im->im_vif = vifi;

				MRTSTAT_INC(mrts_upcalls);

				k_igmpsrc.sin_addr = im->im_src;
				if (socket_send(V_ip_mrouter, mm, &k_igmpsrc) < 0) {
					CTR1(KTR_IPMF, "%s: socket queue full", __func__);
					MRTSTAT_INC(mrts_upq_sockfull);
					return ENOBUFS;
				}
			}
		}
		return 0;
	}

	/* If I sourced this packet, it counts as output, else it was input. */
	mtx_lock_spin(&V_viftable[vifi].v_spin);
	if (in_hosteq(ip->ip_src, V_viftable[vifi].v_lcl_addr)) {
		V_viftable[vifi].v_pkt_out++;
		V_viftable[vifi].v_bytes_out += plen;
	} else {
		V_viftable[vifi].v_pkt_in++;
		V_viftable[vifi].v_bytes_in += plen;
	}
	mtx_unlock_spin(&V_viftable[vifi].v_spin);

	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;

	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *	- the ttl exceeds the vif's threshold
	 *	- there are group members downstream on interface
	 */
	for (vifi = 0; vifi < V_numvifs; vifi++)
		if ((rt->mfc_ttls[vifi] > 0) && (ip->ip_ttl > rt->mfc_ttls[vifi])) {
			V_viftable[vifi].v_pkt_out++;
			V_viftable[vifi].v_bytes_out += plen;
			if (V_viftable[vifi].v_flags & VIFF_REGISTER)
				pim_register_send(ip, V_viftable + vifi, m, rt);
			else
				phyint_send(ip, V_viftable + vifi, m);
		}

	/*
	 * Perform upcall-related bw measuring.
	 */
	if ((rt->mfc_bw_meter_geq != NULL) || (rt->mfc_bw_meter_leq != NULL)) {
		struct bw_meter *x;
		struct timeval now;

		microtime(&now);
		/* Process meters for Greater-or-EQual case */
		for (x = rt->mfc_bw_meter_geq; x != NULL; x = x->bm_mfc_next)
			bw_meter_geq_receive_packet(x, plen, &now);

		/* Process meters for Lower-or-EQual case */
		for (x = rt->mfc_bw_meter_leq; x != NULL; x = x->bm_mfc_next) {
			/*
			 * Record that a packet is received.
			 * Spin lock has to be taken as callout context
			 * (expire_bw_meter_leq) might modify these fields
			 * as well
			 */
			mtx_lock_spin(&x->bm_spin);
			x->bm_measured.b_packets++;
			x->bm_measured.b_bytes += plen;
			mtx_unlock_spin(&x->bm_spin);
		}
	}

	return 0;
}

/*
 * Check if a vif number is legal/ok. This is used by in_mcast.c.
 */
static int
X_legal_vif_num(int vif)
{
	int ret;

	ret = 0;
	if (vif < 0)
		return (ret);

	MRW_RLOCK();
	if (vif < V_numvifs)
		ret = 1;
	MRW_RUNLOCK();

	return (ret);
}

/*
 * Return the local address used by this vif
 */
static u_long
X_ip_mcast_src(int vifi)
{
	in_addr_t addr;

	addr = INADDR_ANY;
	if (vifi < 0)
		return (addr);

	MRW_RLOCK();
	if (vifi < V_numvifs)
		addr = V_viftable[vifi].v_lcl_addr.s_addr;
	MRW_RUNLOCK();

	return (addr);
}

static void
phyint_send(struct ip *ip, struct vif *vifp, struct mbuf *m)
{
	struct mbuf *mb_copy;
	int hlen = ip->ip_hl << 2;

	MRW_LOCK_ASSERT();
	M_ASSERTMAPPED(m);

	/*
	 * Make a new reference to the packet; make sure that
	 * the IP header is actually copied, not just referenced,
	 * so that ip_output() only scribbles on the copy.
	 */
	mb_copy = m_copypacket(m, M_NOWAIT);
	if (mb_copy && (!M_WRITABLE(mb_copy) || mb_copy->m_len < hlen))
		mb_copy = m_pullup(mb_copy, hlen);
	if (mb_copy == NULL)
		return;

	send_packet(vifp, mb_copy);
}

static void
send_packet(struct vif *vifp, struct mbuf *m)
{
	struct ip_moptions imo;
	int error __unused;

	MRW_LOCK_ASSERT();
	NET_EPOCH_ASSERT();

	imo.imo_multicast_ifp = vifp->v_ifp;
	imo.imo_multicast_ttl = mtod(m, struct ip *)->ip_ttl - 1;
	imo.imo_multicast_loop = !!in_mcast_loop;
	imo.imo_multicast_vif = -1;
	STAILQ_INIT(&imo.imo_head);

	/*
	 * Re-entrancy should not be a problem here, because
	 * the packets that we send out and are looped back at us
	 * should get rejected because they appear to come from
	 * the loopback interface, thus preventing looping.
	 */
	error = ip_output(m, NULL, NULL, IP_FORWARDING, &imo, NULL);
	CTR3(KTR_IPMF, "%s: vif %td err %d", __func__,
	    (ptrdiff_t)(vifp - V_viftable), error);
}

/*
 * Stubs for old RSVP socket shim implementation.
 */

static int
X_ip_rsvp_vif(struct socket *so __unused, struct sockopt *sopt __unused)
{

	return (EOPNOTSUPP);
}

static void
X_ip_rsvp_force_done(struct socket *so __unused)
{

}

static int
X_rsvp_input(struct mbuf **mp, int *offp, int proto)
{
	struct mbuf *m;

	m = *mp;
	*mp = NULL;
	if (!V_rsvp_on)
		m_freem(m);
	return (IPPROTO_DONE);
}

/*
 * Code for bandwidth monitors
 */

/*
 * Define common interface for timeval-related methods
 */
#define	BW_TIMEVALCMP(tvp, uvp, cmp) timevalcmp((tvp), (uvp), cmp)
#define	BW_TIMEVALDECR(vvp, uvp) timevalsub((vvp), (uvp))
#define	BW_TIMEVALADD(vvp, uvp) timevaladd((vvp), (uvp))

static uint32_t
compute_bw_meter_flags(struct bw_upcall *req)
{
	uint32_t flags = 0;

	if (req->bu_flags & BW_UPCALL_UNIT_PACKETS)
		flags |= BW_METER_UNIT_PACKETS;
	if (req->bu_flags & BW_UPCALL_UNIT_BYTES)
		flags |= BW_METER_UNIT_BYTES;
	if (req->bu_flags & BW_UPCALL_GEQ)
		flags |= BW_METER_GEQ;
	if (req->bu_flags & BW_UPCALL_LEQ)
		flags |= BW_METER_LEQ;

	return flags;
}

static void
expire_bw_meter_leq(void *arg)
{
	struct bw_meter *x = arg;
	struct timeval now;

	/*
	 * INFO:
	 * callout is always executed with MRW_WLOCK taken
	 */

	CURVNET_SET((struct vnet *)x->arg);

	microtime(&now);

	/*
	 * Test if we should deliver an upcall
	 */
	if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
	    (x->bm_measured.b_packets <= x->bm_threshold.b_packets)) ||
	    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
	    (x->bm_measured.b_bytes <= x->bm_threshold.b_bytes))) {
		/* Prepare an upcall for delivery */
		bw_meter_prepare_upcall(x, &now);
	}

	/* Send all upcalls that are pending delivery */
	taskqueue_enqueue(V_task_queue, &V_task);

	/* Reset counters */
	x->bm_start_time = now;
	/*
	 * Spin lock has to be taken as ip_forward context
	 * might modify these fields as well
	 */
	mtx_lock_spin(&x->bm_spin);
	x->bm_measured.b_bytes = 0;
	x->bm_measured.b_packets = 0;
	mtx_unlock_spin(&x->bm_spin);

	callout_schedule(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time));

	CURVNET_RESTORE();
}
1910
1911 /*
1912 * Add a bw_meter entry
1913 */
1914 static int
add_bw_upcall(struct bw_upcall * req)1915 add_bw_upcall(struct bw_upcall *req)
1916 {
1917 struct mfc *mfc;
1918 struct timeval delta = { BW_UPCALL_THRESHOLD_INTERVAL_MIN_SEC,
1919 BW_UPCALL_THRESHOLD_INTERVAL_MIN_USEC };
1920 struct timeval now;
1921 struct bw_meter *x, **bwm_ptr;
1922 uint32_t flags;
1923
1924 if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
1925 return EOPNOTSUPP;
1926
1927 /* Test if the flags are valid */
1928 if (!(req->bu_flags & (BW_UPCALL_UNIT_PACKETS | BW_UPCALL_UNIT_BYTES)))
1929 return EINVAL;
1930 if (!(req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)))
1931 return EINVAL;
1932 if ((req->bu_flags & (BW_UPCALL_GEQ | BW_UPCALL_LEQ)) == (BW_UPCALL_GEQ | BW_UPCALL_LEQ))
1933 return EINVAL;
1934
1935 /* Test if the threshold time interval is valid */
1936 if (BW_TIMEVALCMP(&req->bu_threshold.b_time, &delta, <))
1937 return EINVAL;
1938
1939 flags = compute_bw_meter_flags(req);
1940
1941 /*
1942 * Find if we have already same bw_meter entry
1943 */
1944 MRW_WLOCK();
1945 mfc = mfc_find(&req->bu_src, &req->bu_dst);
1946 if (mfc == NULL) {
1947 MRW_WUNLOCK();
1948 return EADDRNOTAVAIL;
1949 }
1950
1951 /* Choose an appropriate bw_meter list */
1952 if (req->bu_flags & BW_UPCALL_GEQ)
1953 bwm_ptr = &mfc->mfc_bw_meter_geq;
1954 else
1955 bwm_ptr = &mfc->mfc_bw_meter_leq;
1956
1957 for (x = *bwm_ptr; x != NULL; x = x->bm_mfc_next) {
1958 if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
1959 &req->bu_threshold.b_time, ==))
1960 && (x->bm_threshold.b_packets
1961 == req->bu_threshold.b_packets)
1962 && (x->bm_threshold.b_bytes
1963 == req->bu_threshold.b_bytes)
1964 && (x->bm_flags & BW_METER_USER_FLAGS)
1965 == flags) {
1966 MRW_WUNLOCK();
1967 return 0; /* XXX Already installed */
1968 }
1969 }
1970
1971 /* Allocate the new bw_meter entry */
1972 x = malloc(sizeof(*x), M_BWMETER, M_ZERO | M_NOWAIT);
1973 if (x == NULL) {
1974 MRW_WUNLOCK();
1975 return ENOBUFS;
1976 }
1977
1978 /* Set the new bw_meter entry */
1979 x->bm_threshold.b_time = req->bu_threshold.b_time;
1980 microtime(&now);
1981 x->bm_start_time = now;
1982 x->bm_threshold.b_packets = req->bu_threshold.b_packets;
1983 x->bm_threshold.b_bytes = req->bu_threshold.b_bytes;
1984 x->bm_measured.b_packets = 0;
1985 x->bm_measured.b_bytes = 0;
1986 x->bm_flags = flags;
1987 x->bm_time_next = NULL;
1988 x->bm_mfc = mfc;
1989 x->arg = curvnet;
1990 sprintf(x->bm_spin_name, "BM spin %p", x);
1991 mtx_init(&x->bm_spin, x->bm_spin_name, NULL, MTX_SPIN);
1992
1993 /* For LEQ case create periodic callout */
1994 if (req->bu_flags & BW_UPCALL_LEQ) {
1995 callout_init_rw(&x->bm_meter_callout, &mrouter_lock, CALLOUT_SHAREDLOCK);
1996 callout_reset(&x->bm_meter_callout, tvtohz(&x->bm_threshold.b_time),
1997 expire_bw_meter_leq, x);
1998 }
1999
2000 /* Add the new bw_meter entry to the front of entries for this MFC */
2001 x->bm_mfc_next = *bwm_ptr;
2002 *bwm_ptr = x;
2003
2004 MRW_WUNLOCK();
2005
2006 return 0;
2007 }
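
/*
 * Illustrative sketch (not part of this file): a userland routing
 * daemon that has enabled MRT_MFC_BW_UPCALL through MRT_API_CONFIG
 * could install a ">= 10000 packets per 5 seconds" meter on its
 * mrouter socket roughly as follows; "mrouter_fd" and the addresses
 * are placeholders:
 *
 *	struct bw_upcall req;
 *
 *	memset(&req, 0, sizeof(req));
 *	req.bu_src.s_addr = inet_addr("192.0.2.1");	// source
 *	req.bu_dst.s_addr = inet_addr("224.1.2.3");	// group
 *	req.bu_flags = BW_UPCALL_UNIT_PACKETS | BW_UPCALL_GEQ;
 *	req.bu_threshold.b_time.tv_sec = 5;
 *	req.bu_threshold.b_packets = 10000;
 *	(void)setsockopt(mrouter_fd, IPPROTO_IP, MRT_ADD_BW_UPCALL,
 *	    &req, sizeof(req));
 */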

static void
free_bw_list(struct bw_meter *list)
{
	while (list != NULL) {
		struct bw_meter *x = list;

		/* MRW_WLOCK must be held here */
		if (x->bm_flags & BW_METER_LEQ) {
			callout_drain(&x->bm_meter_callout);
			mtx_destroy(&x->bm_spin);
		}

		list = list->bm_mfc_next;
		free(x, M_BWMETER);
	}
}

/*
 * Delete one or multiple bw_meter entries
 */
static int
del_bw_upcall(struct bw_upcall *req)
{
	struct mfc *mfc;
	struct bw_meter *x, **bwm_ptr;

	if (!(V_mrt_api_config & MRT_MFC_BW_UPCALL))
		return EOPNOTSUPP;

	MRW_WLOCK();

	/* Find the corresponding MFC entry */
	mfc = mfc_find(&req->bu_src, &req->bu_dst);
	if (mfc == NULL) {
		MRW_WUNLOCK();
		return EADDRNOTAVAIL;
	} else if (req->bu_flags & BW_UPCALL_DELETE_ALL) {
		/*
		 * Delete all bw_meter entries for this mfc
		 */
		struct bw_meter *list;

		/* Free LEQ list */
		list = mfc->mfc_bw_meter_leq;
		mfc->mfc_bw_meter_leq = NULL;
		free_bw_list(list);

		/* Free GEQ list */
		list = mfc->mfc_bw_meter_geq;
		mfc->mfc_bw_meter_geq = NULL;
		free_bw_list(list);
		MRW_WUNLOCK();
		return 0;
	} else {			/* Delete a single bw_meter entry */
		struct bw_meter *prev;
		uint32_t flags;

		flags = compute_bw_meter_flags(req);

		/* Choose an appropriate bw_meter list */
		if (req->bu_flags & BW_UPCALL_GEQ)
			bwm_ptr = &mfc->mfc_bw_meter_geq;
		else
			bwm_ptr = &mfc->mfc_bw_meter_leq;

		/* Find the bw_meter entry to delete */
		for (prev = NULL, x = *bwm_ptr; x != NULL;
		    prev = x, x = x->bm_mfc_next) {
			if ((BW_TIMEVALCMP(&x->bm_threshold.b_time,
			    &req->bu_threshold.b_time, ==)) &&
			    (x->bm_threshold.b_packets ==
			    req->bu_threshold.b_packets) &&
			    (x->bm_threshold.b_bytes ==
			    req->bu_threshold.b_bytes) &&
			    (x->bm_flags & BW_METER_USER_FLAGS) == flags)
				break;
		}
		if (x != NULL) { /* Delete entry from the list for this MFC */
			if (prev != NULL)
				prev->bm_mfc_next = x->bm_mfc_next;	/* remove from middle */
			else
				*bwm_ptr = x->bm_mfc_next;	/* new head of list */

			if (req->bu_flags & BW_UPCALL_LEQ)
				callout_stop(&x->bm_meter_callout);

			MRW_WUNLOCK();
			/* Free the bw_meter entry */
			free(x, M_BWMETER);
			return 0;
		} else {
			MRW_WUNLOCK();
			return EINVAL;
		}
	}
	__assert_unreachable();
}
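
/*
 * Illustrative sketch (not part of this file): the daemon can tear
 * down every meter on an (S,G) in one call by setting
 * BW_UPCALL_DELETE_ALL; only bu_src/bu_dst are consulted in that case.
 * "mrouter_fd" is again a placeholder:
 *
 *	memset(&req, 0, sizeof(req));
 *	req.bu_src.s_addr = inet_addr("192.0.2.1");
 *	req.bu_dst.s_addr = inet_addr("224.1.2.3");
 *	req.bu_flags = BW_UPCALL_DELETE_ALL;
 *	(void)setsockopt(mrouter_fd, IPPROTO_IP, MRT_DEL_BW_UPCALL,
 *	    &req, sizeof(req));
 */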

/*
 * Perform bandwidth measurement processing that may result in an upcall
 */
static void
bw_meter_geq_receive_packet(struct bw_meter *x, int plen, struct timeval *nowp)
{
	struct timeval delta;

	MRW_LOCK_ASSERT();

	delta = *nowp;
	BW_TIMEVALDECR(&delta, &x->bm_start_time);

	/*
	 * Processing for ">=" type of bw_meter entry.
	 * bm_spin does not have to be held here, as in the GEQ
	 * case this is the only context accessing bm_measured.
	 */
	if (BW_TIMEVALCMP(&delta, &x->bm_threshold.b_time, >)) {
		/* Reset the bw_meter entry */
		x->bm_start_time = *nowp;
		x->bm_measured.b_packets = 0;
		x->bm_measured.b_bytes = 0;
		x->bm_flags &= ~BW_METER_UPCALL_DELIVERED;
	}

	/* Record that a packet is received */
	x->bm_measured.b_packets++;
	x->bm_measured.b_bytes += plen;

	/*
	 * Test if we should deliver an upcall
	 */
	if (!(x->bm_flags & BW_METER_UPCALL_DELIVERED)) {
		if (((x->bm_flags & BW_METER_UNIT_PACKETS) &&
		    (x->bm_measured.b_packets >= x->bm_threshold.b_packets)) ||
		    ((x->bm_flags & BW_METER_UNIT_BYTES) &&
		    (x->bm_measured.b_bytes >= x->bm_threshold.b_bytes))) {
			/* Prepare an upcall for delivery */
			bw_meter_prepare_upcall(x, nowp);
			x->bm_flags |= BW_METER_UPCALL_DELIVERED;
		}
	}
}

/*
 * Prepare a bandwidth-related upcall
 */
static void
bw_meter_prepare_upcall(struct bw_meter *x, struct timeval *nowp)
{
	struct timeval delta;
	struct bw_upcall *u;

	MRW_LOCK_ASSERT();

	/*
	 * Compute the measured time interval
	 */
	delta = *nowp;
	BW_TIMEVALDECR(&delta, &x->bm_start_time);

	/*
	 * Set the bw_upcall entry
	 */
	u = malloc(sizeof(struct bw_upcall), M_MRTABLE, M_NOWAIT | M_ZERO);
	if (u == NULL) {
		log(LOG_WARNING,
		    "bw_meter_prepare_upcall: cannot allocate entry\n");
		return;
	}
	u->bu_src = x->bm_mfc->mfc_origin;
	u->bu_dst = x->bm_mfc->mfc_mcastgrp;
	u->bu_threshold.b_time = x->bm_threshold.b_time;
	u->bu_threshold.b_packets = x->bm_threshold.b_packets;
	u->bu_threshold.b_bytes = x->bm_threshold.b_bytes;
	u->bu_measured.b_time = delta;
	u->bu_measured.b_packets = x->bm_measured.b_packets;
	u->bu_measured.b_bytes = x->bm_measured.b_bytes;
	u->bu_flags = 0;
	if (x->bm_flags & BW_METER_UNIT_PACKETS)
		u->bu_flags |= BW_UPCALL_UNIT_PACKETS;
	if (x->bm_flags & BW_METER_UNIT_BYTES)
		u->bu_flags |= BW_UPCALL_UNIT_BYTES;
	if (x->bm_flags & BW_METER_GEQ)
		u->bu_flags |= BW_UPCALL_GEQ;
	if (x->bm_flags & BW_METER_LEQ)
		u->bu_flags |= BW_UPCALL_LEQ;

	if (buf_ring_enqueue(V_bw_upcalls_ring, u))
		log(LOG_WARNING,
		    "bw_meter_prepare_upcall: cannot enqueue upcall\n");
	if (buf_ring_count(V_bw_upcalls_ring) > (BW_UPCALLS_MAX / 2)) {
		taskqueue_enqueue(V_task_queue, &V_task);
	}
}

/*
 * Send the pending bandwidth-related upcalls
 */
static void
bw_upcalls_send(void)
{
	struct mbuf *m;
	int len = 0;
	struct bw_upcall *bu;
	struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };
	static struct igmpmsg igmpmsg = {
		0,			/* unused1 */
		0,			/* unused2 */
		IGMPMSG_BW_UPCALL,	/* im_msgtype */
		0,			/* im_mbz */
		0,			/* im_vif */
		0,			/* unused3 */
		{ 0 },			/* im_src */
		{ 0 }			/* im_dst */
	};

	MRW_LOCK_ASSERT();

	if (buf_ring_empty(V_bw_upcalls_ring))
		return;

	/*
	 * Allocate a new mbuf, initialize it with the header and
	 * the payload for the pending calls.
	 */
	m = m_gethdr(M_NOWAIT, MT_DATA);
	if (m == NULL) {
		log(LOG_WARNING, "bw_upcalls_send: cannot allocate mbuf\n");
		return;
	}

	m_copyback(m, 0, sizeof(struct igmpmsg), (caddr_t)&igmpmsg);
	len += sizeof(struct igmpmsg);
	while ((bu = buf_ring_dequeue_mc(V_bw_upcalls_ring)) != NULL) {
		m_copyback(m, len, sizeof(struct bw_upcall), (caddr_t)bu);
		len += sizeof(struct bw_upcall);
		free(bu, M_MRTABLE);
	}

	/*
	 * Send the upcalls
	 * XXX do we need to set the address in k_igmpsrc ?
	 */
	MRTSTAT_INC(mrts_upcalls);
	if (socket_send(V_ip_mrouter, m, &k_igmpsrc) < 0) {
		log(LOG_WARNING,
		    "bw_upcalls_send: ip_mrouter socket queue full\n");
		MRTSTAT_INC(mrts_upq_sockfull);
	}
}
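
/*
 * Illustrative sketch (not part of this file): on the daemon side the
 * message read from the mrouter socket is a struct igmpmsg header with
 * im_msgtype == IGMPMSG_BW_UPCALL, followed by one or more bw_upcall
 * records, matching the mbuf layout built above.  "buf" and "n" are
 * placeholders for the recv(2) buffer and byte count:
 *
 *	struct igmpmsg *im = (struct igmpmsg *)buf;
 *
 *	if (im->im_msgtype == IGMPMSG_BW_UPCALL) {
 *		struct bw_upcall *bu = (struct bw_upcall *)(im + 1);
 *		int cnt = (n - sizeof(*im)) / sizeof(*bu);
 *
 *		for (int i = 0; i < cnt; i++)
 *			handle_bw_upcall(&bu[i]);	// placeholder
 *	}
 */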

/*
 * A periodic function for sending all upcalls that are pending delivery
 */
static void
expire_bw_upcalls_send(void *arg)
{
	CURVNET_SET((struct vnet *) arg);

	/* This callout is run with MRW_RLOCK taken */

	bw_upcalls_send();

	callout_reset(&V_bw_upcalls_ch, BW_UPCALLS_PERIOD, expire_bw_upcalls_send,
	    curvnet);
	CURVNET_RESTORE();
}

/*
 * End of bandwidth monitoring code
 */

/*
 * Send the packet up to the user daemon, or alternatively do kernel
 * encapsulation.
 */
static int
pim_register_send(struct ip *ip, struct vif *vifp, struct mbuf *m,
    struct mfc *rt)
{
	struct mbuf *mb_copy, *mm;

	/*
	 * Do not send IGMP_WHOLEPKT notifications to userland if the
	 * rendezvous point was unspecified and we were told not to.
	 */
	if (pim_squelch_wholepkt != 0 && (V_mrt_api_config & MRT_MFC_RP) &&
	    in_nullhost(rt->mfc_rp))
		return 0;

	mb_copy = pim_register_prepare(ip, m);
	if (mb_copy == NULL)
		return ENOBUFS;

	/*
	 * Send all the fragments.  Note that the mbuf for each fragment
	 * is freed by the sending machinery.
	 */
	for (mm = mb_copy; mm; mm = mb_copy) {
		mb_copy = mm->m_nextpkt;
		mm->m_nextpkt = NULL;
		mm = m_pullup(mm, sizeof(struct ip));
		if (mm != NULL) {
			ip = mtod(mm, struct ip *);
			if ((V_mrt_api_config & MRT_MFC_RP) &&
			    !in_nullhost(rt->mfc_rp)) {
				pim_register_send_rp(ip, vifp, mm, rt);
			} else {
				pim_register_send_upcall(ip, vifp, mm, rt);
			}
		}
	}

	return 0;
}

/*
 * Return a copy of the data packet that is ready for PIM Register
 * encapsulation.
 * XXX: Note that in the returned copy the IP header is a valid one.
 */
static struct mbuf *
pim_register_prepare(struct ip *ip, struct mbuf *m)
{
	struct mbuf *mb_copy = NULL;
	int mtu;

	/* Take care of delayed checksums */
	if (m->m_pkthdr.csum_flags & CSUM_DELAY_DATA) {
		in_delayed_cksum(m);
		m->m_pkthdr.csum_flags &= ~CSUM_DELAY_DATA;
	}

	/*
	 * Copy the old packet & pullup its IP header into the
	 * new mbuf so we can modify it.
	 */
	mb_copy = m_copypacket(m, M_NOWAIT);
	if (mb_copy == NULL)
		return NULL;
	mb_copy = m_pullup(mb_copy, ip->ip_hl << 2);
	if (mb_copy == NULL)
		return NULL;

	/* take care of the TTL */
	ip = mtod(mb_copy, struct ip *);
	--ip->ip_ttl;

	/* Compute the MTU after the PIM Register encapsulation */
	mtu = 0xffff - sizeof(pim_encap_iphdr) - sizeof(pim_encap_pimhdr);

	if (ntohs(ip->ip_len) <= mtu) {
		/* Turn the IP header into a valid one */
		ip->ip_sum = 0;
		ip->ip_sum = in_cksum(mb_copy, ip->ip_hl << 2);
	} else {
		/* Fragment the packet */
		mb_copy->m_pkthdr.csum_flags |= CSUM_IP;
		if (ip_fragment(ip, &mb_copy, mtu, 0) != 0) {
			m_freem(mb_copy);
			return NULL;
		}
	}
	return mb_copy;
}

/*
 * Send an upcall with the data packet to the user-level process.
 */
static int
pim_register_send_upcall(struct ip *ip, struct vif *vifp,
    struct mbuf *mb_copy, struct mfc *rt)
{
	struct mbuf *mb_first;
	int len = ntohs(ip->ip_len);
	struct igmpmsg *im;
	struct sockaddr_in k_igmpsrc = { sizeof k_igmpsrc, AF_INET };

	MRW_LOCK_ASSERT();

	/*
	 * Add a new mbuf with an upcall header
	 */
	mb_first = m_gethdr(M_NOWAIT, MT_DATA);
	if (mb_first == NULL) {
		m_freem(mb_copy);
		return ENOBUFS;
	}
	mb_first->m_data += max_linkhdr;
	mb_first->m_pkthdr.len = len + sizeof(struct igmpmsg);
	mb_first->m_len = sizeof(struct igmpmsg);
	mb_first->m_next = mb_copy;

	/* Send message to routing daemon */
	im = mtod(mb_first, struct igmpmsg *);
	im->im_msgtype = IGMPMSG_WHOLEPKT;
	im->im_mbz = 0;
	im->im_vif = vifp - V_viftable;
	im->im_src = ip->ip_src;
	im->im_dst = ip->ip_dst;

	k_igmpsrc.sin_addr = ip->ip_src;

	MRTSTAT_INC(mrts_upcalls);

	if (socket_send(V_ip_mrouter, mb_first, &k_igmpsrc) < 0) {
		CTR1(KTR_IPMF, "%s: socket queue full", __func__);
		MRTSTAT_INC(mrts_upq_sockfull);
		return ENOBUFS;
	}

	/* Keep statistics */
	PIMSTAT_INC(pims_snd_registers_msgs);
	PIMSTAT_ADD(pims_snd_registers_bytes, len);

	return 0;
}
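
/*
 * Illustrative note (not part of this file): for IGMPMSG_WHOLEPKT the
 * daemon receives the entire data packet prefixed by the igmpmsg
 * header built above, e.g.:
 *
 *	struct igmpmsg *im = (struct igmpmsg *)buf;
 *
 *	if (im->im_msgtype == IGMPMSG_WHOLEPKT) {
 *		// data packet starts right after the upcall header
 *		struct ip *data_ip = (struct ip *)(im + 1);
 *		// typically: encapsulate in a PIM Register and unicast
 *		// it to the RP chosen by the daemon
 *	}
 */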

/*
 * Encapsulate the data packet in PIM Register message and send it to the RP.
 */
static int
pim_register_send_rp(struct ip *ip, struct vif *vifp, struct mbuf *mb_copy,
    struct mfc *rt)
{
	struct mbuf *mb_first;
	struct ip *ip_outer;
	struct pim_encap_pimhdr *pimhdr;
	int len = ntohs(ip->ip_len);
	vifi_t vifi = rt->mfc_parent;

	MRW_LOCK_ASSERT();

	if ((vifi >= V_numvifs) || in_nullhost(V_viftable[vifi].v_lcl_addr)) {
		m_freem(mb_copy);
		return EADDRNOTAVAIL;		/* The iif vif is invalid */
	}

	/*
	 * Add a new mbuf with the encapsulating header
	 */
	mb_first = m_gethdr(M_NOWAIT, MT_DATA);
	if (mb_first == NULL) {
		m_freem(mb_copy);
		return ENOBUFS;
	}
	mb_first->m_data += max_linkhdr;
	mb_first->m_len = sizeof(pim_encap_iphdr) + sizeof(pim_encap_pimhdr);
	mb_first->m_next = mb_copy;

	mb_first->m_pkthdr.len = len + mb_first->m_len;

	/*
	 * Fill in the encapsulating IP and PIM header
	 */
	ip_outer = mtod(mb_first, struct ip *);
	*ip_outer = pim_encap_iphdr;
	ip_outer->ip_len = htons(len + sizeof(pim_encap_iphdr) +
	    sizeof(pim_encap_pimhdr));
	ip_outer->ip_src = V_viftable[vifi].v_lcl_addr;
	ip_outer->ip_dst = rt->mfc_rp;
	/*
	 * Copy the inner header TOS to the outer header, and take care of the
	 * IP_DF bit.
	 */
	ip_outer->ip_tos = ip->ip_tos;
	if (ip->ip_off & htons(IP_DF))
		ip_outer->ip_off |= htons(IP_DF);
	ip_fillid(ip_outer);
	pimhdr = (struct pim_encap_pimhdr *)((caddr_t)ip_outer +
	    sizeof(pim_encap_iphdr));
	*pimhdr = pim_encap_pimhdr;
	/* If the iif crosses a border, set the Border-bit */
	if (rt->mfc_flags[vifi] & MRT_MFC_FLAGS_BORDER_VIF & V_mrt_api_config)
		pimhdr->flags |= htonl(PIM_BORDER_REGISTER);

	mb_first->m_data += sizeof(pim_encap_iphdr);
	pimhdr->pim.pim_cksum = in_cksum(mb_first, sizeof(pim_encap_pimhdr));
	mb_first->m_data -= sizeof(pim_encap_iphdr);

	send_packet(vifp, mb_first);

	/* Keep statistics */
	PIMSTAT_INC(pims_snd_registers_msgs);
	PIMSTAT_ADD(pims_snd_registers_bytes, len);

	return 0;
}
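
/*
 * Sketch of the Register packet built above (outer headers live in
 * mb_first, the original data packet in the mb_copy chain):
 *
 *	+--------------+------------+---------------+------------------+
 *	| outer IP hdr | PIM hdr    | Register hdr  | inner IP packet  |
 *	| proto = PIM  | type = REG | (flags word)  | (original data)  |
 *	+--------------+------------+---------------+------------------+
 *	 src = vif addr                              TTL already
 *	 dst = RP                                    decremented
 */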

/*
 * pim_encapcheck() is called by the encap4_input() path at runtime to
 * determine if a packet is for PIM, allowing PIM to be dynamically loaded
 * into the kernel.
 */
static int
pim_encapcheck(const struct mbuf *m __unused, int off __unused,
    int proto __unused, void *arg __unused)
{

	KASSERT(proto == IPPROTO_PIM, ("not for IPPROTO_PIM"));
	return (8);		/* claim the datagram. */
}

/*
 * PIM-SMv2 and PIM-DM messages processing.
 * Receives and verifies the PIM control messages, and passes them
 * up to the listening socket, using rip_input().
 * The only message with special processing is the PIM_REGISTER message
 * (used by PIM-SM): the PIM header is stripped off, and the inner packet
 * is passed to if_simloop().
 */
static int
pim_input(struct mbuf *m, int off, int proto, void *arg __unused)
{
	struct ip *ip = mtod(m, struct ip *);
	struct pim *pim;
	int iphlen = off;
	int minlen;
	int datalen = ntohs(ip->ip_len) - iphlen;
	int ip_tos;

	/* Keep statistics */
	PIMSTAT_INC(pims_rcv_total_msgs);
	PIMSTAT_ADD(pims_rcv_total_bytes, datalen);

	/*
	 * Validate lengths
	 */
	if (datalen < PIM_MINLEN) {
		PIMSTAT_INC(pims_rcv_tooshort);
		CTR3(KTR_IPMF, "%s: short packet (%d) from 0x%08x",
		    __func__, datalen, ntohl(ip->ip_src.s_addr));
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/*
	 * If the packet is at least as big as a REGISTER, go ahead
	 * and grab the PIM REGISTER header size, to avoid another
	 * possible m_pullup() later.
	 *
	 * PIM_MINLEN == pimhdr + u_int32_t == 4 + 4 = 8
	 * PIM_REG_MINLEN == pimhdr + reghdr + encap_iphdr == 4 + 4 + 20 = 28
	 */
	minlen = iphlen + (datalen >= PIM_REG_MINLEN ? PIM_REG_MINLEN : PIM_MINLEN);
	/*
	 * Get the IP and PIM headers in contiguous memory, and
	 * possibly the PIM REGISTER header.
	 */
	if (m->m_len < minlen && (m = m_pullup(m, minlen)) == NULL) {
		CTR1(KTR_IPMF, "%s: m_pullup() failed", __func__);
		return (IPPROTO_DONE);
	}

	/* m_pullup() may have given us a new mbuf so reset ip. */
	ip = mtod(m, struct ip *);
	ip_tos = ip->ip_tos;

	/* adjust mbuf to point to the PIM header */
	m->m_data += iphlen;
	m->m_len -= iphlen;
	pim = mtod(m, struct pim *);

	/*
	 * Validate checksum.  If PIM REGISTER, exclude the data packet.
	 *
	 * XXX: some older PIMv2 implementations don't make this distinction,
	 * so for compatibility reasons perform the checksum over part of the
	 * message, and if error, then over the whole message.
	 */
	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER &&
	    in_cksum(m, PIM_MINLEN) == 0) {
		/* do nothing, checksum okay */
	} else if (in_cksum(m, datalen)) {
		PIMSTAT_INC(pims_rcv_badsum);
		CTR1(KTR_IPMF, "%s: invalid checksum", __func__);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* PIM version check */
	if (PIM_VT_V(pim->pim_vt) < PIM_VERSION) {
		PIMSTAT_INC(pims_rcv_badversion);
		CTR3(KTR_IPMF, "%s: bad version %d expect %d", __func__,
		    (int)PIM_VT_V(pim->pim_vt), PIM_VERSION);
		m_freem(m);
		return (IPPROTO_DONE);
	}

	/* restore mbuf back to the outer IP */
	m->m_data -= iphlen;
	m->m_len += iphlen;

	if (PIM_VT_T(pim->pim_vt) == PIM_REGISTER) {
		/*
		 * Since this is a REGISTER, we'll make a copy of the register
		 * headers ip + pim + u_int32 + encap_ip, to be passed up to the
		 * routing daemon.
		 */
		struct sockaddr_in dst = { sizeof(dst), AF_INET };
		struct mbuf *mcp;
		struct ip *encap_ip;
		u_int32_t *reghdr;
		struct ifnet *vifp;

		MRW_RLOCK();
		if ((V_reg_vif_num >= V_numvifs) ||
		    (V_reg_vif_num == VIFI_INVALID)) {
			MRW_RUNLOCK();
			CTR2(KTR_IPMF, "%s: register vif not set: %d",
			    __func__, (int)V_reg_vif_num);
			m_freem(m);
			return (IPPROTO_DONE);
		}
		/* XXX need refcnt? */
		vifp = V_viftable[V_reg_vif_num].v_ifp;
		MRW_RUNLOCK();

		/*
		 * Validate length
		 */
		if (datalen < PIM_REG_MINLEN) {
			PIMSTAT_INC(pims_rcv_tooshort);
			PIMSTAT_INC(pims_rcv_badregisters);
			CTR1(KTR_IPMF, "%s: register packet size too small",
			    __func__);
			m_freem(m);
			return (IPPROTO_DONE);
		}

		reghdr = (u_int32_t *)(pim + 1);
		encap_ip = (struct ip *)(reghdr + 1);

		CTR3(KTR_IPMF, "%s: register: encap ip src 0x%08x len %d",
		    __func__, ntohl(encap_ip->ip_src.s_addr),
		    ntohs(encap_ip->ip_len));

		/* verify the version number of the inner packet */
		if (encap_ip->ip_v != IPVERSION) {
			PIMSTAT_INC(pims_rcv_badregisters);
			CTR1(KTR_IPMF, "%s: bad encap ip version", __func__);
			m_freem(m);
			return (IPPROTO_DONE);
		}

		/* verify the inner packet is destined to a mcast group */
		if (!IN_MULTICAST(ntohl(encap_ip->ip_dst.s_addr))) {
			PIMSTAT_INC(pims_rcv_badregisters);
			CTR2(KTR_IPMF, "%s: bad encap ip dest 0x%08x",
			    __func__, ntohl(encap_ip->ip_dst.s_addr));
			m_freem(m);
			return (IPPROTO_DONE);
		}

		/* If a NULL_REGISTER, pass it to the daemon */
		if ((ntohl(*reghdr) & PIM_NULL_REGISTER))
			goto pim_input_to_daemon;

		/*
		 * Copy the TOS from the outer IP header to the inner IP header.
		 */
		if (encap_ip->ip_tos != ip_tos) {
			/* Outer TOS -> inner TOS */
			encap_ip->ip_tos = ip_tos;
			/* Recompute the inner header checksum.  Sigh... */

			/* adjust mbuf to point to the inner IP header */
			m->m_data += (iphlen + PIM_MINLEN);
			m->m_len -= (iphlen + PIM_MINLEN);

			encap_ip->ip_sum = 0;
			encap_ip->ip_sum = in_cksum(m, encap_ip->ip_hl << 2);

			/* restore mbuf to point back to the outer IP header */
			m->m_data -= (iphlen + PIM_MINLEN);
			m->m_len += (iphlen + PIM_MINLEN);
		}

		/*
		 * Decapsulate the inner IP packet and loopback to forward it
		 * as a normal multicast packet.  Also, make a copy of the
		 *	outer_iphdr + pimhdr + reghdr + encap_iphdr
		 * to pass to the daemon later, so it can take the appropriate
		 * actions (e.g., send back PIM_REGISTER_STOP).
		 * XXX: here m->m_data points to the outer IP header.
		 */
		mcp = m_copym(m, 0, iphlen + PIM_REG_MINLEN, M_NOWAIT);
		if (mcp == NULL) {
			CTR1(KTR_IPMF, "%s: m_copym() failed", __func__);
			m_freem(m);
			return (IPPROTO_DONE);
		}

		/* Keep statistics */
		/* XXX: registers_bytes include only the encap. mcast pkt */
		PIMSTAT_INC(pims_rcv_registers_msgs);
		PIMSTAT_ADD(pims_rcv_registers_bytes, ntohs(encap_ip->ip_len));

		/*
		 * forward the inner ip packet; point m_data at the inner ip.
		 */
		m_adj(m, iphlen + PIM_MINLEN);

		CTR4(KTR_IPMF,
		    "%s: forward decap'd REGISTER: src %lx dst %lx vif %d",
		    __func__,
		    (u_long)ntohl(encap_ip->ip_src.s_addr),
		    (u_long)ntohl(encap_ip->ip_dst.s_addr),
		    (int)V_reg_vif_num);

		/* NB: vifp was collected above; can it change on us? */
		if_simloop(vifp, m, dst.sin_family, 0);

		/* prepare the register head to send to the mrouting daemon */
		m = mcp;
	}

pim_input_to_daemon:
	/*
	 * Pass the PIM message up to the daemon; if it is a Register message,
	 * pass the 'head' only up to the daemon.  This includes the
	 * outer IP header, PIM header, PIM-Register header and the
	 * inner IP header.
	 * XXX: the outer IP header pkt size of a Register is not adjusted to
	 * reflect the fact that the inner multicast data is truncated.
	 */
	return (rip_input(&m, &off, proto));
}

static int
sysctl_mfctable(SYSCTL_HANDLER_ARGS)
{
	struct mfc *rt;
	int error, i;

	if (req->newptr)
		return (EPERM);
	if (V_mfchashtbl == NULL)	/* XXX unlocked */
		return (0);
	error = sysctl_wire_old_buffer(req, 0);
	if (error)
		return (error);

	MRW_RLOCK();
	if (V_mfchashtbl == NULL)
		goto out_locked;

	for (i = 0; i < mfchashsize; i++) {
		LIST_FOREACH(rt, &V_mfchashtbl[i], mfc_hash) {
			error = SYSCTL_OUT(req, rt, sizeof(struct mfc));
			if (error)
				goto out_locked;
		}
	}
out_locked:
	MRW_RUNLOCK();
	return (error);
}

static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mfctable,
    CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_mfctable,
    "IPv4 Multicast Forwarding Table "
    "(struct *mfc[mfchashsize], netinet/ip_mroute.h)");
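
/*
 * Illustrative sketch (not part of this file): userland can walk the
 * table exported above with sysctlbyname(3); the OID returns an array
 * of struct mfc:
 *
 *	size_t len = 0;
 *
 *	if (sysctlbyname("net.inet.ip.mfctable", NULL, &len, NULL, 0) == 0) {
 *		struct mfc *tbl = malloc(len);
 *
 *		if (tbl != NULL &&
 *		    sysctlbyname("net.inet.ip.mfctable", tbl, &len,
 *		    NULL, 0) == 0) {
 *			for (size_t i = 0; i < len / sizeof(*tbl); i++)
 *				;	// inspect tbl[i].mfc_origin etc.
 *		}
 *		free(tbl);
 *	}
 */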

static int
sysctl_viflist(SYSCTL_HANDLER_ARGS)
{
	int error, i;

	if (req->newptr)
		return (EPERM);
	if (V_viftable == NULL)		/* XXX unlocked */
		return (0);
	error = sysctl_wire_old_buffer(req, MROUTE_VIF_SYSCTL_LEN * MAXVIFS);
	if (error)
		return (error);

	MRW_RLOCK();
	/* Copy out user-visible portion of vif entry. */
	for (i = 0; i < MAXVIFS; i++) {
		error = SYSCTL_OUT(req, &V_viftable[i], MROUTE_VIF_SYSCTL_LEN);
		if (error)
			break;
	}
	MRW_RUNLOCK();
	return (error);
}

SYSCTL_PROC(_net_inet_ip, OID_AUTO, viftable,
    CTLTYPE_OPAQUE | CTLFLAG_VNET | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_viflist, "S,vif[MAXVIFS]",
    "IPv4 Multicast Interfaces (struct vif[MAXVIFS], netinet/ip_mroute.h)");

static void
vnet_mroute_init(const void *unused __unused)
{

	V_nexpire = malloc(mfchashsize, M_MRTABLE, M_WAITOK|M_ZERO);

	V_viftable = mallocarray(MAXVIFS, sizeof(*V_viftable),
	    M_MRTABLE, M_WAITOK|M_ZERO);

	callout_init_rw(&V_expire_upcalls_ch, &mrouter_lock, 0);
	callout_init_rw(&V_bw_upcalls_ch, &mrouter_lock, 0);

	/* Prepare taskqueue */
	V_task_queue = taskqueue_create_fast("ip_mroute_tskq", M_NOWAIT,
	    taskqueue_thread_enqueue, &V_task_queue);
	taskqueue_start_threads(&V_task_queue, 1, PI_NET, "ip_mroute_tskq task");
}

VNET_SYSINIT(vnet_mroute_init, SI_SUB_PROTO_MC, SI_ORDER_ANY, vnet_mroute_init,
    NULL);

static void
vnet_mroute_uninit(const void *unused __unused)
{

	/* Taskqueue should be cancelled and drained before freeing */
	taskqueue_free(V_task_queue);

	free(V_viftable, M_MRTABLE);
	free(V_nexpire, M_MRTABLE);
	V_nexpire = NULL;
}

VNET_SYSUNINIT(vnet_mroute_uninit, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE,
    vnet_mroute_uninit, NULL);

static int
ip_mroute_modevent(module_t mod, int type, void *unused)
{

	switch (type) {
	case MOD_LOAD:
		MRW_TEARDOWN_LOCK_INIT();
		MRW_LOCK_INIT();

		if_detach_event_tag = EVENTHANDLER_REGISTER(ifnet_departure_event,
		    if_detached_event, NULL, EVENTHANDLER_PRI_ANY);
		if (if_detach_event_tag == NULL) {
			printf("ip_mroute: unable to register "
			    "ifnet_departure_event handler\n");
			MRW_LOCK_DESTROY();
			return (EINVAL);
		}

		if (!powerof2(mfchashsize)) {
			printf("WARNING: %s not a power of 2; using default\n",
			    "net.inet.ip.mfchashsize");
			mfchashsize = MFCHASHSIZE;
		}

		pim_encap_cookie = ip_encap_attach(&ipv4_encap_cfg, NULL, M_WAITOK);

		ip_mcast_src = X_ip_mcast_src;
		ip_mforward = X_ip_mforward;
		ip_mrouter_done = X_ip_mrouter_done;
		ip_mrouter_get = X_ip_mrouter_get;
		ip_mrouter_set = X_ip_mrouter_set;

		ip_rsvp_force_done = X_ip_rsvp_force_done;
		ip_rsvp_vif = X_ip_rsvp_vif;

		legal_vif_num = X_legal_vif_num;
		mrt_ioctl = X_mrt_ioctl;
		rsvp_input_p = X_rsvp_input;
		break;

	case MOD_UNLOAD:
		/*
		 * Typically module unload happens after the user-level
		 * process has shutdown the kernel services (the check
		 * below ensures someone can't just yank the module out
		 * from under a running process).  But if the module is
		 * just loaded and then unloaded w/o starting up a user
		 * process we still need to cleanup.
		 */
		MRW_WLOCK();
		if (ip_mrouter_cnt != 0) {
			MRW_WUNLOCK();
			return (EINVAL);
		}
		ip_mrouter_unloading = 1;
		MRW_WUNLOCK();

		EVENTHANDLER_DEREGISTER(ifnet_departure_event, if_detach_event_tag);

		if (pim_encap_cookie) {
			ip_encap_detach(pim_encap_cookie);
			pim_encap_cookie = NULL;
		}

		ip_mcast_src = NULL;
		ip_mforward = NULL;
		ip_mrouter_done = NULL;
		ip_mrouter_get = NULL;
		ip_mrouter_set = NULL;

		ip_rsvp_force_done = NULL;
		ip_rsvp_vif = NULL;

		legal_vif_num = NULL;
		mrt_ioctl = NULL;
		rsvp_input_p = NULL;

		MRW_LOCK_DESTROY();
		MRW_TEARDOWN_LOCK_DESTROY();
		break;

	default:
		return EOPNOTSUPP;
	}
	return 0;
}

static moduledata_t ip_mroutemod = {
	"ip_mroute",
	ip_mroute_modevent,
	0
};

DECLARE_MODULE(ip_mroute, ip_mroutemod, SI_SUB_PROTO_MC, SI_ORDER_MIDDLE);