xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision afab0816ecb604f0099a09ad8ee398f0d7b77b1c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Procedures for the kernel part of DVMRP,
28  * a Distance-Vector Multicast Routing Protocol.
29  * (See RFC-1075)
30  * Written by David Waitzman, BBN Labs, August 1988.
31  * Modified by Steve Deering, Stanford, February 1989.
32  * Modified by Mark J. Steiglitz, Stanford, May, 1991
33  * Modified by Van Jacobson, LBL, January 1993
34  * Modified by Ajit Thyagarajan, PARC, August 1993
35  * Modified by Bill Fenner, PARC, April 1995
36  *
37  * MROUTING 3.5
38  */
39 
40 /*
41  * TODO
42  * - function pointer field in vif, void *vif_sendit()
43  */
44 
45 #include <sys/types.h>
46 #include <sys/stream.h>
47 #include <sys/stropts.h>
48 #include <sys/strlog.h>
49 #include <sys/systm.h>
50 #include <sys/ddi.h>
51 #include <sys/cmn_err.h>
52 #include <sys/zone.h>
53 
54 #include <sys/param.h>
55 #include <sys/socket.h>
56 #include <sys/vtrace.h>
57 #include <sys/debug.h>
58 #include <net/if.h>
59 #include <sys/sockio.h>
60 #include <netinet/in.h>
61 #include <net/if_dl.h>
62 
63 #include <inet/ipsec_impl.h>
64 #include <inet/common.h>
65 #include <inet/mi.h>
66 #include <inet/nd.h>
67 #include <inet/tunables.h>
68 #include <inet/mib2.h>
69 #include <netinet/ip6.h>
70 #include <inet/ip.h>
71 #include <inet/snmpcom.h>
72 
73 #include <netinet/igmp.h>
74 #include <netinet/igmp_var.h>
75 #include <netinet/udp.h>
76 #include <netinet/ip_mroute.h>
77 #include <inet/ip_multi.h>
78 #include <inet/ip_ire.h>
79 #include <inet/ip_ndp.h>
80 #include <inet/ip_if.h>
81 #include <inet/ipclassifier.h>
82 
83 #include <netinet/pim.h>
84 
85 
86 /*
87  * MT Design:
88  *
89  * There are three main data structures viftable, mfctable and tbftable that
90  * need to be protected against MT races.
91  *
 * viftable is a fixed length array of vif structs. There is no lock to
 * protect the whole array, instead each struct is protected by its own
 * individual lock.
 * The value of v_marks in conjunction with the value of v_refcnt determines
 * the current state of a vif structure. One special state that needs mention
 * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
 * that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete comes
 * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
 * which prevents the struct from further use.  When the refcnt goes to zero
 * the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away a refhold is put on the ipif before using it. See
 * lock_good_vif() and unlock_good_vif().
105  *
106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
107  * of the vif struct.
108  *
109  * tbftable is also a fixed length array of tbf structs and is only accessed
110  * via v_tbf.  It is protected by its own lock tbf_lock.
111  *
 * Lock Ordering is
 * v_lock --> tbf_lock
 * v_lock --> ill_lock
115  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
 *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits all the mfc structs
 * marked condemned are freed.
127  *
128  * Locking Hierarchy:
129  * The bucket lock should be acquired before the mfc struct lock.
130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
131  * operations on the bucket struct.
132  *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
 *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
138  */
139 
/* Vif index used to mean "no route for src" (value taken from mrouted). */
#define	NO_VIF	MAXVIFS

/*
 * Timeouts:
 *	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 *	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE.
 */
#define	EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
#define	UPCALL_EXPIRE	6		/* number of timeouts */

/*
 * Hash function for a (source, group) entry: folds shifted copies of both
 * addresses together and reduces the result with MFCHASHMOD.
 */
#define	MFCHASH(a, g)							\
	MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^			\
	((g) >> 20) ^ ((g) >> 10) ^ (g))

#define	TBF_REPROCESS	(hz / 100)	/* 100x / second */

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff
169 
170 /* Function declarations */
171 static int	add_mfc(struct mfcctl *, ip_stack_t *);
172 static int	add_vif(struct vifctl *, conn_t *, ip_stack_t *);
173 static int	del_mfc(struct mfcctl *, ip_stack_t *);
174 static int	del_vif(vifi_t *, ip_stack_t *);
175 static void	del_vifp(struct vif *);
176 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
177 static void	expire_upcalls(void *);
178 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
179 static void	free_queue(struct mfc *);
180 static int	get_assert(uchar_t *, ip_stack_t *);
181 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
182 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
183 static int	get_version(uchar_t *);
184 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
185 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
186 		    ipaddr_t, struct mfc *);
187 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
188 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
189 static int	register_mforward(mblk_t *, ip_recv_attr_t *);
190 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
191 static int	set_assert(int *, ip_stack_t *);
192 
193 /*
194  * Token Bucket Filter functions
195  */
196 static int  priority(struct vif *, ipha_t *);
197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
198 static int  tbf_dq_sel(struct vif *, ipha_t *);
199 static void tbf_process_q(struct vif *);
200 static void tbf_queue(struct vif *, mblk_t *);
201 static void tbf_reprocess_q(void *);
202 static void tbf_send_packet(struct vif *, mblk_t *);
203 static void tbf_update_tokens(struct vif *);
204 static void release_mfc(struct mfcb *);
205 
206 static boolean_t is_mrouter_off(ip_stack_t *);
207 /*
208  * Encapsulation packets
209  */
210 
211 #define	ENCAP_TTL	64
212 
213 /* prototype IP hdr for encapsulated packets */
214 static ipha_t multicast_encap_iphdr = {
215 	IP_SIMPLE_HDR_VERSION,
216 	0,				/* tos */
217 	sizeof (ipha_t),		/* total length */
218 	0,				/* id */
219 	0,				/* frag offset */
220 	ENCAP_TTL, IPPROTO_ENCAP,
221 	0,				/* checksum */
222 };
223 
224 /*
225  * Rate limit for assert notification messages, in nsec.
226  */
227 #define	ASSERT_MSG_TIME		3000000000
228 
229 
/*
 * Take a reference on a vif; v_lock is acquired and dropped around the
 * increment.
 */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}

/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * If this was the last reference and the vif is condemned, del_vifp() is
 * called with v_lock still held; otherwise v_lock is simply released.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt == 0 &&				\
	    ((vifp)->v_marks & VIF_MARK_CONDEMNED) != 0) {	\
		del_vifp(vifp);					\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}

/*
 * Drop a reference on a vif: acquire v_lock, then behave exactly like
 * VIF_REFRELE_LOCKED().
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}

/*
 * Register a walker on an mfc hash bucket.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}

/*
 * Unregister a walker from an mfc hash bucket. When the last walker leaves
 * a bucket marked condemned, release_mfc() is called with mfcb_lock held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED) != 0) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
273 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Walks the chain hanging off bucket `mfcbp' and stores in `rt' the first
 * entry whose origin equals `o', whose group equals `g', whose mfc_rte is
 * NULL (i.e. entries with pending upcalls are skipped) and which is not
 * marked condemned. `rt' is set to NULL when nothing matches.
 * Type of service parameter to be added in the future!
 *
 * All arguments are parenthesized so that expressions (e.g. `addr & mask'
 * or `table + i') may be passed safely. `o' and `g' are evaluated once per
 * chain entry examined, so they must not have side effects.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL; \
	    _mb_rt = _mb_rt->mfc_next) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
	} \
}
295 
/*
 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 * are inefficient. We use gethrestime() which returns a timespec_t with
 * sec and nsec, the resolution is machine dependent.
 * The following 2 macros have been changed to use nsec instead of usec.
 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Delta should be a hrtime_t.
 *
 * TV_DELTA stores (a - b), in nanoseconds, into delta. Gaps of one or two
 * seconds are handled with additions only; for any other gap the seconds
 * are scaled using 64-bit arithmetic, since 1000000000 * xxs would overflow
 * a plain int as soon as the gap reaches three seconds.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}

/* True when time a is strictly earlier than time b. */
#define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
327 
328 /*
329  * Handle MRT setsockopt commands to modify the multicast routing tables.
330  */
331 int
332 ip_mrouter_set(int cmd, conn_t *connp, int checkonly, uchar_t *data,
333     int datalen)
334 {
335 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
336 
337 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
338 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
339 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
340 		return (EACCES);
341 	}
342 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
343 
344 	if (checkonly) {
345 		/*
346 		 * do not do operation, just pretend to - new T_CHECK
347 		 * Note: Even routines further on can probably fail but
348 		 * this T_CHECK stuff is only to please XTI so it not
349 		 * necessary to be perfect.
350 		 */
351 		switch (cmd) {
352 		case MRT_INIT:
353 		case MRT_DONE:
354 		case MRT_ADD_VIF:
355 		case MRT_DEL_VIF:
356 		case MRT_ADD_MFC:
357 		case MRT_DEL_MFC:
358 		case MRT_ASSERT:
359 			return (0);
360 		default:
361 			return (EOPNOTSUPP);
362 		}
363 	}
364 
365 	/*
366 	 * make sure no command is issued after multicast routing has been
367 	 * turned off.
368 	 */
369 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
370 		if (is_mrouter_off(ipst))
371 			return (EINVAL);
372 	}
373 
374 	switch (cmd) {
375 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
376 	case MRT_DONE:	return (ip_mrouter_done(ipst));
377 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp, ipst));
378 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, ipst));
379 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
380 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
381 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
382 	default:	   return (EOPNOTSUPP);
383 	}
384 }
385 
386 /*
387  * Handle MRT getsockopt commands
388  */
389 int
390 ip_mrouter_get(int cmd, conn_t *connp, uchar_t *data)
391 {
392 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
393 
394 	if (connp != ipst->ips_ip_g_mrouter)
395 		return (EACCES);
396 
397 	switch (cmd) {
398 	case MRT_VERSION:	return (get_version((uchar_t *)data));
399 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
400 	default:		return (EOPNOTSUPP);
401 	}
402 }
403 
404 /*
405  * Handle ioctl commands to obtain information from the cache.
406  * Called with shared access to IP. These are read_only ioctls.
407  */
408 /* ARGSUSED */
409 int
410 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
411     ip_ioctl_cmd_t *ipip, void *if_req)
412 {
413 	mblk_t	*mp1;
414 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
415 	conn_t		*connp = Q_TO_CONN(q);
416 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
417 
418 	/* Existence verified in ip_wput_nondata */
419 	mp1 = mp->b_cont->b_cont;
420 
421 	switch (iocp->ioc_cmd) {
422 	case (SIOCGETVIFCNT):
423 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
424 	case (SIOCGETSGCNT):
425 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
426 	case (SIOCGETLSGCNT):
427 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
428 	default:
429 		return (EINVAL);
430 	}
431 }
432 
433 /*
434  * Returns the packet, byte, rpf-failure count for the source, group provided.
435  */
436 static int
437 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
438 {
439 	struct mfc *rt;
440 	struct mfcb *mfcbp;
441 
442 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
443 	MFCB_REFHOLD(mfcbp);
444 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
445 
446 	if (rt != NULL) {
447 		mutex_enter(&rt->mfc_mutex);
448 		req->pktcnt   = rt->mfc_pkt_cnt;
449 		req->bytecnt  = rt->mfc_byte_cnt;
450 		req->wrong_if = rt->mfc_wrong_if;
451 		mutex_exit(&rt->mfc_mutex);
452 	} else
453 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
454 
455 	MFCB_REFRELE(mfcbp);
456 	return (0);
457 }
458 
459 /*
460  * Returns the packet, byte, rpf-failure count for the source, group provided.
461  * Uses larger counters and IPv6 addresses.
462  */
463 /* ARGSUSED XXX until implemented */
464 static int
465 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
466 {
467 	/* XXX TODO SIOCGETLSGCNT */
468 	return (ENXIO);
469 }
470 
471 /*
472  * Returns the input and output packet and byte counts on the vif provided.
473  */
474 static int
475 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
476 {
477 	vifi_t vifi = req->vifi;
478 
479 	if (vifi >= ipst->ips_numvifs)
480 		return (EINVAL);
481 
482 	/*
483 	 * No locks here, an approximation is fine.
484 	 */
485 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
486 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
487 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
488 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
489 
490 	return (0);
491 }
492 
493 static int
494 get_version(uchar_t *data)
495 {
496 	int *v = (int *)data;
497 
498 	*v = 0x0305;	/* XXX !!!! */
499 
500 	return (0);
501 }
502 
503 /*
504  * Set PIM assert processing global.
505  */
506 static int
507 set_assert(int *i, ip_stack_t *ipst)
508 {
509 	if ((*i != 1) && (*i != 0))
510 		return (EINVAL);
511 
512 	ipst->ips_pim_assert = *i;
513 
514 	return (0);
515 }
516 
517 /*
518  * Get PIM assert processing global.
519  */
520 static int
521 get_assert(uchar_t *data, ip_stack_t *ipst)
522 {
523 	int *i = (int *)data;
524 
525 	*i = ipst->ips_pim_assert;
526 
527 	return (0);
528 }
529 
530 /*
531  * Enable multicast routing.
532  */
533 static int
534 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
535 {
536 	int	*v;
537 
538 	if (data == NULL || (datalen != sizeof (int)))
539 		return (ENOPROTOOPT);
540 
541 	v = (int *)data;
542 	if (*v != 1)
543 		return (ENOPROTOOPT);
544 
545 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
546 	if (ipst->ips_ip_g_mrouter != NULL) {
547 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
548 		return (EADDRINUSE);
549 	}
550 
551 	/*
552 	 * MRT_INIT should only be allowed for RAW sockets, but we double
553 	 * check.
554 	 */
555 	if (!IPCL_IS_RAWIP(connp)) {
556 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
557 		return (EINVAL);
558 	}
559 
560 	ipst->ips_ip_g_mrouter = connp;
561 	connp->conn_multi_router = 1;
562 	/* In order for tunnels to work we have to turn ip_g_forward on */
563 	if (!WE_ARE_FORWARDING(ipst)) {
564 		if (ipst->ips_ip_mrtdebug > 1) {
565 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
566 			    "ip_mrouter_init: turning on forwarding");
567 		}
568 		ipst->ips_saved_ip_forwarding = ipst->ips_ip_forwarding;
569 		ipst->ips_ip_forwarding = IP_FORWARD_ALWAYS;
570 	}
571 
572 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
573 	return (0);
574 }
575 
576 void
577 ip_mrouter_stack_init(ip_stack_t *ipst)
578 {
579 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
580 
581 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
582 	    KM_SLEEP);
583 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
584 	/*
585 	 * mfctable:
586 	 * Includes all mfcs, including waiting upcalls.
587 	 * Multiple mfcs per bucket.
588 	 */
589 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
590 	    KM_SLEEP);
591 	/*
592 	 * Define the token bucket filter structures.
593 	 * tbftable -> each vif has one of these for storing info.
594 	 */
595 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
596 
597 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
598 
599 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
600 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
601 }
602 
603 /*
604  * Disable multicast routing.
605  * Didn't use global timeout_val (BSD version), instead check the mfctable.
606  */
607 int
608 ip_mrouter_done(ip_stack_t *ipst)
609 {
610 	conn_t		*mrouter;
611 	vifi_t 		vifi;
612 	struct mfc	*mfc_rt;
613 	int		i;
614 
615 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
616 	if (ipst->ips_ip_g_mrouter == NULL) {
617 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
618 		return (EINVAL);
619 	}
620 
621 	mrouter = ipst->ips_ip_g_mrouter;
622 
623 	if (ipst->ips_saved_ip_forwarding != -1) {
624 		if (ipst->ips_ip_mrtdebug > 1) {
625 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
626 			    "ip_mrouter_done: turning off forwarding");
627 		}
628 		ipst->ips_ip_forwarding = ipst->ips_saved_ip_forwarding;
629 		ipst->ips_saved_ip_forwarding = -1;
630 	}
631 
632 	/*
633 	 * Always clear cache when vifs change.
634 	 * No need to get ipst->ips_last_encap_lock since we are running as
635 	 * a writer.
636 	 */
637 	mutex_enter(&ipst->ips_last_encap_lock);
638 	ipst->ips_last_encap_src = 0;
639 	ipst->ips_last_encap_vif = NULL;
640 	mutex_exit(&ipst->ips_last_encap_lock);
641 	mrouter->conn_multi_router = 0;
642 
643 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
644 
645 	/*
646 	 * For each phyint in use,
647 	 * disable promiscuous reception of all IP multicasts.
648 	 */
649 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
650 		struct vif *vifp = ipst->ips_vifs + vifi;
651 
652 		mutex_enter(&vifp->v_lock);
653 		/*
654 		 * if the vif is active mark it condemned.
655 		 */
656 		if (vifp->v_marks & VIF_MARK_GOOD) {
657 			ASSERT(vifp->v_ipif != NULL);
658 			ipif_refhold(vifp->v_ipif);
659 			/* Phyint only */
660 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
661 				ipif_t *ipif = vifp->v_ipif;
662 				ilm_t *ilm = vifp->v_ilm;
663 
664 				vifp->v_ilm = NULL;
665 				vifp->v_marks &= ~VIF_MARK_GOOD;
666 				vifp->v_marks |= VIF_MARK_CONDEMNED;
667 
668 				mutex_exit(&(vifp)->v_lock);
669 				if (ilm != NULL) {
670 					ill_t *ill = ipif->ipif_ill;
671 
672 					(void) ip_delmulti(ilm);
673 					ASSERT(ill->ill_mrouter_cnt > 0);
674 					atomic_dec_32(&ill->ill_mrouter_cnt);
675 				}
676 				mutex_enter(&vifp->v_lock);
677 			}
678 			ipif_refrele(vifp->v_ipif);
679 			/*
680 			 * decreases the refcnt added in add_vif.
681 			 * and release v_lock.
682 			 */
683 			VIF_REFRELE_LOCKED(vifp);
684 		} else {
685 			mutex_exit(&vifp->v_lock);
686 			continue;
687 		}
688 	}
689 
690 	mutex_enter(&ipst->ips_numvifs_mutex);
691 	ipst->ips_numvifs = 0;
692 	ipst->ips_pim_assert = 0;
693 	ipst->ips_reg_vif_num = ALL_VIFS;
694 	mutex_exit(&ipst->ips_numvifs_mutex);
695 
696 	/*
697 	 * Free upcall msgs.
698 	 * Go through mfctable and stop any outstanding upcall
699 	 * timeouts remaining on mfcs.
700 	 */
701 	for (i = 0; i < MFCTBLSIZ; i++) {
702 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
703 		ipst->ips_mfcs[i].mfcb_refcnt++;
704 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
705 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
706 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
707 		while (mfc_rt) {
708 			/* Free upcalls */
709 			mutex_enter(&mfc_rt->mfc_mutex);
710 			if (mfc_rt->mfc_rte != NULL) {
711 				if (mfc_rt->mfc_timeout_id != 0) {
712 					/*
713 					 * OK to drop the lock as we have
714 					 * a refcnt on the bucket. timeout
715 					 * can fire but it will see that
716 					 * mfc_timeout_id == 0 and not do
717 					 * anything. see expire_upcalls().
718 					 */
719 					mfc_rt->mfc_timeout_id = 0;
720 					mutex_exit(&mfc_rt->mfc_mutex);
721 					(void) untimeout(
722 					    mfc_rt->mfc_timeout_id);
723 						mfc_rt->mfc_timeout_id = 0;
724 					mutex_enter(&mfc_rt->mfc_mutex);
725 
726 					/*
727 					 * all queued upcall packets
728 					 * and mblk will be freed in
729 					 * release_mfc().
730 					 */
731 				}
732 			}
733 
734 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
735 
736 			mutex_exit(&mfc_rt->mfc_mutex);
737 			mfc_rt = mfc_rt->mfc_next;
738 		}
739 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
740 	}
741 
742 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
743 	ipst->ips_ip_g_mrouter = NULL;
744 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
745 	return (0);
746 }
747 
748 void
749 ip_mrouter_stack_destroy(ip_stack_t *ipst)
750 {
751 	struct mfcb *mfcbp;
752 	struct mfc  *rt;
753 	int i;
754 
755 	for (i = 0; i < MFCTBLSIZ; i++) {
756 		mfcbp = &ipst->ips_mfcs[i];
757 
758 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
759 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
760 			    i);
761 
762 			mfcbp->mfcb_mfc = rt->mfc_next;
763 			free_queue(rt);
764 			mi_free(rt);
765 		}
766 	}
767 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
768 	ipst->ips_vifs = NULL;
769 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
770 	ipst->ips_mrtstat = NULL;
771 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
772 	ipst->ips_mfcs = NULL;
773 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
774 	ipst->ips_tbfs = NULL;
775 
776 	mutex_destroy(&ipst->ips_last_encap_lock);
777 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
778 }
779 
780 static boolean_t
781 is_mrouter_off(ip_stack_t *ipst)
782 {
783 	conn_t	*mrouter;
784 
785 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
786 	if (ipst->ips_ip_g_mrouter == NULL) {
787 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
788 		return (B_TRUE);
789 	}
790 
791 	mrouter = ipst->ips_ip_g_mrouter;
792 	if (mrouter->conn_multi_router == 0) {
793 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
794 		return (B_TRUE);
795 	}
796 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
797 	return (B_FALSE);
798 }
799 
800 static void
801 unlock_good_vif(struct vif *vifp)
802 {
803 	ASSERT(vifp->v_ipif != NULL);
804 	ipif_refrele(vifp->v_ipif);
805 	VIF_REFRELE(vifp);
806 }
807 
808 static boolean_t
809 lock_good_vif(struct vif *vifp)
810 {
811 	mutex_enter(&vifp->v_lock);
812 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
813 		mutex_exit(&vifp->v_lock);
814 		return (B_FALSE);
815 	}
816 
817 	ASSERT(vifp->v_ipif != NULL);
818 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
819 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
820 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
821 		mutex_exit(&vifp->v_lock);
822 		return (B_FALSE);
823 	}
824 	ipif_refhold_locked(vifp->v_ipif);
825 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
826 	vifp->v_refcnt++;
827 	mutex_exit(&vifp->v_lock);
828 	return (B_TRUE);
829 }
830 
831 /*
832  * Add a vif to the vif table.
833  */
834 static int
835 add_vif(struct vifctl *vifcp, conn_t *connp, ip_stack_t *ipst)
836 {
837 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
838 	ipif_t		*ipif;
839 	int		error = 0;
840 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
841 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
842 	ilm_t		*ilm;
843 	ill_t		*ill;
844 
845 	ASSERT(connp != NULL);
846 
847 	if (vifcp->vifc_vifi >= MAXVIFS)
848 		return (EINVAL);
849 
850 	if (is_mrouter_off(ipst))
851 		return (EINVAL);
852 
853 	mutex_enter(&vifp->v_lock);
854 	/*
855 	 * Viftable entry should be 0.
856 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
857 	 * initialized.
858 	 *
859 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
860 	 * request while the delete is in progress, mrouted only sends add
861 	 * requests when a new interface is added and the new interface cannot
862 	 * have the same vifi as an existing interface. We make sure that
863 	 * ill_delete will block till the vif is deleted by adding a refcnt
864 	 * to ipif in del_vif().
865 	 */
866 	if (vifp->v_lcl_addr.s_addr != 0 ||
867 	    vifp->v_marks != 0 ||
868 	    vifp->v_refcnt != 0) {
869 		mutex_exit(&vifp->v_lock);
870 		return (EADDRINUSE);
871 	}
872 
873 	/* Incoming vif should not be 0 */
874 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
875 		mutex_exit(&vifp->v_lock);
876 		return (EINVAL);
877 	}
878 
879 	vifp->v_refcnt++;
880 	mutex_exit(&vifp->v_lock);
881 	/* Find the interface with the local address */
882 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
883 	    IPCL_ZONEID(connp), ipst);
884 	if (ipif == NULL) {
885 		VIF_REFRELE(vifp);
886 		return (EADDRNOTAVAIL);
887 	}
888 
889 	if (ipst->ips_ip_mrtdebug > 1) {
890 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
891 		    "add_vif: src 0x%x enter",
892 		    vifcp->vifc_lcl_addr.s_addr);
893 	}
894 
895 	mutex_enter(&vifp->v_lock);
896 	/*
897 	 * Always clear cache when vifs change.
898 	 * Needed to ensure that src isn't left over from before vif was added.
899 	 * No need to get last_encap_lock, since we are running as a writer.
900 	 */
901 
902 	mutex_enter(&ipst->ips_last_encap_lock);
903 	ipst->ips_last_encap_src = 0;
904 	ipst->ips_last_encap_vif = NULL;
905 	mutex_exit(&ipst->ips_last_encap_lock);
906 
907 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
908 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
909 			cmn_err(CE_WARN,
910 			    "add_vif: source route tunnels not supported\n");
911 			VIF_REFRELE_LOCKED(vifp);
912 			ipif_refrele(ipif);
913 			return (EOPNOTSUPP);
914 		}
915 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
916 
917 	} else {
918 		/* Phyint or Register vif */
919 		if (vifcp->vifc_flags & VIFF_REGISTER) {
920 			/*
921 			 * Note: Since all IPPROTO_IP level options (including
922 			 * MRT_ADD_VIF) are done exclusively via
923 			 * ip_optmgmt_writer(), a lock is not necessary to
924 			 * protect reg_vif_num.
925 			 */
926 			mutex_enter(&ipst->ips_numvifs_mutex);
927 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
928 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
929 				mutex_exit(&ipst->ips_numvifs_mutex);
930 			} else {
931 				mutex_exit(&ipst->ips_numvifs_mutex);
932 				VIF_REFRELE_LOCKED(vifp);
933 				ipif_refrele(ipif);
934 				return (EADDRINUSE);
935 			}
936 		}
937 
938 		/* Make sure the interface supports multicast */
939 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
940 			VIF_REFRELE_LOCKED(vifp);
941 			ipif_refrele(ipif);
942 			if (vifcp->vifc_flags & VIFF_REGISTER) {
943 				mutex_enter(&ipst->ips_numvifs_mutex);
944 				ipst->ips_reg_vif_num = ALL_VIFS;
945 				mutex_exit(&ipst->ips_numvifs_mutex);
946 			}
947 			return (EOPNOTSUPP);
948 		}
949 		/* Enable promiscuous reception of all IP mcasts from the if */
950 		mutex_exit(&vifp->v_lock);
951 
952 		ill = ipif->ipif_ill;
953 		if (IS_UNDER_IPMP(ill))
954 			ill = ipmp_ill_hold_ipmp_ill(ill);
955 
956 		if (ill == NULL) {
957 			ilm = NULL;
958 		} else {
959 			ilm = ip_addmulti(&ipv6_all_zeros, ill,
960 			    ipif->ipif_zoneid, &error);
961 			if (ilm != NULL)
962 				atomic_inc_32(&ill->ill_mrouter_cnt);
963 			if (IS_UNDER_IPMP(ipif->ipif_ill)) {
964 				ill_refrele(ill);
965 				ill = ipif->ipif_ill;
966 			}
967 		}
968 
969 		mutex_enter(&vifp->v_lock);
970 		/*
971 		 * since we released the lock lets make sure that
972 		 * ip_mrouter_done() has not been called.
973 		 */
974 		if (ilm == NULL || is_mrouter_off(ipst)) {
975 			if (ilm != NULL) {
976 				(void) ip_delmulti(ilm);
977 				ASSERT(ill->ill_mrouter_cnt > 0);
978 				atomic_dec_32(&ill->ill_mrouter_cnt);
979 			}
980 			if (vifcp->vifc_flags & VIFF_REGISTER) {
981 				mutex_enter(&ipst->ips_numvifs_mutex);
982 				ipst->ips_reg_vif_num = ALL_VIFS;
983 				mutex_exit(&ipst->ips_numvifs_mutex);
984 			}
985 			VIF_REFRELE_LOCKED(vifp);
986 			ipif_refrele(ipif);
987 			return (error?error:EINVAL);
988 		}
989 		vifp->v_ilm = ilm;
990 	}
991 	/* Define parameters for the tbf structure */
992 	vifp->v_tbf = v_tbf;
993 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
994 	vifp->v_tbf->tbf_n_tok = 0;
995 	vifp->v_tbf->tbf_q_len = 0;
996 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
997 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
998 
999 	vifp->v_flags = vifcp->vifc_flags;
1000 	vifp->v_threshold = vifcp->vifc_threshold;
1001 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1002 	vifp->v_ipif = ipif;
1003 	ipif_refrele(ipif);
1004 	/* Scaling up here, allows division by 1024 in critical code.	*/
1005 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1006 	vifp->v_timeout_id = 0;
1007 	/* initialize per vif pkt counters */
1008 	vifp->v_pkt_in = 0;
1009 	vifp->v_pkt_out = 0;
1010 	vifp->v_bytes_in = 0;
1011 	vifp->v_bytes_out = 0;
1012 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1013 
1014 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1015 	mutex_enter(&ipst->ips_numvifs_mutex);
1016 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1017 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1018 	mutex_exit(&ipst->ips_numvifs_mutex);
1019 
1020 	if (ipst->ips_ip_mrtdebug > 1) {
1021 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1022 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1023 		    vifcp->vifc_vifi,
1024 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1025 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1026 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1027 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1028 	}
1029 
1030 	vifp->v_marks = VIF_MARK_GOOD;
1031 	mutex_exit(&vifp->v_lock);
1032 	return (0);
1033 }
1034 
1035 
1036 /* Delete a vif from the vif table. */
1037 static void
1038 del_vifp(struct vif *vifp)
1039 {
1040 	struct tbf	*t = vifp->v_tbf;
1041 	mblk_t  *mp0;
1042 	vifi_t  vifi;
1043 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1044 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1045 
1046 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1047 	ASSERT(t != NULL);
1048 
1049 	if (ipst->ips_ip_mrtdebug > 1) {
1050 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1051 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1052 	}
1053 
1054 	if (vifp->v_timeout_id != 0) {
1055 		(void) untimeout(vifp->v_timeout_id);
1056 		vifp->v_timeout_id = 0;
1057 	}
1058 
1059 	/*
1060 	 * Free packets queued at the interface.
1061 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1062 	 */
1063 	mutex_enter(&t->tbf_lock);
1064 	while (t->tbf_q != NULL) {
1065 		mp0 = t->tbf_q;
1066 		t->tbf_q = t->tbf_q->b_next;
1067 		mp0->b_prev = mp0->b_next = NULL;
1068 		freemsg(mp0);
1069 	}
1070 	mutex_exit(&t->tbf_lock);
1071 
1072 	/*
1073 	 * Always clear cache when vifs change.
1074 	 * No need to get last_encap_lock since we are running as a writer.
1075 	 */
1076 	mutex_enter(&ipst->ips_last_encap_lock);
1077 	if (vifp == ipst->ips_last_encap_vif) {
1078 		ipst->ips_last_encap_vif = NULL;
1079 		ipst->ips_last_encap_src = 0;
1080 	}
1081 	mutex_exit(&ipst->ips_last_encap_lock);
1082 
1083 	mutex_destroy(&t->tbf_lock);
1084 
1085 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1086 
1087 	/* Adjust numvifs down */
1088 	mutex_enter(&ipst->ips_numvifs_mutex);
1089 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1090 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1091 			break;
1092 	ipst->ips_numvifs = vifi;
1093 	mutex_exit(&ipst->ips_numvifs_mutex);
1094 
1095 	bzero(vifp, sizeof (*vifp));
1096 }
1097 
1098 static int
1099 del_vif(vifi_t *vifip, ip_stack_t *ipst)
1100 {
1101 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1102 
1103 	if (*vifip >= ipst->ips_numvifs)
1104 		return (EINVAL);
1105 
1106 	mutex_enter(&vifp->v_lock);
1107 	/*
1108 	 * Not initialized
1109 	 * Here we are not looking at the vif that is being initialized
1110 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1111 	 */
1112 	if (vifp->v_lcl_addr.s_addr == 0 ||
1113 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1114 		mutex_exit(&vifp->v_lock);
1115 		return (EADDRNOTAVAIL);
1116 	}
1117 
1118 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1119 	vifp->v_marks &= ~VIF_MARK_GOOD;
1120 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1121 
1122 	/* Phyint only */
1123 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1124 		ipif_t *ipif = vifp->v_ipif;
1125 		ilm_t *ilm = vifp->v_ilm;
1126 
1127 		vifp->v_ilm = NULL;
1128 
1129 		ASSERT(ipif != NULL);
1130 		/*
1131 		 * should be OK to drop the lock as we
1132 		 * have marked this as CONDEMNED.
1133 		 */
1134 		mutex_exit(&(vifp)->v_lock);
1135 		if (ilm != NULL) {
1136 			(void) ip_delmulti(ilm);
1137 			ASSERT(ipif->ipif_ill->ill_mrouter_cnt > 0);
1138 			atomic_dec_32(&ipif->ipif_ill->ill_mrouter_cnt);
1139 		}
1140 		mutex_enter(&(vifp)->v_lock);
1141 	}
1142 
1143 	if (vifp->v_flags & VIFF_REGISTER) {
1144 		mutex_enter(&ipst->ips_numvifs_mutex);
1145 		ipst->ips_reg_vif_num = ALL_VIFS;
1146 		mutex_exit(&ipst->ips_numvifs_mutex);
1147 	}
1148 
1149 	/*
1150 	 * decreases the refcnt added in add_vif.
1151 	 */
1152 	VIF_REFRELE_LOCKED(vifp);
1153 	return (0);
1154 }
1155 
1156 /*
1157  * Add an mfc entry.
1158  */
1159 static int
1160 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1161 {
1162 	struct mfc *rt;
1163 	struct rtdetq *rte;
1164 	ushort_t nstl;
1165 	int i;
1166 	struct mfcb *mfcbp;
1167 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1168 
1169 	/*
1170 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1171 	 * did not have a real route for pkt.
1172 	 * We want this pkt without rt installed in the mfctable to prevent
1173 	 * multiiple tries, so go ahead and put it in mfctable, it will
1174 	 * be discarded later in ip_mdq() because the child is NULL.
1175 	 */
1176 
1177 	/* Error checking, out of bounds? */
1178 	if (mfccp->mfcc_parent > MAXVIFS) {
1179 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1180 		    (int)mfccp->mfcc_parent));
1181 		return (EINVAL);
1182 	}
1183 
1184 	if ((mfccp->mfcc_parent != NO_VIF) &&
1185 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1186 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1187 		    (int)mfccp->mfcc_parent));
1188 		return (EINVAL);
1189 	}
1190 
1191 	if (is_mrouter_off(ipst)) {
1192 		return (EINVAL);
1193 	}
1194 
1195 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1196 	    mfccp->mfcc_mcastgrp.s_addr)];
1197 	MFCB_REFHOLD(mfcbp);
1198 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1199 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1200 
1201 	/* If an entry already exists, just update the fields */
1202 	if (rt) {
1203 		if (ipst->ips_ip_mrtdebug > 1) {
1204 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1205 			    "add_mfc: update o %x grp %x parent %x",
1206 			    ntohl(mfccp->mfcc_origin.s_addr),
1207 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1208 			    mfccp->mfcc_parent);
1209 		}
1210 		mutex_enter(&rt->mfc_mutex);
1211 		rt->mfc_parent = mfccp->mfcc_parent;
1212 
1213 		mutex_enter(&ipst->ips_numvifs_mutex);
1214 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
1215 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1216 		mutex_exit(&ipst->ips_numvifs_mutex);
1217 		mutex_exit(&rt->mfc_mutex);
1218 
1219 		MFCB_REFRELE(mfcbp);
1220 		return (0);
1221 	}
1222 
1223 	/*
1224 	 * Find the entry for which the upcall was made and update.
1225 	 */
1226 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1227 		mutex_enter(&rt->mfc_mutex);
1228 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1229 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1230 		    (rt->mfc_rte != NULL) &&
1231 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1232 			if (nstl++ != 0)
1233 				cmn_err(CE_WARN,
1234 				    "add_mfc: %s o %x g %x p %x",
1235 				    "multiple kernel entries",
1236 				    ntohl(mfccp->mfcc_origin.s_addr),
1237 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1238 				    mfccp->mfcc_parent);
1239 
1240 			if (ipst->ips_ip_mrtdebug > 1) {
1241 				(void) mi_strlog(mrouter->conn_rq, 1,
1242 				    SL_TRACE,
1243 				    "add_mfc: o %x g %x p %x",
1244 				    ntohl(mfccp->mfcc_origin.s_addr),
1245 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1246 				    mfccp->mfcc_parent);
1247 			}
1248 			fill_route(rt, mfccp, ipst);
1249 
1250 			/*
1251 			 * Prevent cleanup of cache entry.
1252 			 * Timer starts in ip_mforward.
1253 			 */
1254 			if (rt->mfc_timeout_id != 0) {
1255 				timeout_id_t id;
1256 				id = rt->mfc_timeout_id;
1257 				/*
1258 				 * setting id to zero will avoid this
1259 				 * entry from being cleaned up in
1260 				 * expire_up_calls().
1261 				 */
1262 				rt->mfc_timeout_id = 0;
1263 				/*
1264 				 * dropping the lock is fine as we
1265 				 * have a refhold on the bucket.
1266 				 * so mfc cannot be freed.
1267 				 * The timeout can fire but it will see
1268 				 * that mfc_timeout_id == 0 and not cleanup.
1269 				 */
1270 				mutex_exit(&rt->mfc_mutex);
1271 				(void) untimeout(id);
1272 				mutex_enter(&rt->mfc_mutex);
1273 			}
1274 
1275 			/*
1276 			 * Send all pkts that are queued waiting for the upcall.
1277 			 * ip_mdq param tun set to 0 -
1278 			 * the return value of ip_mdq() isn't used here,
1279 			 * so value we send doesn't matter.
1280 			 */
1281 			while (rt->mfc_rte != NULL) {
1282 				rte = rt->mfc_rte;
1283 				rt->mfc_rte = rte->rte_next;
1284 				mutex_exit(&rt->mfc_mutex);
1285 				(void) ip_mdq(rte->mp, (ipha_t *)
1286 				    rte->mp->b_rptr, rte->ill, 0, rt);
1287 				freemsg(rte->mp);
1288 				mi_free((char *)rte);
1289 				mutex_enter(&rt->mfc_mutex);
1290 			}
1291 		}
1292 		mutex_exit(&rt->mfc_mutex);
1293 	}
1294 
1295 
1296 	/*
1297 	 * It is possible that an entry is being inserted without an upcall
1298 	 */
1299 	if (nstl == 0) {
1300 		mutex_enter(&(mfcbp->mfcb_lock));
1301 		if (ipst->ips_ip_mrtdebug > 1) {
1302 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1303 			    "add_mfc: no upcall o %x g %x p %x",
1304 			    ntohl(mfccp->mfcc_origin.s_addr),
1305 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1306 			    mfccp->mfcc_parent);
1307 		}
1308 		if (is_mrouter_off(ipst)) {
1309 			mutex_exit(&mfcbp->mfcb_lock);
1310 			MFCB_REFRELE(mfcbp);
1311 			return (EINVAL);
1312 		}
1313 
1314 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1315 
1316 			mutex_enter(&rt->mfc_mutex);
1317 			if ((rt->mfc_origin.s_addr ==
1318 			    mfccp->mfcc_origin.s_addr) &&
1319 			    (rt->mfc_mcastgrp.s_addr ==
1320 			    mfccp->mfcc_mcastgrp.s_addr) &&
1321 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1322 				fill_route(rt, mfccp, ipst);
1323 				mutex_exit(&rt->mfc_mutex);
1324 				break;
1325 			}
1326 			mutex_exit(&rt->mfc_mutex);
1327 		}
1328 
1329 		/* No upcall, so make a new entry into mfctable */
1330 		if (rt == NULL) {
1331 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1332 			if (rt == NULL) {
1333 				ip1dbg(("add_mfc: out of memory\n"));
1334 				mutex_exit(&mfcbp->mfcb_lock);
1335 				MFCB_REFRELE(mfcbp);
1336 				return (ENOBUFS);
1337 			}
1338 
1339 			/* Insert new entry at head of hash chain */
1340 			mutex_enter(&rt->mfc_mutex);
1341 			fill_route(rt, mfccp, ipst);
1342 
1343 			/* Link into table */
1344 			rt->mfc_next   = mfcbp->mfcb_mfc;
1345 			mfcbp->mfcb_mfc = rt;
1346 			mutex_exit(&rt->mfc_mutex);
1347 		}
1348 		mutex_exit(&mfcbp->mfcb_lock);
1349 	}
1350 
1351 	MFCB_REFRELE(mfcbp);
1352 	return (0);
1353 }
1354 
1355 /*
1356  * Fills in mfc structure from mrouted mfcctl.
1357  */
1358 static void
1359 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1360 {
1361 	int i;
1362 
1363 	rt->mfc_origin		= mfccp->mfcc_origin;
1364 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1365 	rt->mfc_parent		= mfccp->mfcc_parent;
1366 	mutex_enter(&ipst->ips_numvifs_mutex);
1367 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1368 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1369 	}
1370 	mutex_exit(&ipst->ips_numvifs_mutex);
1371 	/* Initialize pkt counters per src-grp */
1372 	rt->mfc_pkt_cnt	= 0;
1373 	rt->mfc_byte_cnt	= 0;
1374 	rt->mfc_wrong_if	= 0;
1375 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1376 
1377 }
1378 
1379 static void
1380 free_queue(struct mfc *mfcp)
1381 {
1382 	struct rtdetq *rte0;
1383 
1384 	/*
1385 	 * Drop all queued upcall packets.
1386 	 * Free the mbuf with the pkt.
1387 	 */
1388 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1389 		mfcp->mfc_rte = rte0->rte_next;
1390 		freemsg(rte0->mp);
1391 		mi_free((char *)rte0);
1392 	}
1393 }
1394 /*
1395  * go thorugh the hash bucket and free all the entries marked condemned.
1396  */
1397 void
1398 release_mfc(struct mfcb *mfcbp)
1399 {
1400 	struct mfc *current_mfcp;
1401 	struct mfc *prev_mfcp;
1402 
1403 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1404 
1405 	while (current_mfcp != NULL) {
1406 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1407 			if (current_mfcp == mfcbp->mfcb_mfc) {
1408 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1409 				free_queue(current_mfcp);
1410 				mi_free(current_mfcp);
1411 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1412 				continue;
1413 			}
1414 			ASSERT(prev_mfcp != NULL);
1415 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1416 			free_queue(current_mfcp);
1417 			mi_free(current_mfcp);
1418 			current_mfcp = NULL;
1419 		} else {
1420 			prev_mfcp = current_mfcp;
1421 		}
1422 
1423 		current_mfcp = prev_mfcp->mfc_next;
1424 
1425 	}
1426 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1427 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1428 }
1429 
1430 /*
1431  * Delete an mfc entry.
1432  */
1433 static int
1434 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1435 {
1436 	struct in_addr	origin;
1437 	struct in_addr	mcastgrp;
1438 	struct mfc 	*rt;
1439 	uint_t		hash;
1440 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1441 
1442 	origin = mfccp->mfcc_origin;
1443 	mcastgrp = mfccp->mfcc_mcastgrp;
1444 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1445 
1446 	if (ipst->ips_ip_mrtdebug > 1) {
1447 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1448 		    "del_mfc: o %x g %x",
1449 		    ntohl(origin.s_addr),
1450 		    ntohl(mcastgrp.s_addr));
1451 	}
1452 
1453 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1454 
1455 	/* Find mfc in mfctable, finds only entries without upcalls */
1456 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1457 		mutex_enter(&rt->mfc_mutex);
1458 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1459 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1460 		    rt->mfc_rte == NULL &&
1461 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1462 			break;
1463 		mutex_exit(&rt->mfc_mutex);
1464 	}
1465 
1466 	/*
1467 	 * Return if there was an upcall (mfc_rte != NULL,
1468 	 * or rt not in mfctable.
1469 	 */
1470 	if (rt == NULL) {
1471 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1472 		return (EADDRNOTAVAIL);
1473 	}
1474 
1475 
1476 	/*
1477 	 * no need to hold lock as we have a reference.
1478 	 */
1479 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1480 	/* error checking */
1481 	if (rt->mfc_timeout_id != 0) {
1482 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1483 		/*
1484 		 * Its ok to drop the lock,  the struct cannot be freed
1485 		 * since we have a ref on the hash bucket.
1486 		 */
1487 		rt->mfc_timeout_id = 0;
1488 		mutex_exit(&rt->mfc_mutex);
1489 		(void) untimeout(rt->mfc_timeout_id);
1490 		mutex_enter(&rt->mfc_mutex);
1491 	}
1492 
1493 	ASSERT(rt->mfc_rte == NULL);
1494 
1495 
1496 	/*
1497 	 * Delete the entry from the cache
1498 	 */
1499 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1500 	mutex_exit(&rt->mfc_mutex);
1501 
1502 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1503 
1504 	return (0);
1505 }
1506 
1507 #define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1508 
1509 /*
1510  * IP multicast forwarding function. This function assumes that the packet
1511  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1512  * pointed to by "ill", and the packet is to be relayed to other networks
1513  * that have members of the packet's destination IP multicast group.
1514  *
1515  * The packet is returned unscathed to the caller, unless it is
1516  * erroneous, in which case a -1 value tells the caller (IP)
1517  * to discard it.
1518  *
1519  * Unlike BSD, SunOS 5.x needs to return to IP info about
1520  * whether pkt came in thru a tunnel, so it can be discarded, unless
1521  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1522  * to be delivered.
1523  * Return values are 0 - pkt is okay and phyint
1524  *		    -1 - pkt is malformed and to be tossed
1525  *                   1 - pkt came in on tunnel
1526  */
1527 int
1528 ip_mforward(mblk_t *mp, ip_recv_attr_t *ira)
1529 {
1530 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
1531 	ill_t		*ill = ira->ira_ill;
1532 	struct mfc 	*rt;
1533 	ipaddr_t	src, dst, tunnel_src = 0;
1534 	static int	srctun = 0;
1535 	vifi_t		vifi;
1536 	boolean_t	pim_reg_packet = B_FALSE;
1537 	struct mfcb	*mfcbp;
1538 	ip_stack_t	*ipst = ill->ill_ipst;
1539 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1540 	ill_t		*rill = ira->ira_rill;
1541 
1542 	ASSERT(ira->ira_pktlen == msgdsize(mp));
1543 
1544 	if (ipst->ips_ip_mrtdebug > 1) {
1545 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1546 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1547 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1548 		    ill->ill_name);
1549 	}
1550 
1551 	dst = ipha->ipha_dst;
1552 	if (ira->ira_flags & IRAF_PIM_REGISTER)
1553 		pim_reg_packet = B_TRUE;
1554 	else if (ira->ira_flags & IRAF_MROUTE_TUNNEL_SET)
1555 		tunnel_src = ira->ira_mroute_tunnel;
1556 
1557 	/*
1558 	 * Don't forward a packet with time-to-live of zero or one,
1559 	 * or a packet destined to a local-only group.
1560 	 */
1561 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1562 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1563 		if (ipst->ips_ip_mrtdebug > 1) {
1564 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1565 			    "ip_mforward: not forwarded ttl %d,"
1566 			    " dst 0x%x ill %s",
1567 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1568 		}
1569 		if (tunnel_src != 0)
1570 			return (1);
1571 		else
1572 			return (0);
1573 	}
1574 
1575 	if ((tunnel_src != 0) || pim_reg_packet) {
1576 		/*
1577 		 * Packet arrived over an encapsulated tunnel or via a PIM
1578 		 * register message.
1579 		 */
1580 		if (ipst->ips_ip_mrtdebug > 1) {
1581 			if (tunnel_src != 0) {
1582 				(void) mi_strlog(mrouter->conn_rq, 1,
1583 				    SL_TRACE,
1584 				    "ip_mforward: ill %s arrived via ENCAP TUN",
1585 				    ill->ill_name);
1586 			} else if (pim_reg_packet) {
1587 				(void) mi_strlog(mrouter->conn_rq, 1,
1588 				    SL_TRACE,
1589 				    "ip_mforward: ill %s arrived via"
1590 				    "  REGISTER VIF",
1591 				    ill->ill_name);
1592 			}
1593 		}
1594 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1595 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1596 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1597 		/* Packet arrived via a physical interface. */
1598 		if (ipst->ips_ip_mrtdebug > 1) {
1599 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1600 			    "ip_mforward: ill %s arrived via PHYINT",
1601 			    ill->ill_name);
1602 		}
1603 
1604 	} else {
1605 		/*
1606 		 * Packet arrived through a SRCRT tunnel.
1607 		 * Source-route tunnels are no longer supported.
1608 		 * Error message printed every 1000 times.
1609 		 */
1610 		if ((srctun++ % 1000) == 0) {
1611 			cmn_err(CE_WARN,
1612 			    "ip_mforward: received source-routed pkt from %x",
1613 			    ntohl(ipha->ipha_src));
1614 		}
1615 		return (-1);
1616 	}
1617 
1618 	ipst->ips_mrtstat->mrts_fwd_in++;
1619 	src = ipha->ipha_src;
1620 
1621 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
1622 
1623 	/*
1624 	 * Lock the mfctable against changes made by ip_mforward.
1625 	 * Note that only add_mfc and del_mfc can remove entries and
1626 	 * they run with exclusive access to IP. So we do not need to
1627 	 * guard against the rt being deleted, so release lock after reading.
1628 	 */
1629 
1630 	if (is_mrouter_off(ipst))
1631 		return (-1);
1632 
1633 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1634 	MFCB_REFHOLD(mfcbp);
1635 	MFCFIND(mfcbp, src, dst, rt);
1636 
1637 	/* Entry exists, so forward if necessary */
1638 	if (rt != NULL) {
1639 		int ret = 0;
1640 		ipst->ips_mrtstat->mrts_mfc_hits++;
1641 		if (pim_reg_packet) {
1642 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1643 			ret = ip_mdq(mp, ipha,
1644 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1645 			    v_ipif->ipif_ill,
1646 			    0, rt);
1647 		} else {
1648 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1649 		}
1650 
1651 		MFCB_REFRELE(mfcbp);
1652 		return (ret);
1653 
1654 		/*
1655 		 * Don't forward if we don't have a cache entry.  Mrouted will
1656 		 * always provide a cache entry in response to an upcall.
1657 		 */
1658 	} else {
1659 		/*
1660 		 * If we don't have a route for packet's origin, make a copy
1661 		 * of the packet and send message to routing daemon.
1662 		 */
1663 		struct mfc	*mfc_rt	 = NULL;
1664 		mblk_t		*mp0	 = NULL;
1665 		mblk_t		*mp_copy = NULL;
1666 		struct rtdetq	*rte	 = NULL;
1667 		struct rtdetq	*rte_m, *rte1, *prev_rte;
1668 		uint_t		hash;
1669 		int		npkts;
1670 		boolean_t	new_mfc = B_FALSE;
1671 		ipst->ips_mrtstat->mrts_mfc_misses++;
1672 		/* BSD uses mrts_no_route++ */
1673 		if (ipst->ips_ip_mrtdebug > 1) {
1674 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1675 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
1676 			    ill->ill_name, ntohl(src), ntohl(dst),
1677 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
1678 		}
1679 		/*
1680 		 * The order of the following code differs from the BSD code.
1681 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1682 		 * code works, so SunOS 5.x wasn't changed to conform to the
1683 		 * BSD version.
1684 		 */
1685 
1686 		/* Lock mfctable. */
1687 		hash = MFCHASH(src, dst);
1688 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1689 
1690 		/*
1691 		 * If we are turning off mrouted return an error
1692 		 */
1693 		if (is_mrouter_off(ipst)) {
1694 			mutex_exit(&mfcbp->mfcb_lock);
1695 			MFCB_REFRELE(mfcbp);
1696 			return (-1);
1697 		}
1698 
1699 		/* Is there an upcall waiting for this packet? */
1700 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1701 		    mfc_rt = mfc_rt->mfc_next) {
1702 			mutex_enter(&mfc_rt->mfc_mutex);
1703 			if (ipst->ips_ip_mrtdebug > 1) {
1704 				(void) mi_strlog(mrouter->conn_rq, 1,
1705 				    SL_TRACE,
1706 				    "ip_mforward: MFCTAB hash %d o 0x%x"
1707 				    " g 0x%x\n",
1708 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1709 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1710 			}
1711 			/* There is an upcall */
1712 			if ((src == mfc_rt->mfc_origin.s_addr) &&
1713 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1714 			    (mfc_rt->mfc_rte != NULL) &&
1715 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1716 				break;
1717 			}
1718 			mutex_exit(&mfc_rt->mfc_mutex);
1719 		}
1720 		/* No upcall, so make a new entry into mfctable */
1721 		if (mfc_rt == NULL) {
1722 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1723 			if (mfc_rt == NULL) {
1724 				ipst->ips_mrtstat->mrts_fwd_drop++;
1725 				ip1dbg(("ip_mforward: out of memory "
1726 				    "for mfc, mfc_rt\n"));
1727 				goto error_return;
1728 			} else
1729 				new_mfc = B_TRUE;
1730 			/* Get resources */
1731 			/* TODO could copy header and dup rest */
1732 			mp_copy = copymsg(mp);
1733 			if (mp_copy == NULL) {
1734 				ipst->ips_mrtstat->mrts_fwd_drop++;
1735 				ip1dbg(("ip_mforward: out of memory for "
1736 				    "mblk, mp_copy\n"));
1737 				goto error_return;
1738 			}
1739 			mutex_enter(&mfc_rt->mfc_mutex);
1740 		}
1741 		/* Get resources for rte, whether first rte or not first. */
1742 		/* Add this packet into rtdetq */
1743 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1744 		if (rte == NULL) {
1745 			ipst->ips_mrtstat->mrts_fwd_drop++;
1746 			mutex_exit(&mfc_rt->mfc_mutex);
1747 			ip1dbg(("ip_mforward: out of memory for"
1748 			    " rtdetq, rte\n"));
1749 			goto error_return;
1750 		}
1751 
1752 		mp0 = copymsg(mp);
1753 		if (mp0 == NULL) {
1754 			ipst->ips_mrtstat->mrts_fwd_drop++;
1755 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1756 			mutex_exit(&mfc_rt->mfc_mutex);
1757 			goto error_return;
1758 		}
1759 		rte->mp		= mp0;
1760 		if (pim_reg_packet) {
1761 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1762 			rte->ill =
1763 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1764 			    v_ipif->ipif_ill;
1765 		} else {
1766 			rte->ill = ill;
1767 		}
1768 		rte->rte_next	= NULL;
1769 
1770 		/*
1771 		 * Determine if upcall q (rtdetq) has overflowed.
1772 		 * mfc_rt->mfc_rte is null by mi_zalloc
1773 		 * if it is the first message.
1774 		 */
1775 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1776 		    rte_m = rte_m->rte_next)
1777 			npkts++;
1778 		if (ipst->ips_ip_mrtdebug > 1) {
1779 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1780 			    "ip_mforward: upcalls %d\n", npkts);
1781 		}
1782 		if (npkts > MAX_UPQ) {
1783 			ipst->ips_mrtstat->mrts_upq_ovflw++;
1784 			mutex_exit(&mfc_rt->mfc_mutex);
1785 			goto error_return;
1786 		}
1787 
1788 		if (npkts == 0) {	/* first upcall */
1789 			int i = 0;
1790 			/*
1791 			 * Now finish installing the new mfc! Now that we have
1792 			 * resources!  Insert new entry at head of hash chain.
1793 			 * Use src and dst which are ipaddr_t's.
1794 			 */
1795 			mfc_rt->mfc_origin.s_addr = src;
1796 			mfc_rt->mfc_mcastgrp.s_addr = dst;
1797 
1798 			mutex_enter(&ipst->ips_numvifs_mutex);
1799 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
1800 				mfc_rt->mfc_ttls[i] = 0;
1801 			mutex_exit(&ipst->ips_numvifs_mutex);
1802 			mfc_rt->mfc_parent = ALL_VIFS;
1803 
1804 			/* Link into table */
1805 			if (ipst->ips_ip_mrtdebug > 1) {
1806 				(void) mi_strlog(mrouter->conn_rq, 1,
1807 				    SL_TRACE,
1808 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1809 				    "g 0x%x\n", hash,
1810 				    ntohl(mfc_rt->mfc_origin.s_addr),
1811 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1812 			}
1813 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1814 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1815 			mfc_rt->mfc_rte = NULL;
1816 		}
1817 
1818 		/* Link in the upcall */
1819 		/* First upcall */
1820 		if (mfc_rt->mfc_rte == NULL)
1821 			mfc_rt->mfc_rte = rte;
1822 		else {
1823 			/* not the first upcall */
1824 			prev_rte = mfc_rt->mfc_rte;
1825 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1826 			    prev_rte = rte1, rte1 = rte1->rte_next)
1827 				;
1828 			prev_rte->rte_next = rte;
1829 		}
1830 
1831 		/*
1832 		 * No upcalls waiting, this is first one, so send a message to
1833 		 * routing daemon to install a route into kernel table.
1834 		 */
1835 		if (npkts == 0) {
1836 			struct igmpmsg	*im;
1837 			/* ipha_protocol is 0, for upcall */
1838 			ASSERT(mp_copy != NULL);
1839 			im = (struct igmpmsg *)mp_copy->b_rptr;
1840 			im->im_msgtype	= IGMPMSG_NOCACHE;
1841 			im->im_mbz = 0;
1842 			mutex_enter(&ipst->ips_numvifs_mutex);
1843 			if (pim_reg_packet) {
1844 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1845 				mutex_exit(&ipst->ips_numvifs_mutex);
1846 			} else {
1847 				/*
1848 				 * XXX do we need to hold locks here ?
1849 				 */
1850 				for (vifi = 0;
1851 				    vifi < ipst->ips_numvifs;
1852 				    vifi++) {
1853 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
1854 						continue;
1855 					if (ipst->ips_vifs[vifi].
1856 					    v_ipif->ipif_ill == ill) {
1857 						im->im_vif = (uchar_t)vifi;
1858 						break;
1859 					}
1860 				}
1861 				mutex_exit(&ipst->ips_numvifs_mutex);
1862 				ASSERT(vifi < ipst->ips_numvifs);
1863 			}
1864 
1865 			ipst->ips_mrtstat->mrts_upcalls++;
1866 			/* Timer to discard upcalls if mrouted is too slow */
1867 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1868 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1869 			mutex_exit(&mfc_rt->mfc_mutex);
1870 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1871 			/* Pass to RAWIP */
1872 			ira->ira_ill = ira->ira_rill = NULL;
1873 			(mrouter->conn_recv)(mrouter, mp_copy, NULL, ira);
1874 			ira->ira_ill = ill;
1875 			ira->ira_rill = rill;
1876 		} else {
1877 			mutex_exit(&mfc_rt->mfc_mutex);
1878 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1879 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1880 			ip_drop_input("ip_mforward - upcall already waiting",
1881 			    mp_copy, ill);
1882 			freemsg(mp_copy);
1883 		}
1884 
1885 		MFCB_REFRELE(mfcbp);
1886 		if (tunnel_src != 0)
1887 			return (1);
1888 		else
1889 			return (0);
1890 	error_return:
1891 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1892 		MFCB_REFRELE(mfcbp);
1893 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
1894 			mi_free((char *)mfc_rt);
1895 		if (rte != NULL)
1896 			mi_free((char *)rte);
1897 		if (mp_copy != NULL) {
1898 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
1899 			ip_drop_input("ip_mforward error", mp_copy, ill);
1900 			freemsg(mp_copy);
1901 		}
1902 		if (mp0 != NULL)
1903 			freemsg(mp0);
1904 		return (-1);
1905 	}
1906 }
1907 
1908 /*
1909  * Clean up the mfctable cache entry if upcall is not serviced.
1910  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1911  */
1912 static void
1913 expire_upcalls(void *arg)
1914 {
1915 	struct mfc *mfc_rt = arg;
1916 	uint_t hash;
1917 	struct mfc *prev_mfc, *mfc0;
1918 	ip_stack_t	*ipst;
1919 	conn_t		*mrouter;
1920 
1921 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1922 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1923 		return;
1924 	}
1925 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1926 	mrouter = ipst->ips_ip_g_mrouter;
1927 
1928 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1929 	if (ipst->ips_ip_mrtdebug > 1) {
1930 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1931 		    "expire_upcalls: hash %d s %x g %x",
1932 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1933 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1934 	}
1935 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1936 	mutex_enter(&mfc_rt->mfc_mutex);
1937 	/*
1938 	 * if timeout has been set to zero, than the
1939 	 * entry has been filled, no need to delete it.
1940 	 */
1941 	if (mfc_rt->mfc_timeout_id == 0)
1942 		goto done;
1943 	ipst->ips_mrtstat->mrts_cache_cleanups++;
1944 	mfc_rt->mfc_timeout_id = 0;
1945 
1946 	/* Determine entry to be cleaned up in cache table. */
1947 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
1948 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1949 		if (mfc0 == mfc_rt)
1950 			break;
1951 
1952 	/* del_mfc takes care of gone mfcs */
1953 	ASSERT(prev_mfc != NULL);
1954 	ASSERT(mfc0 != NULL);
1955 
1956 	/*
1957 	 * Delete the entry from the cache
1958 	 */
1959 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1960 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1961 
1962 	/*
1963 	 * release_mfc will drop all queued upcall packets.
1964 	 * and will free the mbuf with the pkt, if, timing info.
1965 	 */
1966 done:
1967 	mutex_exit(&mfc_rt->mfc_mutex);
1968 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1969 }
1970 
/*
 * Packet forwarding routine once entry in the cache is made.
 *
 * Checks that the packet arrived on the parent vif recorded in the cache
 * entry and, if so, hands a copy to every vif in the entry's output set
 * whose TTL threshold the packet exceeds.
 *
 *	mp, ipha	the packet and its IP header; ip_mdq() itself does
 *			not free mp
 *	ill		the ill the packet arrived on
 *	tunnel_src	compared against the parent vif's v_rmt_addr; also
 *			selects the non-error return value
 *	rt		the forwarding cache entry for this packet
 *
 * Returns -1 if the packet is dropped here (no parent vif, the parent vif
 * could not be locked, illegal vifi, or no memory for the PIM assert
 * upcall).  Otherwise returns 1 when tunnel_src is non-zero and 0 when it
 * is zero, including the case where the packet arrived on the wrong
 * interface and was not forwarded.
 */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
	ip_recv_attr_t	iras;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/*
	 * Macro to send packet on vif: dispatches on the vif flags to the
	 * tunnel, register or physical-interface send routine.
	 */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	/* The parent vif is the one the packet is expected to arrive on */
	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/*
	 * From here until the matching unlock_good_vif() every return path
	 * must release the parent vif.
	 */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	/*
	 * NOTE(review): this range check runs after ips_vifs[vifi] has
	 * already been handed to lock_good_vif() and dereferenced by the
	 * ASSERT above, and the warning itself dereferences the same slot.
	 * Confirm whether the check was meant to come first.
	 */
	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
			"numvifs %d ill %s viftable ill %s\n",
			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/*
			 * The start of the copied packet is overlaid with a
			 * struct igmpmsg to turn it into a WRONGVIF upcall.
			 */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */

			bzero(&iras, sizeof (iras));
			iras.ira_flags = IRAF_IS_IPV4;
			iras.ira_ip_hdr_length =
			    IPH_HDR_LENGTH(mp_copy->b_rptr);
			iras.ira_pktlen = msgdsize(mp_copy);
			(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
			ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		/* Not forwarded, but not reported as an error either */
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	/* Per-entry counters are updated under the entry's own mutex */
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	/* Done with the parent vif; the loop below locks each vif itself */
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	/* Snapshot the vif count so the loop bound is stable */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		/* Skip vifs that cannot be locked */
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succeeded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}
2157 
2158 /*
2159  * Send the packet on physical interface.
2160  * Caller assumes can continue to use mp on return.
2161  */
2162 /* ARGSUSED */
2163 static void
2164 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2165 {
2166 	mblk_t 	*mp_copy;
2167 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2168 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2169 
2170 	/* Make a new reference to the packet */
2171 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2172 	if (mp_copy == NULL) {
2173 		ipst->ips_mrtstat->mrts_fwd_drop++;
2174 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2175 		return;
2176 	}
2177 	if (vifp->v_rate_limit <= 0)
2178 		tbf_send_packet(vifp, mp_copy);
2179 	else  {
2180 		if (ipst->ips_ip_mrtdebug > 1) {
2181 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2182 			    "phyint_send: tbf_contr rate %d "
2183 			    "vifp 0x%p mp 0x%p dst 0x%x",
2184 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2185 		}
2186 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2187 	}
2188 }
2189 
2190 /*
2191  * Send the whole packet for REGISTER encapsulation to PIM daemon
2192  * Caller assumes it can continue to use mp on return.
2193  */
2194 /* ARGSUSED */
2195 static void
2196 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2197 {
2198 	struct igmpmsg	*im;
2199 	mblk_t		*mp_copy;
2200 	ipha_t		*ipha_copy;
2201 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2202 	ip_stack_t	*ipst = ill->ill_ipst;
2203 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2204 	ip_recv_attr_t	iras;
2205 
2206 	if (ipst->ips_ip_mrtdebug > 1) {
2207 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2208 		    "register_send: src %x, dst %x\n",
2209 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2210 	}
2211 
2212 	/*
2213 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2214 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2215 	 * ethernet driver will.
2216 	 */
2217 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2218 	if (mp_copy == NULL) {
2219 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2220 		if (ipst->ips_ip_mrtdebug > 3) {
2221 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2222 			    "register_send: allocb failure.");
2223 		}
2224 		return;
2225 	}
2226 
2227 	/*
2228 	 * Bump write pointer to account for igmpmsg being added.
2229 	 */
2230 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2231 
2232 	/*
2233 	 * Chain packet to new mblk_t.
2234 	 */
2235 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2236 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2237 		if (ipst->ips_ip_mrtdebug > 3) {
2238 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2239 			    "register_send: copymsg failure.");
2240 		}
2241 		freeb(mp_copy);
2242 		return;
2243 	}
2244 
2245 	/*
2246 	 * icmp_input() asserts that IP version field is set to an
2247 	 * appropriate version. Hence, the struct igmpmsg that this really
2248 	 * becomes, needs to have the correct IP version field.
2249 	 */
2250 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2251 	*ipha_copy = multicast_encap_iphdr;
2252 
2253 	/*
2254 	 * The kernel uses the struct igmpmsg header to encode the messages to
2255 	 * the multicast routing daemon. Fill in the fields in the header
2256 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2257 	 */
2258 	im = (struct igmpmsg *)mp_copy->b_rptr;
2259 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2260 	im->im_src.s_addr = ipha->ipha_src;
2261 	im->im_dst.s_addr = ipha->ipha_dst;
2262 
2263 	/*
2264 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2265 	 * header with renamed fields and the multicast routing daemon uses
2266 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2267 	 */
2268 	im->im_mbz = 0;
2269 
2270 	++ipst->ips_mrtstat->mrts_upcalls;
2271 	if (IPCL_IS_NONSTR(mrouter) ? mrouter->conn_flow_cntrld :
2272 	    !canputnext(mrouter->conn_rq)) {
2273 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2274 		if (ipst->ips_ip_mrtdebug > 3) {
2275 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2276 			    "register_send: register upcall failure.");
2277 		}
2278 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2279 		ip_drop_input("mrts_pim_regsend_drops", mp_copy, ill);
2280 		freemsg(mp_copy);
2281 	} else {
2282 		/* Pass to RAWIP */
2283 		bzero(&iras, sizeof (iras));
2284 		iras.ira_flags = IRAF_IS_IPV4;
2285 		iras.ira_ip_hdr_length = sizeof (ipha_t);
2286 		iras.ira_pktlen = msgdsize(mp_copy);
2287 		(mrouter->conn_recv)(mrouter, mp_copy, NULL, &iras);
2288 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
2289 	}
2290 }
2291 
2292 /*
2293  * pim_validate_cksum handles verification of the checksum in the
2294  * pim header.  For PIM Register packets, the checksum is calculated
2295  * across the PIM header only.  For all other packets, the checksum
2296  * is for the PIM header and remainder of the packet.
2297  *
2298  * returns: B_TRUE, if checksum is okay.
2299  *          B_FALSE, if checksum is not valid.
2300  */
2301 static boolean_t
2302 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2303 {
2304 	mblk_t *mp_dup;
2305 
2306 	if ((mp_dup = dupmsg(mp)) == NULL)
2307 		return (B_FALSE);
2308 
2309 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2310 	if (pimp->pim_type == PIM_REGISTER)
2311 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2312 	if (IP_CSUM(mp_dup, 0, 0)) {
2313 		freemsg(mp_dup);
2314 		return (B_FALSE);
2315 	}
2316 	freemsg(mp_dup);
2317 	return (B_TRUE);
2318 }
2319 
2320 /*
2321  * Process PIM protocol packets i.e. IP Protocol 103.
2322  * Register messages are decapsulated and sent onto multicast forwarding.
2323  *
2324  * Return NULL for a bad packet that is discarded here.
2325  * Return mp if the message is OK and should be handed to "raw" receivers.
2326  * Callers of pim_input() may need to reinitialize variables that were copied
2327  * from the mblk as this calls pullupmsg().
2328  */
2329 mblk_t *
2330 pim_input(mblk_t *mp, ip_recv_attr_t *ira)
2331 {
2332 	ipha_t		*eip, *ip;
2333 	int		iplen, pimlen, iphlen;
2334 	struct pim	*pimp;	/* pointer to a pim struct */
2335 	uint32_t	*reghdr;
2336 	ill_t		*ill = ira->ira_ill;
2337 	ip_stack_t	*ipst = ill->ill_ipst;
2338 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2339 
2340 	/*
2341 	 * Pullup the msg for PIM protocol processing.
2342 	 */
2343 	if (pullupmsg(mp, -1) == 0) {
2344 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2345 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2346 		ip_drop_input("mrts_pim_nomemory", mp, ill);
2347 		freemsg(mp);
2348 		return (NULL);
2349 	}
2350 
2351 	ip = (ipha_t *)mp->b_rptr;
2352 	iplen = ip->ipha_length;
2353 	iphlen = IPH_HDR_LENGTH(ip);
2354 	pimlen = ntohs(iplen) - iphlen;
2355 
2356 	/*
2357 	 * Validate lengths
2358 	 */
2359 	if (pimlen < PIM_MINLEN) {
2360 		++ipst->ips_mrtstat->mrts_pim_malformed;
2361 		if (ipst->ips_ip_mrtdebug > 1) {
2362 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2363 			    "pim_input: length not at least minlen");
2364 		}
2365 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2366 		ip_drop_input("mrts_pim_malformed", mp, ill);
2367 		freemsg(mp);
2368 		return (NULL);
2369 	}
2370 
2371 	/*
2372 	 * Point to the PIM header.
2373 	 */
2374 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2375 
2376 	/*
2377 	 * Check the version number.
2378 	 */
2379 	if (pimp->pim_vers != PIM_VERSION) {
2380 		++ipst->ips_mrtstat->mrts_pim_badversion;
2381 		if (ipst->ips_ip_mrtdebug > 1) {
2382 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2383 			    "pim_input: unknown version of PIM");
2384 		}
2385 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2386 		ip_drop_input("mrts_pim_badversion", mp, ill);
2387 		freemsg(mp);
2388 		return (NULL);
2389 	}
2390 
2391 	/*
2392 	 * Validate the checksum
2393 	 */
2394 	if (!pim_validate_cksum(mp, ip, pimp)) {
2395 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2396 		if (ipst->ips_ip_mrtdebug > 1) {
2397 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2398 			    "pim_input: invalid checksum");
2399 		}
2400 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2401 		ip_drop_input("pim_rcv_badcsum", mp, ill);
2402 		freemsg(mp);
2403 		return (NULL);
2404 	}
2405 
2406 	if (pimp->pim_type != PIM_REGISTER)
2407 		return (mp);
2408 
2409 	reghdr = (uint32_t *)(pimp + 1);
2410 	eip = (ipha_t *)(reghdr + 1);
2411 
2412 	/*
2413 	 * check if the inner packet is destined to mcast group
2414 	 */
2415 	if (!CLASSD(eip->ipha_dst)) {
2416 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2417 		if (ipst->ips_ip_mrtdebug > 1) {
2418 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2419 			    "pim_input: Inner pkt not mcast .. !");
2420 		}
2421 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2422 		ip_drop_input("mrts_pim_badregisters", mp, ill);
2423 		freemsg(mp);
2424 		return (NULL);
2425 	}
2426 	if (ipst->ips_ip_mrtdebug > 1) {
2427 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2428 		    "register from %x, to %x, len %d",
2429 		    ntohl(eip->ipha_src),
2430 		    ntohl(eip->ipha_dst),
2431 		    ntohs(eip->ipha_length));
2432 	}
2433 	/*
2434 	 * If the null register bit is not set, decapsulate
2435 	 * the packet before forwarding it.
2436 	 * Avoid this in no register vif
2437 	 */
2438 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER) &&
2439 	    ipst->ips_reg_vif_num != ALL_VIFS) {
2440 		mblk_t *mp_copy;
2441 		uint_t saved_pktlen;
2442 
2443 		/* Copy the message */
2444 		if ((mp_copy = copymsg(mp)) == NULL) {
2445 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2446 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2447 			ip_drop_input("mrts_pim_nomemory", mp, ill);
2448 			freemsg(mp);
2449 			return (NULL);
2450 		}
2451 
2452 		/*
2453 		 * Decapsulate the packet and give it to
2454 		 * register_mforward.
2455 		 */
2456 		mp_copy->b_rptr += iphlen + sizeof (pim_t) + sizeof (*reghdr);
2457 		saved_pktlen = ira->ira_pktlen;
2458 		ira->ira_pktlen -= iphlen + sizeof (pim_t) + sizeof (*reghdr);
2459 		if (register_mforward(mp_copy, ira) != 0) {
2460 			/* register_mforward already called ip_drop_input */
2461 			freemsg(mp);
2462 			ira->ira_pktlen = saved_pktlen;
2463 			return (NULL);
2464 		}
2465 		ira->ira_pktlen = saved_pktlen;
2466 	}
2467 
2468 	/*
2469 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2470 	 * PIM socket. For Solaris it is done right after pim_input() is
2471 	 * called.
2472 	 */
2473 	return (mp);
2474 }
2475 
2476 /*
2477  * PIM sparse mode hook.  Called by pim_input after decapsulating
2478  * the packet. Loop back the packet, as if we have received it.
2479  * In pim_input() we have to check if the destination is a multicast address.
2480  */
2481 static int
2482 register_mforward(mblk_t *mp, ip_recv_attr_t *ira)
2483 {
2484 	ire_t		*ire;
2485 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2486 	ill_t		*ill = ira->ira_ill;
2487 	ip_stack_t	*ipst = ill->ill_ipst;
2488 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2489 
2490 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2491 
2492 	if (ipst->ips_ip_mrtdebug > 3) {
2493 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2494 		    "register_mforward: src %x, dst %x\n",
2495 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2496 	}
2497 	/*
2498 	 * Need to pass in to ip_mforward() the information that the
2499 	 * packet has arrived on the register_vif. We mark it with
2500 	 * the IRAF_PIM_REGISTER attribute.
2501 	 * pim_input verified that the (inner) destination is multicast,
2502 	 * hence we skip the generic code in ip_input.
2503 	 */
2504 	ira->ira_flags |= IRAF_PIM_REGISTER;
2505 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2506 
2507 	if (!CLASSD(ipha->ipha_dst)) {
2508 		ire = ire_route_recursive_v4(ipha->ipha_dst, 0, NULL, ALL_ZONES,
2509 		    ira->ira_tsl, MATCH_IRE_SECATTR, IRR_ALLOCATE, 0, ipst,
2510 		    NULL, NULL, NULL);
2511 	} else {
2512 		ire = ire_multicast(ill);
2513 	}
2514 	ASSERT(ire != NULL);
2515 	/* Normally this will return the IRE_MULTICAST */
2516 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2517 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2518 		ip_drop_input("mrts_pim RTF_REJECT", mp, ill);
2519 		freemsg(mp);
2520 		ire_refrele(ire);
2521 		return (-1);
2522 	}
2523 	ASSERT(ire->ire_type & IRE_MULTICAST);
2524 	(*ire->ire_recvfn)(ire, mp, ipha, ira);
2525 	ire_refrele(ire);
2526 
2527 	return (0);
2528 }
2529 
2530 /*
2531  * Send an encapsulated packet.
2532  * Caller assumes can continue to use mp when routine returns.
2533  */
2534 /* ARGSUSED */
2535 static void
2536 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2537 {
2538 	mblk_t 	*mp_copy;
2539 	ipha_t 	*ipha_copy;
2540 	size_t	len;
2541 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2542 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2543 
2544 	if (ipst->ips_ip_mrtdebug > 1) {
2545 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2546 		    "encap_send: vif %ld enter",
2547 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2548 	}
2549 	len = ntohs(ipha->ipha_length);
2550 
2551 	/*
2552 	 * Copy the old packet & pullup it's IP header into the
2553 	 * new mbuf so we can modify it.  Try to fill the new
2554 	 * mbuf since if we don't the ethernet driver will.
2555 	 */
2556 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2557 	if (mp_copy == NULL)
2558 		return;
2559 	mp_copy->b_rptr += 32;
2560 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2561 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2562 		freeb(mp_copy);
2563 		return;
2564 	}
2565 
2566 	/*
2567 	 * Fill in the encapsulating IP header.
2568 	 * Remote tunnel dst in rmt_addr, from add_vif().
2569 	 */
2570 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2571 	*ipha_copy = multicast_encap_iphdr;
2572 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2573 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2574 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2575 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2576 	ASSERT(ipha_copy->ipha_ident == 0);
2577 
2578 	/* Turn the encapsulated IP header back into a valid one. */
2579 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2580 	ipha->ipha_ttl--;
2581 	ipha->ipha_hdr_checksum = 0;
2582 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2583 
2584 	ipha_copy->ipha_ttl = ipha->ipha_ttl;
2585 
2586 	if (ipst->ips_ip_mrtdebug > 1) {
2587 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2588 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2589 	}
2590 	if (vifp->v_rate_limit <= 0)
2591 		tbf_send_packet(vifp, mp_copy);
2592 	else
2593 		/* ipha is from the original header */
2594 		tbf_control(vifp, mp_copy, ipha);
2595 }
2596 
2597 /*
2598  * De-encapsulate a packet and feed it back through IP input if it
2599  * matches one of our multicast tunnels.
2600  *
2601  * This routine is called whenever IP gets a packet with prototype
2602  * IPPROTO_ENCAP and a local destination address and the packet didn't
2603  * match one of our configured IP-in-IP tunnels.
2604  */
2605 void
2606 ip_mroute_decap(mblk_t *mp, ip_recv_attr_t *ira)
2607 {
2608 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2609 	ipha_t		*ipha_encap;
2610 	int		hlen = IPH_HDR_LENGTH(ipha);
2611 	int		hlen_encap;
2612 	ipaddr_t	src;
2613 	struct vif	*vifp;
2614 	ire_t		*ire;
2615 	ill_t		*ill = ira->ira_ill;
2616 	ip_stack_t	*ipst = ill->ill_ipst;
2617 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2618 
2619 	/* Make sure we have all of the inner header */
2620 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2621 	if (mp->b_wptr - mp->b_rptr < hlen + IP_SIMPLE_HDR_LENGTH) {
2622 		ipha = ip_pullup(mp, hlen + IP_SIMPLE_HDR_LENGTH, ira);
2623 		if (ipha == NULL) {
2624 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2625 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2626 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2627 			freemsg(mp);
2628 			return;
2629 		}
2630 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2631 	}
2632 	hlen_encap = IPH_HDR_LENGTH(ipha_encap);
2633 	if (mp->b_wptr - mp->b_rptr < hlen + hlen_encap) {
2634 		ipha = ip_pullup(mp, hlen + hlen_encap, ira);
2635 		if (ipha == NULL) {
2636 			ipst->ips_mrtstat->mrts_bad_tunnel++;
2637 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2638 			ip_drop_input("ip_mroute_decap: too short", mp, ill);
2639 			freemsg(mp);
2640 			return;
2641 		}
2642 		ipha_encap = (ipha_t *)((char *)ipha + hlen);
2643 	}
2644 
2645 	/*
2646 	 * Dump the packet if it's not to a multicast destination or if
2647 	 * we don't have an encapsulating tunnel with the source.
2648 	 * Note:  This code assumes that the remote site IP address
2649 	 * uniquely identifies the tunnel (i.e., that this site has
2650 	 * at most one tunnel with the remote site).
2651 	 */
2652 	if (!CLASSD(ipha_encap->ipha_dst)) {
2653 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2654 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2655 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2656 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2657 		freemsg(mp);
2658 		return;
2659 	}
2660 	src = (ipaddr_t)ipha->ipha_src;
2661 	mutex_enter(&ipst->ips_last_encap_lock);
2662 	if (src != ipst->ips_last_encap_src) {
2663 		struct vif *vife;
2664 
2665 		vifp = ipst->ips_vifs;
2666 		vife = vifp + ipst->ips_numvifs;
2667 		ipst->ips_last_encap_src = src;
2668 		ipst->ips_last_encap_vif = 0;
2669 		for (; vifp < vife; ++vifp) {
2670 			if (!lock_good_vif(vifp))
2671 				continue;
2672 			if (vifp->v_rmt_addr.s_addr == src) {
2673 				if (vifp->v_flags & VIFF_TUNNEL)
2674 					ipst->ips_last_encap_vif = vifp;
2675 				if (ipst->ips_ip_mrtdebug > 1) {
2676 					(void) mi_strlog(mrouter->conn_rq,
2677 					    1, SL_TRACE,
2678 					    "ip_mroute_decap: good tun "
2679 					    "vif %ld with %x",
2680 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2681 					    ntohl(src));
2682 				}
2683 				unlock_good_vif(vifp);
2684 				break;
2685 			}
2686 			unlock_good_vif(vifp);
2687 		}
2688 	}
2689 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2690 		mutex_exit(&ipst->ips_last_encap_lock);
2691 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2692 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2693 		ip_drop_input("mrts_bad_tunnel", mp, ill);
2694 		freemsg(mp);
2695 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2696 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2697 		return;
2698 	}
2699 	mutex_exit(&ipst->ips_last_encap_lock);
2700 
2701 	/*
2702 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2703 	 * verify that the packet arrived over the correct vif.)
2704 	 */
2705 	ira->ira_flags |= IRAF_MROUTE_TUNNEL_SET;
2706 	ira->ira_mroute_tunnel = src;
2707 	mp->b_rptr += hlen;
2708 	ira->ira_pktlen -= hlen;
2709 	ira->ira_ip_hdr_length = hlen_encap;
2710 
2711 	/*
2712 	 * We don't redo any of the filtering in ill_input_full_v4 and we
2713 	 * have checked that all of ipha_encap and any IP options are
2714 	 * pulled up. Hence we call ire_recv_multicast_v4 directly.
2715 	 * However, we have to check for RSVP as in ip_input_full_v4
2716 	 * and if so we pass it to ire_recv_broadcast_v4 for local delivery
2717 	 * to the rsvpd.
2718 	 */
2719 	if (ipha_encap->ipha_protocol == IPPROTO_RSVP &&
2720 	    ipst->ips_ipcl_proto_fanout_v4[IPPROTO_RSVP].connf_head != NULL) {
2721 		ire = ire_route_recursive_v4(INADDR_BROADCAST, 0, ill,
2722 		    ALL_ZONES, ira->ira_tsl, MATCH_IRE_ILL|MATCH_IRE_SECATTR,
2723 		    IRR_ALLOCATE, 0, ipst, NULL, NULL, NULL);
2724 	} else {
2725 		ire = ire_multicast(ill);
2726 	}
2727 	ASSERT(ire != NULL);
2728 	/* Normally this will return the IRE_MULTICAST or IRE_BROADCAST */
2729 	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
2730 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards);
2731 		ip_drop_input("ip_mroute_decap: RTF_REJECT", mp, ill);
2732 		freemsg(mp);
2733 		ire_refrele(ire);
2734 		return;
2735 	}
2736 	ire->ire_ib_pkt_count++;
2737 	ASSERT(ire->ire_type & (IRE_MULTICAST|IRE_BROADCAST));
2738 	(*ire->ire_recvfn)(ire, mp, ipha_encap, ira);
2739 	ire_refrele(ire);
2740 }
2741 
2742 /*
2743  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2744  * (stream closed).  Called as writer.
2745  */
2746 void
2747 reset_mrt_vif_ipif(ipif_t *ipif)
2748 {
2749 	vifi_t vifi, tmp_vifi;
2750 	vifi_t num_of_vifs;
2751 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2752 
2753 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2754 
2755 	mutex_enter(&ipst->ips_numvifs_mutex);
2756 	num_of_vifs = ipst->ips_numvifs;
2757 	mutex_exit(&ipst->ips_numvifs_mutex);
2758 
2759 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2760 		tmp_vifi = vifi - 1;
2761 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2762 			(void) del_vif(&tmp_vifi, ipst);
2763 		}
2764 	}
2765 }
2766 
2767 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2768 void
2769 reset_mrt_ill(ill_t *ill)
2770 {
2771 	struct mfc	*rt;
2772 	struct rtdetq	*rte;
2773 	int		i;
2774 	ip_stack_t	*ipst = ill->ill_ipst;
2775 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2776 	timeout_id_t	id;
2777 
2778 	for (i = 0; i < MFCTBLSIZ; i++) {
2779 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2780 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2781 			if (ipst->ips_ip_mrtdebug > 1) {
2782 				(void) mi_strlog(mrouter->conn_rq, 1,
2783 				    SL_TRACE,
2784 				    "reset_mrt_ill: mfctable [%d]", i);
2785 			}
2786 			while (rt != NULL) {
2787 				mutex_enter(&rt->mfc_mutex);
2788 				while ((rte = rt->mfc_rte) != NULL) {
2789 					if (rte->ill == ill &&
2790 					    (id = rt->mfc_timeout_id) != 0) {
2791 						/*
2792 						 * Its ok to drop the lock,  the
2793 						 * struct cannot be freed since
2794 						 * we have a ref on the hash
2795 						 * bucket.
2796 						 */
2797 						mutex_exit(&rt->mfc_mutex);
2798 						(void) untimeout(id);
2799 						mutex_enter(&rt->mfc_mutex);
2800 					}
2801 					if (rte->ill == ill) {
2802 						if (ipst->ips_ip_mrtdebug > 1) {
2803 						(void) mi_strlog(
2804 						    mrouter->conn_rq,
2805 						    1, SL_TRACE,
2806 						    "reset_mrt_ill: "
2807 						    "ill 0x%p", (void *)ill);
2808 						}
2809 						rt->mfc_rte = rte->rte_next;
2810 						freemsg(rte->mp);
2811 						mi_free((char *)rte);
2812 					}
2813 				}
2814 				mutex_exit(&rt->mfc_mutex);
2815 				rt = rt->mfc_next;
2816 			}
2817 		}
2818 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2819 	}
2820 }
2821 
2822 /*
2823  * Token bucket filter module.
2824  * The ipha is for mcastgrp destination for phyint and encap.
2825  */
2826 static void
2827 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2828 {
2829 	size_t 	p_len =  msgdsize(mp);
2830 	struct tbf	*t    = vifp->v_tbf;
2831 	timeout_id_t id = 0;
2832 	ill_t		*ill = vifp->v_ipif->ipif_ill;
2833 	ip_stack_t	*ipst = ill->ill_ipst;
2834 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2835 
2836 	/* Drop if packet is too large */
2837 	if (p_len > MAX_BKT_SIZE) {
2838 		ipst->ips_mrtstat->mrts_pkt2large++;
2839 		BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2840 		ip_drop_output("tbf_control - too large", mp, ill);
2841 		freemsg(mp);
2842 		return;
2843 	}
2844 	if (ipst->ips_ip_mrtdebug > 1) {
2845 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2846 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2847 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2848 		    ntohl(ipha->ipha_dst));
2849 	}
2850 
2851 	mutex_enter(&t->tbf_lock);
2852 
2853 	tbf_update_tokens(vifp);
2854 
2855 	/*
2856 	 * If there are enough tokens,
2857 	 * and the queue is empty, send this packet out.
2858 	 */
2859 	if (ipst->ips_ip_mrtdebug > 1) {
2860 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2861 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2862 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2863 		    t->tbf_q_len);
2864 	}
2865 	/* No packets are queued */
2866 	if (t->tbf_q_len == 0) {
2867 		/* queue empty, send packet if enough tokens */
2868 		if (p_len <= t->tbf_n_tok) {
2869 			t->tbf_n_tok -= p_len;
2870 			mutex_exit(&t->tbf_lock);
2871 			tbf_send_packet(vifp, mp);
2872 			return;
2873 		} else {
2874 			/* Queue packet and timeout till later */
2875 			tbf_queue(vifp, mp);
2876 			ASSERT(vifp->v_timeout_id == 0);
2877 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2878 			    TBF_REPROCESS);
2879 		}
2880 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2881 		/* Finite queue length, so queue pkts and process queue */
2882 		tbf_queue(vifp, mp);
2883 		tbf_process_q(vifp);
2884 	} else {
2885 		/* Check that we have UDP header with IP header */
2886 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2887 		    sizeof (struct udphdr);
2888 
2889 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2890 			if (!pullupmsg(mp, hdr_length)) {
2891 				BUMP_MIB(ill->ill_ip_mib,
2892 				    ipIfStatsOutDiscards);
2893 				ip_drop_output("tbf_control - pullup", mp, ill);
2894 				freemsg(mp);
2895 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2896 				    "vif %ld src 0x%x dst 0x%x\n",
2897 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2898 				    ntohl(ipha->ipha_src),
2899 				    ntohl(ipha->ipha_dst)));
2900 				mutex_exit(&vifp->v_tbf->tbf_lock);
2901 				return;
2902 			} else
2903 				/* Have to reassign ipha after pullupmsg */
2904 				ipha = (ipha_t *)mp->b_rptr;
2905 		}
2906 		/*
2907 		 * Queue length too much,
2908 		 * try to selectively dq, or queue and process
2909 		 */
2910 		if (!tbf_dq_sel(vifp, ipha)) {
2911 			ipst->ips_mrtstat->mrts_q_overflow++;
2912 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2913 			ip_drop_output("mrts_q_overflow", mp, ill);
2914 			freemsg(mp);
2915 		} else {
2916 			tbf_queue(vifp, mp);
2917 			tbf_process_q(vifp);
2918 		}
2919 	}
2920 	if (t->tbf_q_len == 0) {
2921 		id = vifp->v_timeout_id;
2922 		vifp->v_timeout_id = 0;
2923 	}
2924 	mutex_exit(&vifp->v_tbf->tbf_lock);
2925 	if (id != 0)
2926 		(void) untimeout(id);
2927 }
2928 
2929 /*
2930  * Adds a packet to the tbf queue at the interface.
2931  * The ipha is for mcastgrp destination for phyint and encap.
2932  */
2933 static void
2934 tbf_queue(struct vif *vifp, mblk_t *mp)
2935 {
2936 	struct tbf	*t = vifp->v_tbf;
2937 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2938 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2939 
2940 	if (ipst->ips_ip_mrtdebug > 1) {
2941 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2942 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2943 	}
2944 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2945 
2946 	if (t->tbf_t == NULL) {
2947 		/* Queue was empty */
2948 		t->tbf_q = mp;
2949 	} else {
2950 		/* Insert at tail */
2951 		t->tbf_t->b_next = mp;
2952 	}
2953 	/* set new tail pointer */
2954 	t->tbf_t = mp;
2955 
2956 	mp->b_next = mp->b_prev = NULL;
2957 
2958 	t->tbf_q_len++;
2959 }
2960 
2961 /*
2962  * Process the queue at the vif interface.
2963  * Drops the tbf_lock when sending packets.
2964  *
2965  * NOTE : The caller should quntimeout if the queue length is 0.
2966  */
2967 static void
2968 tbf_process_q(struct vif *vifp)
2969 {
2970 	mblk_t	*mp;
2971 	struct tbf	*t = vifp->v_tbf;
2972 	size_t	len;
2973 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2974 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2975 
2976 	if (ipst->ips_ip_mrtdebug > 1) {
2977 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2978 		    "tbf_process_q 1: vif %ld qlen = %d",
2979 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2980 	}
2981 
2982 	/*
2983 	 * Loop through the queue at the interface and send
2984 	 * as many packets as possible.
2985 	 */
2986 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2987 
2988 	while (t->tbf_q_len > 0) {
2989 		mp = t->tbf_q;
2990 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2991 
2992 		/* Determine if the packet can be sent */
2993 		if (len <= t->tbf_n_tok) {
2994 			/*
2995 			 * If so, reduce no. of tokens, dequeue the packet,
2996 			 * send the packet.
2997 			 */
2998 			t->tbf_n_tok -= len;
2999 
3000 			t->tbf_q = mp->b_next;
3001 			if (--t->tbf_q_len == 0) {
3002 				t->tbf_t = NULL;
3003 			}
3004 			mp->b_next = NULL;
3005 			/* Exit mutex before sending packet, then re-enter */
3006 			mutex_exit(&t->tbf_lock);
3007 			tbf_send_packet(vifp, mp);
3008 			mutex_enter(&t->tbf_lock);
3009 		} else
3010 			break;
3011 	}
3012 }
3013 
3014 /* Called at tbf timeout to update tokens, process q and reset timer.  */
3015 static void
3016 tbf_reprocess_q(void *arg)
3017 {
3018 	struct vif *vifp = arg;
3019 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3020 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3021 
3022 	mutex_enter(&vifp->v_tbf->tbf_lock);
3023 	vifp->v_timeout_id = 0;
3024 	tbf_update_tokens(vifp);
3025 
3026 	tbf_process_q(vifp);
3027 
3028 	if (vifp->v_tbf->tbf_q_len > 0) {
3029 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
3030 		    TBF_REPROCESS);
3031 	}
3032 	mutex_exit(&vifp->v_tbf->tbf_lock);
3033 
3034 	if (ipst->ips_ip_mrtdebug > 1) {
3035 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3036 		    "tbf_reprcess_q: vif %ld timeout id = %p",
3037 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
3038 	}
3039 }
3040 
3041 /*
3042  * Function that will selectively discard a member of the tbf queue,
3043  * based on the precedence value and the priority.
3044  *
3045  * NOTE : The caller should quntimeout if the queue length is 0.
3046  */
3047 static int
3048 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
3049 {
3050 	uint_t		p;
3051 	struct tbf		*t = vifp->v_tbf;
3052 	mblk_t		**np;
3053 	mblk_t		*last, *mp;
3054 	ill_t		*ill = vifp->v_ipif->ipif_ill;
3055 	ip_stack_t	*ipst = ill->ill_ipst;
3056 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3057 
3058 	if (ipst->ips_ip_mrtdebug > 1) {
3059 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3060 		    "dq_sel: vif %ld dst 0x%x",
3061 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
3062 	}
3063 
3064 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3065 	p = priority(vifp, ipha);
3066 
3067 	np = &t->tbf_q;
3068 	last = NULL;
3069 	while ((mp = *np) != NULL) {
3070 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
3071 			*np = mp->b_next;
3072 			/* If removing the last packet, fix the tail pointer */
3073 			if (mp == t->tbf_t)
3074 				t->tbf_t = last;
3075 			mp->b_prev = mp->b_next = NULL;
3076 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3077 			ip_drop_output("tbf_dq_send", mp, ill);
3078 			freemsg(mp);
3079 			/*
3080 			 * It's impossible for the queue to be empty, but
3081 			 * we check anyway.
3082 			 */
3083 			if (--t->tbf_q_len == 0) {
3084 				t->tbf_t = NULL;
3085 			}
3086 			ipst->ips_mrtstat->mrts_drop_sel++;
3087 			return (1);
3088 		}
3089 		np = &mp->b_next;
3090 		last = mp;
3091 	}
3092 	return (0);
3093 }
3094 
3095 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3096 static void
3097 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3098 {
3099 	ipif_t		*ipif = vifp->v_ipif;
3100 	ill_t		*ill = ipif->ipif_ill;
3101 	ip_stack_t	*ipst = ill->ill_ipst;
3102 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3103 	ipha_t		*ipha;
3104 
3105 	ipha = (ipha_t *)mp->b_rptr;
3106 	/* If encap tunnel options */
3107 	if (vifp->v_flags & VIFF_TUNNEL)  {
3108 		ip_xmit_attr_t	ixas;
3109 
3110 		if (ipst->ips_ip_mrtdebug > 1) {
3111 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3112 			    "tbf_send_packet: ENCAP tunnel vif %ld",
3113 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3114 		}
3115 		bzero(&ixas, sizeof (ixas));
3116 		ixas.ixa_flags =
3117 		    IXAF_IS_IPV4 | IXAF_NO_TTL_CHANGE | IXAF_VERIFY_SOURCE;
3118 		ixas.ixa_ipst = ipst;
3119 		ixas.ixa_ifindex = 0;
3120 		ixas.ixa_cred = kcred;
3121 		ixas.ixa_cpid = NOPID;
3122 		ixas.ixa_tsl = NULL;
3123 		ixas.ixa_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3124 		ixas.ixa_pktlen = ntohs(ipha->ipha_length);
3125 		ixas.ixa_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3126 
3127 		/*
3128 		 * Feed into ip_output_simple which will set the ident field
3129 		 * and checksum the encapsulating header.
3130 		 * BSD gets the cached route vifp->v_route from ip_output()
3131 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3132 		 * One could make multicast forwarding faster by putting an
3133 		 * ip_xmit_attr_t in each vif thereby caching the ire/nce.
3134 		 */
3135 		(void) ip_output_simple(mp, &ixas);
3136 		ixa_cleanup(&ixas);
3137 		return;
3138 
3139 		/* phyint */
3140 	} else {
3141 		/* Need to loop back to members on the outgoing interface. */
3142 		ipaddr_t	dst;
3143 		ip_recv_attr_t	iras;
3144 		nce_t		*nce;
3145 
3146 		bzero(&iras, sizeof (iras));
3147 		iras.ira_flags = IRAF_IS_IPV4;
3148 		iras.ira_ill = iras.ira_rill = ill;
3149 		iras.ira_ruifindex = ill->ill_phyint->phyint_ifindex;
3150 		iras.ira_zoneid = GLOBAL_ZONEID; /* Multicast router in GZ */
3151 		iras.ira_pktlen = ntohs(ipha->ipha_length);
3152 		iras.ira_ip_hdr_length = IPH_HDR_LENGTH(ipha);
3153 
3154 		dst = ipha->ipha_dst;
3155 		if (ill_hasmembers_v4(ill, dst)) {
3156 			iras.ira_flags |= IRAF_LOOPBACK_COPY;
3157 		}
3158 		if (ipst->ips_ip_mrtdebug > 1) {
3159 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3160 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3161 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3162 		}
3163 		/*
3164 		 * Find an NCE which matches the nexthop.
3165 		 * For a pt-pt interface we use the other end of the pt-pt
3166 		 * link.
3167 		 */
3168 		if (ipif->ipif_flags & IPIF_POINTOPOINT) {
3169 			dst = ipif->ipif_pp_dst_addr;
3170 			nce = arp_nce_init(ill, dst, ill->ill_net_type);
3171 		} else {
3172 			nce = arp_nce_init(ill, dst, IRE_MULTICAST);
3173 		}
3174 		if (nce == NULL) {
3175 			BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3176 			ip_drop_output("tbf_send_packet - no nce", mp, ill);
3177 			freemsg(mp);
3178 			return;
3179 		}
3180 
3181 		/*
3182 		 * We don't remeber the incoming ill. Thus we
3183 		 * pretend the  packet arrived on the outbound ill. This means
3184 		 * statistics for input errors will be increased on the wrong
3185 		 * ill but that isn't a big deal.
3186 		 */
3187 		ip_forward_xmit_v4(nce, ill, mp, ipha, &iras, ill->ill_mc_mtu,
3188 		    0);
3189 		ASSERT(!(iras.ira_flags & IRAF_IPSEC_SECURE));
3190 
3191 		nce_refrele(nce);
3192 	}
3193 }
3194 
3195 /*
3196  * Determine the current time and then the elapsed time (between the last time
3197  * and time now).  Update the no. of tokens in the bucket.
3198  */
3199 static void
3200 tbf_update_tokens(struct vif *vifp)
3201 {
3202 	timespec_t	tp;
3203 	hrtime_t	tm;
3204 	struct tbf	*t = vifp->v_tbf;
3205 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3206 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3207 
3208 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3209 
3210 	/* Time in secs and nsecs, rate limit in kbits/sec */
3211 	gethrestime(&tp);
3212 
3213 	/*LINTED*/
3214 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3215 
3216 	/*
3217 	 * This formula is actually
3218 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3219 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3220 	 *
3221 	 * The (1000/1024) was introduced in add_vif to optimize
3222 	 * this divide into a shift.
3223 	 */
3224 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3225 	t->tbf_last_pkt_t = tp;
3226 
3227 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3228 		t->tbf_n_tok = MAX_BKT_SIZE;
3229 	if (ipst->ips_ip_mrtdebug > 1) {
3230 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3231 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3232 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3233 	}
3234 }
3235 
3236 /*
3237  * Priority currently is based on port nos.
3238  * Different forwarding mechanisms have different ways
3239  * of obtaining the port no. Hence, the vif must be
3240  * given along with the packet itself.
3241  *
3242  */
3243 static int
3244 priority(struct vif *vifp, ipha_t *ipha)
3245 {
3246 	int prio;
3247 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3248 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3249 
3250 	/* Temporary hack; may add general packet classifier some day */
3251 
3252 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3253 
3254 	/*
3255 	 * The UDP port space is divided up into four priority ranges:
3256 	 * [0, 16384)	: unclassified - lowest priority
3257 	 * [16384, 32768)	: audio - highest priority
3258 	 * [32768, 49152)	: whiteboard - medium priority
3259 	 * [49152, 65536)	: video - low priority
3260 	 */
3261 
3262 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3263 		struct udphdr *udp =
3264 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3265 		switch (ntohs(udp->uh_dport) & 0xc000) {
3266 		case 0x4000:
3267 			prio = 70;
3268 			break;
3269 		case 0x8000:
3270 			prio = 60;
3271 			break;
3272 		case 0xc000:
3273 			prio = 55;
3274 			break;
3275 		default:
3276 			prio = 50;
3277 			break;
3278 		}
3279 		if (ipst->ips_ip_mrtdebug > 1) {
3280 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3281 			    "priority: port %x prio %d\n",
3282 			    ntohs(udp->uh_dport), prio);
3283 		}
3284 	} else
3285 		prio = 50;  /* default priority */
3286 	return (prio);
3287 }
3288 
3289 /*
3290  * End of token bucket filter modifications
3291  */
3292 
3293 
3294 
3295 /*
3296  * Produces data for netstat -M.
3297  */
3298 int
3299 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3300 {
3301 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3302 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3303 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3304 		sizeof (struct mrtstat))) {
3305 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3306 		    (size_t)sizeof (struct mrtstat)));
3307 		return (0);
3308 	}
3309 	return (1);
3310 }
3311 
3312 /*
3313  * Sends info for SNMP's MIB.
3314  */
3315 int
3316 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3317 {
3318 	struct vifctl 	vi;
3319 	vifi_t		vifi;
3320 
3321 	mutex_enter(&ipst->ips_numvifs_mutex);
3322 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3323 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3324 			continue;
3325 		/*
3326 		 * No locks here, an approximation is fine.
3327 		 */
3328 		vi.vifc_vifi = vifi;
3329 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3330 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3331 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3332 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3333 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3334 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3335 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3336 
3337 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3338 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3339 			    (size_t)sizeof (vi)));
3340 			mutex_exit(&ipst->ips_numvifs_mutex);
3341 			return (0);
3342 		}
3343 	}
3344 	mutex_exit(&ipst->ips_numvifs_mutex);
3345 	return (1);
3346 }
3347 
3348 /*
3349  * Called by ip_snmp_get to send up multicast routing table.
3350  */
3351 int
3352 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3353 {
3354 	int			i, j;
3355 	struct mfc		*rt;
3356 	struct mfcctl	mfcc;
3357 
3358 	/*
3359 	 * Make sure multicast has not been turned off.
3360 	 */
3361 	if (is_mrouter_off(ipst))
3362 		return (1);
3363 
3364 	/* Loop over all hash buckets and their chains */
3365 	for (i = 0; i < MFCTBLSIZ; i++) {
3366 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3367 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3368 			mutex_enter(&rt->mfc_mutex);
3369 			if (rt->mfc_rte != NULL ||
3370 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3371 				mutex_exit(&rt->mfc_mutex);
3372 				continue;
3373 			}
3374 			mfcc.mfcc_origin = rt->mfc_origin;
3375 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3376 			mfcc.mfcc_parent = rt->mfc_parent;
3377 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3378 			mutex_enter(&ipst->ips_numvifs_mutex);
3379 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3380 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3381 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3382 				mfcc.mfcc_ttls[j] = 0;
3383 			mutex_exit(&ipst->ips_numvifs_mutex);
3384 
3385 			mutex_exit(&rt->mfc_mutex);
3386 			if (!snmp_append_data(mp, (char *)&mfcc,
3387 			    sizeof (mfcc))) {
3388 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3389 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3390 				    (size_t)sizeof (mfcc)));
3391 				return (0);
3392 			}
3393 		}
3394 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3395 	}
3396 	return (1);
3397 }
3398