xref: /titanic_52/usr/src/uts/common/inet/ip/ip_mroute.c (revision 1529f529004c61fcfd0d95ab79b0f257d6ad4451)
1 /*
2  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 /*
6  * CDDL HEADER START
7  *
8  * The contents of this file are subject to the terms of the
9  * Common Development and Distribution License (the "License").
10  * You may not use this file except in compliance with the License.
11  *
12  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
13  * or http://www.opensolaris.org/os/licensing.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  *
17  * When distributing Covered Code, include this CDDL HEADER in each
18  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
19  * If applicable, add the following below this CDDL HEADER, with the
20  * fields enclosed by brackets "[]" replaced with your own identifying
21  * information: Portions Copyright [yyyy] [name of copyright owner]
22  *
23  * CDDL HEADER END
24  */
25 /*
26  * Copyright 2008 Sun Microsystems, Inc.
27  * All rights reserved.  Use is subject to license terms.
28  */
29 /* Copyright (c) 1990 Mentat Inc. */
30 
31 /*
32  * Procedures for the kernel part of DVMRP,
33  * a Distance-Vector Multicast Routing Protocol.
34  * (See RFC-1075)
35  * Written by David Waitzman, BBN Labs, August 1988.
36  * Modified by Steve Deering, Stanford, February 1989.
37  * Modified by Mark J. Steiglitz, Stanford, May, 1991
38  * Modified by Van Jacobson, LBL, January 1993
39  * Modified by Ajit Thyagarajan, PARC, August 1993
40  * Modified by Bill Fenner, PARC, April 1995
41  *
42  * MROUTING 3.5
43  */
44 
45 /*
46  * TODO
47  * - function pointer field in vif, void *vif_sendit()
48  */
49 
50 #include <sys/types.h>
51 #include <sys/stream.h>
52 #include <sys/stropts.h>
53 #include <sys/strlog.h>
54 #include <sys/systm.h>
55 #include <sys/ddi.h>
56 #include <sys/cmn_err.h>
57 #include <sys/zone.h>
58 
59 #include <sys/param.h>
60 #include <sys/socket.h>
61 #include <sys/vtrace.h>
62 #include <sys/debug.h>
63 #include <net/if.h>
64 #include <sys/sockio.h>
65 #include <netinet/in.h>
66 #include <net/if_dl.h>
67 
68 #include <inet/common.h>
69 #include <inet/mi.h>
70 #include <inet/nd.h>
71 #include <inet/mib2.h>
72 #include <netinet/ip6.h>
73 #include <inet/ip.h>
74 #include <inet/snmpcom.h>
75 
76 #include <netinet/igmp.h>
77 #include <netinet/igmp_var.h>
78 #include <netinet/udp.h>
79 #include <netinet/ip_mroute.h>
80 #include <inet/ip_multi.h>
81 #include <inet/ip_ire.h>
82 #include <inet/ip_if.h>
83 #include <inet/ipclassifier.h>
84 
85 #include <netinet/pim.h>
86 
87 
88 /*
89  * MT Design:
90  *
91  * There are three main data structures viftable, mfctable and tbftable that
92  * need to be protected against MT races.
93  *
94  * viftable is a fixed length array of vif structs. There is no lock to protect
95  * the whole array, instead each struct is protected by its own individual lock.
96  * The value of v_marks in conjunction with the value of v_refcnt determines the
97  * current state of a vif structure. One special state that needs mention
98  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
99  * that the vif is being initialized.
100  * Each structure is freed when the refcnt goes down to zero. If a delete comes
101  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
102  * which prevents the struct from further use.  When the refcnt goes to zero
103  * the struct is freed and is marked VIF_MARK_NOTINUSE.
104  * The vif struct stores a pointer to the ipif in v_ipif; to prevent the ipif/ill
105  * from going away a refhold is put on the ipif before using it. See
106  * lock_good_vif() and unlock_good_vif().
107  *
108  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
109  * of the vif struct.
110  *
111  * tbftable is also a fixed length array of tbf structs and is only accessed
112  * via v_tbf.  It is protected by its own lock tbf_lock.
113  *
114  * Lock Ordering is
115  * v_lock --> tbf_lock
116  * v_lock --> ill_lock
117  *
118  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
119  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
120  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
121  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
122  * protect the struct elements.
123  *
124  * mfc structs are dynamically allocated and are singly linked
125  * at the head of the chain. When an mfc structure is to be deleted
126  * it is marked condemned and so is the state in the bucket struct.
127  * When the last walker of the hash bucket exits all the mfc structs
128  * marked condemned are freed.
129  *
130  * Locking Hierarchy:
131  * The bucket lock should be acquired before the mfc struct lock.
132  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
133  * operations on the bucket struct.
134  *
135  * last_encap_lock and numvifs_mutex should be acquired after
136  * acquiring vif or mfc locks. These locks protect some global variables.
137  *
138  * The statistics are not currently protected by a lock
139  * causing the stats to be approximate, not exact.
140  */
141 
/* From mrouted: vif index meaning "no route for src". */
#define	NO_VIF	MAXVIFS
143 
144 /*
145  * Timeouts:
146  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
147  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
148  *	SunOS 5.x uses mfc->timeout for each mfc.
149  *	Some Unixes are limited in the number of simultaneous timeouts
150  * 	that can be run, SunOS 5.x does not have this restriction.
151  */
152 
153 /*
154  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
155  * UPCALL_EXPIRE is the number of timeouts before a particular upcall
156  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
157  */
#define	EXPIRE_TIMEOUT	(hz / 4)	/* in ticks: fires 4 times a second */
#define	UPCALL_EXPIRE	6		/* timeouts before an upcall expires */
160 
/*
 * Hash a (source address, group address) pair into an mfctable index.
 * Each address is folded by XORing it with itself shifted right by 10 and
 * by 20 bits; the two folded values are XORed together and handed to
 * MFCHASHMOD.
 */
#define	MFCHASH(a, g) \
	MFCHASHMOD(((a) ^ ((a) >> 10) ^ ((a) >> 20)) ^ \
	((g) ^ ((g) >> 10) ^ ((g) >> 20)))
166 
#define	TBF_REPROCESS	(hz / 100)	/* in ticks: 100 times a second */
168 
/* Marker value identifying a PIM packet that came in on a Register vif. */
#define	PIM_REGISTER_MARKER	0xffffffff
171 
172 /* Function declarations */
173 static int	add_mfc(struct mfcctl *, ip_stack_t *);
174 static int	add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
175 static int	del_mfc(struct mfcctl *, ip_stack_t *);
176 static int	del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
177 static void	del_vifp(struct vif *);
178 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
179 static void	expire_upcalls(void *);
180 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
181 static void	free_queue(struct mfc *);
182 static int	get_assert(uchar_t *, ip_stack_t *);
183 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
184 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
185 static int	get_version(uchar_t *);
186 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
187 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
188 		    ipaddr_t, struct mfc *);
189 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
190 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
191 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
192 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
193 static int	set_assert(int *, ip_stack_t *);
194 
195 /*
196  * Token Bucket Filter functions
197  */
198 static int  priority(struct vif *, ipha_t *);
199 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
200 static int  tbf_dq_sel(struct vif *, ipha_t *);
201 static void tbf_process_q(struct vif *);
202 static void tbf_queue(struct vif *, mblk_t *);
203 static void tbf_reprocess_q(void *);
204 static void tbf_send_packet(struct vif *, mblk_t *);
205 static void tbf_update_tokens(struct vif *);
206 static void release_mfc(struct mfcb *);
207 
208 static boolean_t is_mrouter_off(ip_stack_t *);
209 /*
210  * Encapsulation packets
211  */
212 
#define	ENCAP_TTL	64	/* ttl placed in the prototype encap header */
214 
215 /* prototype IP hdr for encapsulated packets */
216 static ipha_t multicast_encap_iphdr = {
217 	IP_SIMPLE_HDR_VERSION,
218 	0,				/* tos */
219 	sizeof (ipha_t),		/* total length */
220 	0,				/* id */
221 	0,				/* frag offset */
222 	ENCAP_TTL, IPPROTO_ENCAP,
223 	0,				/* checksum */
224 };
225 
/*
 * Minimum spacing between assert notification messages, expressed in
 * nanoseconds (3000000000 nsec == 3 seconds).
 */
#define	ASSERT_MSG_TIME		3000000000
230 
231 
/* Take a reference on a vif; v_lock is acquired and dropped internally. */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}
237 
/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * If this was the last reference and the vif is marked condemned, the vif
 * is handed to del_vifp() with v_lock still held; in every other case
 * v_lock is released here.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}
247 
/*
 * Drop a reference on a vif.  Acquires v_lock and then behaves exactly like
 * VIF_REFRELE_LOCKED(): the last release of a condemned vif calls
 * del_vifp() with the lock held, otherwise the lock is simply dropped.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
258 
/* Register a walker on an mfc hash bucket (bumps mfcb_refcnt). */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
265 
/*
 * Unregister a walker from an mfc hash bucket.  When the last walker
 * leaves a bucket that is marked condemned, release_mfc() is called with
 * mfcb_lock held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
275 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 *
 *	mfcbp	pointer to the hash bucket (struct mfcb) to search
 *	o	origin (source) address to match against mfc_origin.s_addr
 *	g	group address to match against mfc_mcastgrp.s_addr
 *	rt	lvalue of type struct mfc *; set to the first matching entry
 *		on the bucket chain, or to NULL when nothing matches
 *
 * Entries with pending upcalls (mfc_rte != NULL) and entries marked
 * condemned are skipped.
 * Every argument is parenthesized in the expansion so that expressions
 * such as "tbl + i" or "cond ? a : b" can be passed safely.  Note that
 * o and g are evaluated once per chain entry.
 * Type of service parameter to be added in the future!
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt = NULL; \
	(rt) = NULL; \
	_mb_rt = (mfcbp)->mfcb_mfc; \
	while (_mb_rt) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
		_mb_rt = _mb_rt->mfc_next; \
	} \
}
297 
/*
 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 * are inefficient. We use gethrestime() which returns a timespec_t with
 * sec and nsec, the resolution is machine dependent.
 * The following 2 macros have been changed to use nsec instead of usec.
 */
/*
 * Macro to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 *	a, b	objects with tv_sec and tv_nsec members
 *	delta	lvalue that receives (a - b) in nanoseconds; it should be
 *		a hrtime_t (64 bit) so that multi-second differences fit
 *
 * Differences of one or two seconds are handled with additions only.  For
 * any other non-zero difference in seconds the multiplication is done in
 * 64-bit arithmetic: a plain int product of 1000000000 and the second
 * count overflows as soon as the difference reaches 3 seconds.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
326 
/* True when time a is strictly earlier than time b (tv_sec, then tv_nsec). */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
329 
330 /*
331  * Handle MRT setsockopt commands to modify the multicast routing tables.
332  */
333 int
334 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
335     int datalen, mblk_t *first_mp)
336 {
337 	conn_t		*connp = Q_TO_CONN(q);
338 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
339 
340 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
341 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
342 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
343 		return (EACCES);
344 	}
345 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
346 
347 	if (checkonly) {
348 		/*
349 		 * do not do operation, just pretend to - new T_CHECK
350 		 * Note: Even routines further on can probably fail but
351 		 * this T_CHECK stuff is only to please XTI so it not
352 		 * necessary to be perfect.
353 		 */
354 		switch (cmd) {
355 		case MRT_INIT:
356 		case MRT_DONE:
357 		case MRT_ADD_VIF:
358 		case MRT_DEL_VIF:
359 		case MRT_ADD_MFC:
360 		case MRT_DEL_MFC:
361 		case MRT_ASSERT:
362 			return (0);
363 		default:
364 			return (EOPNOTSUPP);
365 		}
366 	}
367 
368 	/*
369 	 * make sure no command is issued after multicast routing has been
370 	 * turned off.
371 	 */
372 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
373 		if (is_mrouter_off(ipst))
374 			return (EINVAL);
375 	}
376 
377 	switch (cmd) {
378 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
379 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
380 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
381 			    first_mp, ipst));
382 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
383 			    ipst));
384 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
385 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
386 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
387 	default:	   return (EOPNOTSUPP);
388 	}
389 }
390 
391 /*
392  * Handle MRT getsockopt commands
393  */
394 int
395 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
396 {
397 	conn_t		*connp = Q_TO_CONN(q);
398 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
399 
400 	if (connp != ipst->ips_ip_g_mrouter)
401 		return (EACCES);
402 
403 	switch (cmd) {
404 	case MRT_VERSION:	return (get_version((uchar_t *)data));
405 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
406 	default:		return (EOPNOTSUPP);
407 	}
408 }
409 
410 /*
411  * Handle ioctl commands to obtain information from the cache.
412  * Called with shared access to IP. These are read_only ioctls.
413  */
414 /* ARGSUSED */
415 int
416 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
417     ip_ioctl_cmd_t *ipip, void *if_req)
418 {
419 	mblk_t	*mp1;
420 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
421 	conn_t		*connp = Q_TO_CONN(q);
422 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
423 
424 	/* Existence verified in ip_wput_nondata */
425 	mp1 = mp->b_cont->b_cont;
426 
427 	switch (iocp->ioc_cmd) {
428 	case (SIOCGETVIFCNT):
429 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
430 	case (SIOCGETSGCNT):
431 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
432 	case (SIOCGETLSGCNT):
433 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
434 	default:
435 		return (EINVAL);
436 	}
437 }
438 
439 /*
440  * Returns the packet, byte, rpf-failure count for the source, group provided.
441  */
442 static int
443 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
444 {
445 	struct mfc *rt;
446 	struct mfcb *mfcbp;
447 
448 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
449 	MFCB_REFHOLD(mfcbp);
450 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
451 
452 	if (rt != NULL) {
453 		mutex_enter(&rt->mfc_mutex);
454 		req->pktcnt   = rt->mfc_pkt_cnt;
455 		req->bytecnt  = rt->mfc_byte_cnt;
456 		req->wrong_if = rt->mfc_wrong_if;
457 		mutex_exit(&rt->mfc_mutex);
458 	} else
459 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
460 
461 	MFCB_REFRELE(mfcbp);
462 	return (0);
463 }
464 
465 /*
466  * Returns the packet, byte, rpf-failure count for the source, group provided.
467  * Uses larger counters and IPv6 addresses.
468  */
469 /* ARGSUSED XXX until implemented */
470 static int
471 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
472 {
473 	/* XXX TODO SIOCGETLSGCNT */
474 	return (ENXIO);
475 }
476 
477 /*
478  * Returns the input and output packet and byte counts on the vif provided.
479  */
480 static int
481 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
482 {
483 	vifi_t vifi = req->vifi;
484 
485 	if (vifi >= ipst->ips_numvifs)
486 		return (EINVAL);
487 
488 	/*
489 	 * No locks here, an approximation is fine.
490 	 */
491 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
492 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
493 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
494 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
495 
496 	return (0);
497 }
498 
499 static int
500 get_version(uchar_t *data)
501 {
502 	int *v = (int *)data;
503 
504 	*v = 0x0305;	/* XXX !!!! */
505 
506 	return (0);
507 }
508 
509 /*
510  * Set PIM assert processing global.
511  */
512 static int
513 set_assert(int *i, ip_stack_t *ipst)
514 {
515 	if ((*i != 1) && (*i != 0))
516 		return (EINVAL);
517 
518 	ipst->ips_pim_assert = *i;
519 
520 	return (0);
521 }
522 
523 /*
524  * Get PIM assert processing global.
525  */
526 static int
527 get_assert(uchar_t *data, ip_stack_t *ipst)
528 {
529 	int *i = (int *)data;
530 
531 	*i = ipst->ips_pim_assert;
532 
533 	return (0);
534 }
535 
536 /*
537  * Enable multicast routing.
538  */
539 static int
540 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
541 {
542 	int	*v;
543 
544 	if (data == NULL || (datalen != sizeof (int)))
545 		return (ENOPROTOOPT);
546 
547 	v = (int *)data;
548 	if (*v != 1)
549 		return (ENOPROTOOPT);
550 
551 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
552 	if (ipst->ips_ip_g_mrouter != NULL) {
553 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
554 		return (EADDRINUSE);
555 	}
556 
557 	/*
558 	 * MRT_INIT should only be allowed for RAW sockets, but we double
559 	 * check.
560 	 */
561 	if (!IPCL_IS_RAWIP(connp)) {
562 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
563 		return (EINVAL);
564 	}
565 
566 	ipst->ips_ip_g_mrouter = connp;
567 	connp->conn_multi_router = 1;
568 	/* In order for tunnels to work we have to turn ip_g_forward on */
569 	if (!WE_ARE_FORWARDING(ipst)) {
570 		if (ipst->ips_ip_mrtdebug > 1) {
571 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
572 			    "ip_mrouter_init: turning on forwarding");
573 		}
574 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
575 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
576 	}
577 
578 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
579 	return (0);
580 }
581 
582 void
583 ip_mrouter_stack_init(ip_stack_t *ipst)
584 {
585 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
586 
587 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
588 	    KM_SLEEP);
589 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
590 	/*
591 	 * mfctable:
592 	 * Includes all mfcs, including waiting upcalls.
593 	 * Multiple mfcs per bucket.
594 	 */
595 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
596 	    KM_SLEEP);
597 	/*
598 	 * Define the token bucket filter structures.
599 	 * tbftable -> each vif has one of these for storing info.
600 	 */
601 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
602 
603 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
604 
605 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
606 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
607 }
608 
609 /*
610  * Disable multicast routing.
611  * Didn't use global timeout_val (BSD version), instead check the mfctable.
612  */
613 int
614 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
615 {
616 	conn_t		*mrouter;
617 	vifi_t 		vifi;
618 	struct mfc	*mfc_rt;
619 	int		i;
620 
621 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
622 	if (ipst->ips_ip_g_mrouter == NULL) {
623 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
624 		return (EINVAL);
625 	}
626 
627 	mrouter = ipst->ips_ip_g_mrouter;
628 
629 	if (ipst->ips_saved_ip_g_forward != -1) {
630 		if (ipst->ips_ip_mrtdebug > 1) {
631 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
632 			    "ip_mrouter_done: turning off forwarding");
633 		}
634 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
635 		ipst->ips_saved_ip_g_forward = -1;
636 	}
637 
638 	/*
639 	 * Always clear cache when vifs change.
640 	 * No need to get ipst->ips_last_encap_lock since we are running as
641 	 * a writer.
642 	 */
643 	mutex_enter(&ipst->ips_last_encap_lock);
644 	ipst->ips_last_encap_src = 0;
645 	ipst->ips_last_encap_vif = NULL;
646 	mutex_exit(&ipst->ips_last_encap_lock);
647 	mrouter->conn_multi_router = 0;
648 
649 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
650 
651 	/*
652 	 * For each phyint in use,
653 	 * disable promiscuous reception of all IP multicasts.
654 	 */
655 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
656 		struct vif *vifp = ipst->ips_vifs + vifi;
657 
658 		mutex_enter(&vifp->v_lock);
659 		/*
660 		 * if the vif is active mark it condemned.
661 		 */
662 		if (vifp->v_marks & VIF_MARK_GOOD) {
663 			ASSERT(vifp->v_ipif != NULL);
664 			ipif_refhold(vifp->v_ipif);
665 			/* Phyint only */
666 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
667 				ipif_t *ipif = vifp->v_ipif;
668 				ipsq_t  *ipsq;
669 				boolean_t suc;
670 				ill_t *ill;
671 
672 				ill = ipif->ipif_ill;
673 				suc = B_FALSE;
674 				if (mp == NULL) {
675 					/*
676 					 * being called from ip_close,
677 					 * lets do it synchronously.
678 					 * Clear VIF_MARK_GOOD and
679 					 * set VIF_MARK_CONDEMNED.
680 					 */
681 					vifp->v_marks &= ~VIF_MARK_GOOD;
682 					vifp->v_marks |= VIF_MARK_CONDEMNED;
683 					mutex_exit(&(vifp)->v_lock);
684 					suc = ipsq_enter(ill, B_FALSE, NEW_OP);
685 					ipsq = ill->ill_phyint->phyint_ipsq;
686 				} else {
687 					ipsq = ipsq_try_enter(ipif, NULL,
688 					    mrouter->conn_wq, mp,
689 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
690 					if (ipsq == NULL) {
691 						mutex_exit(&(vifp)->v_lock);
692 						ipif_refrele(ipif);
693 						return (EINPROGRESS);
694 					}
695 					/*
696 					 * Clear VIF_MARK_GOOD and
697 					 * set VIF_MARK_CONDEMNED.
698 					 */
699 					vifp->v_marks &= ~VIF_MARK_GOOD;
700 					vifp->v_marks |= VIF_MARK_CONDEMNED;
701 					mutex_exit(&(vifp)->v_lock);
702 					suc = B_TRUE;
703 				}
704 
705 				if (suc) {
706 					(void) ip_delmulti(INADDR_ANY, ipif,
707 					    B_TRUE, B_TRUE);
708 					ipsq_exit(ipsq);
709 				}
710 				mutex_enter(&vifp->v_lock);
711 			}
712 			/*
713 			 * decreases the refcnt added in add_vif.
714 			 * and release v_lock.
715 			 */
716 			VIF_REFRELE_LOCKED(vifp);
717 		} else {
718 			mutex_exit(&vifp->v_lock);
719 			continue;
720 		}
721 	}
722 
723 	mutex_enter(&ipst->ips_numvifs_mutex);
724 	ipst->ips_numvifs = 0;
725 	ipst->ips_pim_assert = 0;
726 	ipst->ips_reg_vif_num = ALL_VIFS;
727 	mutex_exit(&ipst->ips_numvifs_mutex);
728 
729 	/*
730 	 * Free upcall msgs.
731 	 * Go through mfctable and stop any outstanding upcall
732 	 * timeouts remaining on mfcs.
733 	 */
734 	for (i = 0; i < MFCTBLSIZ; i++) {
735 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
736 		ipst->ips_mfcs[i].mfcb_refcnt++;
737 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
738 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
739 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
740 		while (mfc_rt) {
741 			/* Free upcalls */
742 			mutex_enter(&mfc_rt->mfc_mutex);
743 			if (mfc_rt->mfc_rte != NULL) {
744 				if (mfc_rt->mfc_timeout_id != 0) {
745 					/*
746 					 * OK to drop the lock as we have
747 					 * a refcnt on the bucket. timeout
748 					 * can fire but it will see that
749 					 * mfc_timeout_id == 0 and not do
750 					 * anything. see expire_upcalls().
751 					 */
752 					mfc_rt->mfc_timeout_id = 0;
753 					mutex_exit(&mfc_rt->mfc_mutex);
754 					(void) untimeout(
755 					    mfc_rt->mfc_timeout_id);
756 						mfc_rt->mfc_timeout_id = 0;
757 					mutex_enter(&mfc_rt->mfc_mutex);
758 
759 					/*
760 					 * all queued upcall packets
761 					 * and mblk will be freed in
762 					 * release_mfc().
763 					 */
764 				}
765 			}
766 
767 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
768 
769 			mutex_exit(&mfc_rt->mfc_mutex);
770 			mfc_rt = mfc_rt->mfc_next;
771 		}
772 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
773 	}
774 
775 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
776 	ipst->ips_ip_g_mrouter = NULL;
777 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
778 	return (0);
779 }
780 
781 void
782 ip_mrouter_stack_destroy(ip_stack_t *ipst)
783 {
784 	struct mfcb *mfcbp;
785 	struct mfc  *rt;
786 	int i;
787 
788 	for (i = 0; i < MFCTBLSIZ; i++) {
789 		mfcbp = &ipst->ips_mfcs[i];
790 
791 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
792 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
793 			    i);
794 
795 			mfcbp->mfcb_mfc = rt->mfc_next;
796 			free_queue(rt);
797 			mi_free(rt);
798 		}
799 	}
800 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
801 	ipst->ips_vifs = NULL;
802 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
803 	ipst->ips_mrtstat = NULL;
804 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
805 	ipst->ips_mfcs = NULL;
806 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
807 	ipst->ips_tbfs = NULL;
808 
809 	mutex_destroy(&ipst->ips_last_encap_lock);
810 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
811 }
812 
813 static boolean_t
814 is_mrouter_off(ip_stack_t *ipst)
815 {
816 	conn_t	*mrouter;
817 
818 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
819 	if (ipst->ips_ip_g_mrouter == NULL) {
820 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
821 		return (B_TRUE);
822 	}
823 
824 	mrouter = ipst->ips_ip_g_mrouter;
825 	if (mrouter->conn_multi_router == 0) {
826 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
827 		return (B_TRUE);
828 	}
829 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
830 	return (B_FALSE);
831 }
832 
833 static void
834 unlock_good_vif(struct vif *vifp)
835 {
836 	ASSERT(vifp->v_ipif != NULL);
837 	ipif_refrele(vifp->v_ipif);
838 	VIF_REFRELE(vifp);
839 }
840 
841 static boolean_t
842 lock_good_vif(struct vif *vifp)
843 {
844 	mutex_enter(&vifp->v_lock);
845 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
846 		mutex_exit(&vifp->v_lock);
847 		return (B_FALSE);
848 	}
849 
850 	ASSERT(vifp->v_ipif != NULL);
851 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
852 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
853 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
854 		mutex_exit(&vifp->v_lock);
855 		return (B_FALSE);
856 	}
857 	ipif_refhold_locked(vifp->v_ipif);
858 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
859 	vifp->v_refcnt++;
860 	mutex_exit(&vifp->v_lock);
861 	return (B_TRUE);
862 }
863 
864 /*
865  * Add a vif to the vif table.
866  */
867 static int
868 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
869 {
870 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
871 	ipif_t		*ipif;
872 	int		error;
873 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
874 	ipsq_t  	*ipsq;
875 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
876 
877 	ASSERT(connp != NULL);
878 
879 	if (vifcp->vifc_vifi >= MAXVIFS)
880 		return (EINVAL);
881 
882 	if (is_mrouter_off(ipst))
883 		return (EINVAL);
884 
885 	mutex_enter(&vifp->v_lock);
886 	/*
887 	 * Viftable entry should be 0.
888 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
889 	 * initialized.
890 	 *
891 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
892 	 * request while the delete is in progress, mrouted only sends add
893 	 * requests when a new interface is added and the new interface cannot
894 	 * have the same vifi as an existing interface. We make sure that
895 	 * ill_delete will block till the vif is deleted by adding a refcnt
896 	 * to ipif in del_vif().
897 	 */
898 	if (vifp->v_lcl_addr.s_addr != 0 ||
899 	    vifp->v_marks != 0 ||
900 	    vifp->v_refcnt != 0) {
901 		mutex_exit(&vifp->v_lock);
902 		return (EADDRINUSE);
903 	}
904 
905 	/* Incoming vif should not be 0 */
906 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
907 		mutex_exit(&vifp->v_lock);
908 		return (EINVAL);
909 	}
910 
911 	vifp->v_refcnt++;
912 	mutex_exit(&vifp->v_lock);
913 	/* Find the interface with the local address */
914 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
915 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
916 	    ip_restart_optmgmt, &error, ipst);
917 	if (ipif == NULL) {
918 		VIF_REFRELE(vifp);
919 		if (error == EINPROGRESS)
920 			return (error);
921 		return (EADDRNOTAVAIL);
922 	}
923 
924 	/*
925 	 * We have to be exclusive as we have to call ip_addmulti()
926 	 * This is the best position to try to be exclusive in case
927 	 * we have to wait.
928 	 */
929 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
930 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
931 	if ((ipsq) == NULL) {
932 		VIF_REFRELE(vifp);
933 		ipif_refrele(ipif);
934 		return (EINPROGRESS);
935 	}
936 
937 	if (ipst->ips_ip_mrtdebug > 1) {
938 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
939 		    "add_vif: src 0x%x enter",
940 		    vifcp->vifc_lcl_addr.s_addr);
941 	}
942 
943 	mutex_enter(&vifp->v_lock);
944 	/*
945 	 * Always clear cache when vifs change.
946 	 * Needed to ensure that src isn't left over from before vif was added.
947 	 * No need to get last_encap_lock, since we are running as a writer.
948 	 */
949 
950 	mutex_enter(&ipst->ips_last_encap_lock);
951 	ipst->ips_last_encap_src = 0;
952 	ipst->ips_last_encap_vif = NULL;
953 	mutex_exit(&ipst->ips_last_encap_lock);
954 
955 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
956 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
957 			cmn_err(CE_WARN,
958 			    "add_vif: source route tunnels not supported\n");
959 			VIF_REFRELE_LOCKED(vifp);
960 			ipif_refrele(ipif);
961 			ipsq_exit(ipsq);
962 			return (EOPNOTSUPP);
963 		}
964 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
965 
966 	} else {
967 		/* Phyint or Register vif */
968 		if (vifcp->vifc_flags & VIFF_REGISTER) {
969 			/*
970 			 * Note: Since all IPPROTO_IP level options (including
971 			 * MRT_ADD_VIF) are done exclusively via
972 			 * ip_optmgmt_writer(), a lock is not necessary to
973 			 * protect reg_vif_num.
974 			 */
975 			mutex_enter(&ipst->ips_numvifs_mutex);
976 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
977 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
978 				mutex_exit(&ipst->ips_numvifs_mutex);
979 			} else {
980 				mutex_exit(&ipst->ips_numvifs_mutex);
981 				VIF_REFRELE_LOCKED(vifp);
982 				ipif_refrele(ipif);
983 				ipsq_exit(ipsq);
984 				return (EADDRINUSE);
985 			}
986 		}
987 
988 		/* Make sure the interface supports multicast */
989 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
990 			VIF_REFRELE_LOCKED(vifp);
991 			ipif_refrele(ipif);
992 			if (vifcp->vifc_flags & VIFF_REGISTER) {
993 				mutex_enter(&ipst->ips_numvifs_mutex);
994 				ipst->ips_reg_vif_num = ALL_VIFS;
995 				mutex_exit(&ipst->ips_numvifs_mutex);
996 			}
997 			ipsq_exit(ipsq);
998 			return (EOPNOTSUPP);
999 		}
1000 		/* Enable promiscuous reception of all IP mcasts from the if */
1001 		mutex_exit(&vifp->v_lock);
1002 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
1003 		    MODE_IS_EXCLUDE, NULL);
1004 		mutex_enter(&vifp->v_lock);
1005 		/*
1006 		 * since we released the lock lets make sure that
1007 		 * ip_mrouter_done() has not been called.
1008 		 */
1009 		if (error != 0 || is_mrouter_off(ipst)) {
1010 			if (error == 0)
1011 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
1012 				    B_TRUE);
1013 			if (vifcp->vifc_flags & VIFF_REGISTER) {
1014 				mutex_enter(&ipst->ips_numvifs_mutex);
1015 				ipst->ips_reg_vif_num = ALL_VIFS;
1016 				mutex_exit(&ipst->ips_numvifs_mutex);
1017 			}
1018 			VIF_REFRELE_LOCKED(vifp);
1019 			ipif_refrele(ipif);
1020 			ipsq_exit(ipsq);
1021 			return (error?error:EINVAL);
1022 		}
1023 	}
1024 	/* Define parameters for the tbf structure */
1025 	vifp->v_tbf = v_tbf;
1026 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
1027 	vifp->v_tbf->tbf_n_tok = 0;
1028 	vifp->v_tbf->tbf_q_len = 0;
1029 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1030 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1031 
1032 	vifp->v_flags = vifcp->vifc_flags;
1033 	vifp->v_threshold = vifcp->vifc_threshold;
1034 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1035 	vifp->v_ipif = ipif;
1036 	ipif_refrele(ipif);
1037 	/* Scaling up here, allows division by 1024 in critical code.	*/
1038 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1039 	vifp->v_timeout_id = 0;
1040 	/* initialize per vif pkt counters */
1041 	vifp->v_pkt_in = 0;
1042 	vifp->v_pkt_out = 0;
1043 	vifp->v_bytes_in = 0;
1044 	vifp->v_bytes_out = 0;
1045 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1046 
1047 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1048 	mutex_enter(&ipst->ips_numvifs_mutex);
1049 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1050 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1051 	mutex_exit(&ipst->ips_numvifs_mutex);
1052 
1053 	if (ipst->ips_ip_mrtdebug > 1) {
1054 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1055 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1056 		    vifcp->vifc_vifi,
1057 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1058 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1059 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1060 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1061 	}
1062 
1063 	vifp->v_marks = VIF_MARK_GOOD;
1064 	mutex_exit(&vifp->v_lock);
1065 	ipsq_exit(ipsq);
1066 	return (0);
1067 }
1068 
1069 
1070 /* Delete a vif from the vif table. */
1071 static void
1072 del_vifp(struct vif *vifp)
1073 {
1074 	struct tbf	*t = vifp->v_tbf;
1075 	mblk_t  *mp0;
1076 	vifi_t  vifi;
1077 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1078 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1079 
1080 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1081 	ASSERT(t != NULL);
1082 
1083 	/*
1084 	 * release the ref we put in vif_del.
1085 	 */
1086 	ASSERT(vifp->v_ipif != NULL);
1087 	ipif_refrele(vifp->v_ipif);
1088 
1089 	if (ipst->ips_ip_mrtdebug > 1) {
1090 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1091 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1092 	}
1093 
1094 	if (vifp->v_timeout_id != 0) {
1095 		(void) untimeout(vifp->v_timeout_id);
1096 		vifp->v_timeout_id = 0;
1097 	}
1098 
1099 	/*
1100 	 * Free packets queued at the interface.
1101 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1102 	 */
1103 	mutex_enter(&t->tbf_lock);
1104 	while (t->tbf_q != NULL) {
1105 		mp0 = t->tbf_q;
1106 		t->tbf_q = t->tbf_q->b_next;
1107 		mp0->b_prev = mp0->b_next = NULL;
1108 		freemsg(mp0);
1109 	}
1110 	mutex_exit(&t->tbf_lock);
1111 
1112 	/*
1113 	 * Always clear cache when vifs change.
1114 	 * No need to get last_encap_lock since we are running as a writer.
1115 	 */
1116 	mutex_enter(&ipst->ips_last_encap_lock);
1117 	if (vifp == ipst->ips_last_encap_vif) {
1118 		ipst->ips_last_encap_vif = NULL;
1119 		ipst->ips_last_encap_src = 0;
1120 	}
1121 	mutex_exit(&ipst->ips_last_encap_lock);
1122 
1123 	mutex_destroy(&t->tbf_lock);
1124 
1125 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1126 
1127 	/* Adjust numvifs down */
1128 	mutex_enter(&ipst->ips_numvifs_mutex);
1129 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1130 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1131 			break;
1132 	ipst->ips_numvifs = vifi;
1133 	mutex_exit(&ipst->ips_numvifs_mutex);
1134 
1135 	bzero(vifp, sizeof (*vifp));
1136 }
1137 
1138 static int
1139 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
1140 {
1141 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1142 	ipsq_t  	*ipsq;
1143 
1144 	if (*vifip >= ipst->ips_numvifs)
1145 		return (EINVAL);
1146 
1147 
1148 	mutex_enter(&vifp->v_lock);
1149 	/*
1150 	 * Not initialized
1151 	 * Here we are not looking at the vif that is being initialized
1152 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1153 	 */
1154 	if (vifp->v_lcl_addr.s_addr == 0 ||
1155 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1156 		mutex_exit(&vifp->v_lock);
1157 		return (EADDRNOTAVAIL);
1158 	}
1159 
1160 	/*
1161 	 * This is an optimization, if first_mp == NULL
1162 	 * than we are being called from reset_mrt_vif_ipif()
1163 	 * so we already have exclusive access to the ipsq.
1164 	 * the ASSERT below is a check for this condition.
1165 	 */
1166 	if (first_mp != NULL &&
1167 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1168 		ASSERT(connp != NULL);
1169 		/*
1170 		 * We have to be exclusive as we have to call ip_delmulti()
1171 		 * This is the best position to try to be exclusive in case
1172 		 * we have to wait.
1173 		 */
1174 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1175 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1176 		if ((ipsq) == NULL) {
1177 			mutex_exit(&vifp->v_lock);
1178 			return (EINPROGRESS);
1179 		}
1180 		/* recheck after being exclusive */
1181 		if (vifp->v_lcl_addr.s_addr == 0 ||
1182 		    !vifp->v_marks & VIF_MARK_GOOD) {
1183 			/*
1184 			 * someone beat us.
1185 			 */
1186 			mutex_exit(&vifp->v_lock);
1187 			ipsq_exit(ipsq);
1188 			return (EADDRNOTAVAIL);
1189 		}
1190 	}
1191 
1192 
1193 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1194 
1195 
1196 	/*
1197 	 * add a refhold so that ipif does not go away while
1198 	 * there are still users, this will be released in del_vifp
1199 	 * when we free the vif.
1200 	 */
1201 	ipif_refhold(vifp->v_ipif);
1202 
1203 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1204 	vifp->v_marks &= ~VIF_MARK_GOOD;
1205 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1206 
1207 	/* Phyint only */
1208 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1209 		ipif_t *ipif = vifp->v_ipif;
1210 		ASSERT(ipif != NULL);
1211 		/*
1212 		 * should be OK to drop the lock as we
1213 		 * have marked this as CONDEMNED.
1214 		 */
1215 		mutex_exit(&(vifp)->v_lock);
1216 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1217 		if (first_mp != NULL)
1218 			ipsq_exit(ipsq);
1219 		mutex_enter(&(vifp)->v_lock);
1220 	}
1221 
1222 	/*
1223 	 * decreases the refcnt added in add_vif.
1224 	 */
1225 	VIF_REFRELE_LOCKED(vifp);
1226 	return (0);
1227 }
1228 
/*
 * Add an mfc entry.
 *
 * Installs or updates the multicast forwarding cache entry described by
 * mfccp (origin, group, parent vif and per-vif ttls) in the hash bucket
 * selected by MFCHASH(origin, group).  Three cases are handled:
 *
 *   1. MFCFIND locates an entry: only its parent and ttls are rewritten.
 *   2. One or more entries with queued upcall packets (mfc_rte != NULL)
 *      match: each is filled in, its expiry timeout is cancelled and the
 *      queued packets are pushed through ip_mdq().
 *   3. No upcall entry matched: a matching non-condemned entry is filled
 *      in, or a new one is allocated and linked at the head of the bucket.
 *
 * Returns 0 on success, EINVAL if the parent vif is out of range or has
 * no ipif, or if the mrouter is being shut down, and ENOBUFS if a new
 * entry could not be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;		/* number of matching upcall entries seen */
	int i;
	struct mfcb *mfcbp;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/* Error checking, out of bounds? */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must have an ipif behind it. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off(ipst)) {
		return (EINVAL);
	}

	/* Hold the bucket for the duration; every exit path releases it. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* numvifs is read under its mutex while copying the ttls */
		mutex_enter(&ipst->ips_numvifs_mutex);
		for (i = 0; i < (int)ipst->ips_numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&ipst->ips_numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * The whole chain is walked (no break), so more than one matching
	 * entry is possible; that case is reported with a warning.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp, ipst);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * mfc_mutex is dropped around each ip_mdq() call and
			 * re-taken before the queue head is examined again.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check for shutdown now that the bucket lock is held. */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a matching, non-condemned entry if there is one. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
			    mfccp->mfcc_mcastgrp.s_addr) &&
			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp, ipst);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/* Insert new entry at head of hash chain */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp, ipst);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1427 
1428 /*
1429  * Fills in mfc structure from mrouted mfcctl.
1430  */
1431 static void
1432 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1433 {
1434 	int i;
1435 
1436 	rt->mfc_origin		= mfccp->mfcc_origin;
1437 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1438 	rt->mfc_parent		= mfccp->mfcc_parent;
1439 	mutex_enter(&ipst->ips_numvifs_mutex);
1440 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1441 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1442 	}
1443 	mutex_exit(&ipst->ips_numvifs_mutex);
1444 	/* Initialize pkt counters per src-grp */
1445 	rt->mfc_pkt_cnt	= 0;
1446 	rt->mfc_byte_cnt	= 0;
1447 	rt->mfc_wrong_if	= 0;
1448 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1449 
1450 }
1451 
1452 static void
1453 free_queue(struct mfc *mfcp)
1454 {
1455 	struct rtdetq *rte0;
1456 
1457 	/*
1458 	 * Drop all queued upcall packets.
1459 	 * Free the mbuf with the pkt.
1460 	 */
1461 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1462 		mfcp->mfc_rte = rte0->rte_next;
1463 		freemsg(rte0->mp);
1464 		mi_free((char *)rte0);
1465 	}
1466 }
1467 /*
1468  * go thorugh the hash bucket and free all the entries marked condemned.
1469  */
1470 void
1471 release_mfc(struct mfcb *mfcbp)
1472 {
1473 	struct mfc *current_mfcp;
1474 	struct mfc *prev_mfcp;
1475 
1476 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1477 
1478 	while (current_mfcp != NULL) {
1479 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1480 			if (current_mfcp == mfcbp->mfcb_mfc) {
1481 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1482 				free_queue(current_mfcp);
1483 				mi_free(current_mfcp);
1484 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1485 				continue;
1486 			}
1487 			ASSERT(prev_mfcp != NULL);
1488 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1489 			free_queue(current_mfcp);
1490 			mi_free(current_mfcp);
1491 			current_mfcp = NULL;
1492 		} else {
1493 			prev_mfcp = current_mfcp;
1494 		}
1495 
1496 		current_mfcp = prev_mfcp->mfc_next;
1497 
1498 	}
1499 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1500 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1501 }
1502 
1503 /*
1504  * Delete an mfc entry.
1505  */
1506 static int
1507 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1508 {
1509 	struct in_addr	origin;
1510 	struct in_addr	mcastgrp;
1511 	struct mfc 	*rt;
1512 	uint_t		hash;
1513 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1514 
1515 	origin = mfccp->mfcc_origin;
1516 	mcastgrp = mfccp->mfcc_mcastgrp;
1517 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1518 
1519 	if (ipst->ips_ip_mrtdebug > 1) {
1520 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1521 		    "del_mfc: o %x g %x",
1522 		    ntohl(origin.s_addr),
1523 		    ntohl(mcastgrp.s_addr));
1524 	}
1525 
1526 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1527 
1528 	/* Find mfc in mfctable, finds only entries without upcalls */
1529 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1530 		mutex_enter(&rt->mfc_mutex);
1531 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1532 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1533 		    rt->mfc_rte == NULL &&
1534 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1535 			break;
1536 		mutex_exit(&rt->mfc_mutex);
1537 	}
1538 
1539 	/*
1540 	 * Return if there was an upcall (mfc_rte != NULL,
1541 	 * or rt not in mfctable.
1542 	 */
1543 	if (rt == NULL) {
1544 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1545 		return (EADDRNOTAVAIL);
1546 	}
1547 
1548 
1549 	/*
1550 	 * no need to hold lock as we have a reference.
1551 	 */
1552 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1553 	/* error checking */
1554 	if (rt->mfc_timeout_id != 0) {
1555 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1556 		/*
1557 		 * Its ok to drop the lock,  the struct cannot be freed
1558 		 * since we have a ref on the hash bucket.
1559 		 */
1560 		rt->mfc_timeout_id = 0;
1561 		mutex_exit(&rt->mfc_mutex);
1562 		(void) untimeout(rt->mfc_timeout_id);
1563 		mutex_enter(&rt->mfc_mutex);
1564 	}
1565 
1566 	ASSERT(rt->mfc_rte == NULL);
1567 
1568 
1569 	/*
1570 	 * Delete the entry from the cache
1571 	 */
1572 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1573 	mutex_exit(&rt->mfc_mutex);
1574 
1575 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1576 
1577 	return (0);
1578 }
1579 
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * mp->b_prev is used as an in-band argument: it holds either
 * PIM_REGISTER_MARKER (packet came from a PIM register message) or the
 * tunnel source address (non-zero for an encapsulating tunnel).  It is
 * cleared before this function forwards or queues the packet.
 *
 * If a cache entry exists the packet is handed to ip_mdq().  Otherwise a
 * copy is queued on a (possibly new) mfc entry and, for the first queued
 * packet only, a second copy is sent to the routing daemon as an
 * IGMPMSG_NOCACHE upcall and an expiry timeout is armed.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed
 *                   1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;	/* count of source-routed pkts seen */
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/* Decode the in-band information carried in b_prev. */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/*
		 * Header too short to carry the tunnel option, or the option
		 * is not LSRR.
		 */
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	/* The bucket hold is released on every path below. */
	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are treated as arriving on the register vif. */
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;	/* copy queued on the mfc */
		mblk_t		*mp_copy = NULL;	/* copy sent as the upcall */
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			/* From here on mfc_mutex is held in both cases. */
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: append at the tail */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The upcall header is overlaid on the copied packet. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * Report the first vif whose ipif sits on the
				 * arrival ill.
				 * XXX do we need to hold locks here ?
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* mp_copy is only allocated for a new mfc; NULL here */
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/*
		 * Common failure exit.  Callers have already dropped
		 * mfc_mutex; release the bucket and whatever was allocated.
		 * An existing mfc_rt (new_mfc == B_FALSE) is left in place.
		 */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1969 
1970 /*
1971  * Clean up the mfctable cache entry if upcall is not serviced.
1972  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1973  */
1974 static void
1975 expire_upcalls(void *arg)
1976 {
1977 	struct mfc *mfc_rt = arg;
1978 	uint_t hash;
1979 	struct mfc *prev_mfc, *mfc0;
1980 	ip_stack_t	*ipst;
1981 	conn_t		*mrouter;
1982 
1983 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1984 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1985 		return;
1986 	}
1987 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1988 	mrouter = ipst->ips_ip_g_mrouter;
1989 
1990 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1991 	if (ipst->ips_ip_mrtdebug > 1) {
1992 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1993 		    "expire_upcalls: hash %d s %x g %x",
1994 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1995 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1996 	}
1997 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1998 	mutex_enter(&mfc_rt->mfc_mutex);
1999 	/*
2000 	 * if timeout has been set to zero, than the
2001 	 * entry has been filled, no need to delete it.
2002 	 */
2003 	if (mfc_rt->mfc_timeout_id == 0)
2004 		goto done;
2005 	ipst->ips_mrtstat->mrts_cache_cleanups++;
2006 	mfc_rt->mfc_timeout_id = 0;
2007 
2008 	/* Determine entry to be cleaned up in cache table. */
2009 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
2010 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
2011 		if (mfc0 == mfc_rt)
2012 			break;
2013 
2014 	/* del_mfc takes care of gone mfcs */
2015 	ASSERT(prev_mfc != NULL);
2016 	ASSERT(mfc0 != NULL);
2017 
2018 	/*
2019 	 * Delete the entry from the cache
2020 	 */
2021 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
2022 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
2023 
2024 	/*
2025 	 * release_mfc will drop all queued upcall packets.
2026 	 * and will free the mbuf with the pkt, if, timing info.
2027 	 */
2028 done:
2029 	mutex_exit(&mfc_rt->mfc_mutex);
2030 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
2031 }
2032 
2033 /*
2034  * Packet forwarding routine once entry in the cache is made.
2035  */
2036 static int
2037 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
2038     struct mfc *rt)
2039 {
2040 	vifi_t vifi;
2041 	struct vif *vifp;
2042 	ipaddr_t dst = ipha->ipha_dst;
2043 	size_t  plen = msgdsize(mp);
2044 	vifi_t num_of_vifs;
2045 	ip_stack_t	*ipst = ill->ill_ipst;
2046 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2047 
2048 	if (ipst->ips_ip_mrtdebug > 1) {
2049 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2050 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
2051 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
2052 		    ill->ill_name);
2053 	}
2054 
2055 	/* Macro to send packet on vif */
2056 #define	MC_SEND(ipha, mp, vifp, dst) { \
2057 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2058 		encap_send((ipha), (mp), (vifp), (dst)); \
2059 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2060 		register_send((ipha), (mp), (vifp), (dst)); \
2061 	else \
2062 		phyint_send((ipha), (mp), (vifp), (dst)); \
2063 }
2064 
2065 	vifi = rt->mfc_parent;
2066 
2067 	/*
2068 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2069 	 * Mrouted had no route.
2070 	 * We wanted the route installed in the mfctable to prevent multiple
2071 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2072 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2073 	 * 3.6.
2074 	 */
2075 	if (vifi == NO_VIF) {
2076 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2077 		    ill->ill_name));
2078 		if (ipst->ips_ip_mrtdebug > 1) {
2079 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2080 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2081 		}
2082 		return (-1);	/* drop pkt */
2083 	}
2084 
2085 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2086 		return (-1);
2087 	/*
2088 	 * The MFC entries are not cleaned up when an ipif goes
2089 	 * away thus this code has to guard against an MFC referencing
2090 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2091 	 * sets the v_ipif to NULL when the ipif disappears.
2092 	 */
2093 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2094 
2095 	if (vifi >= ipst->ips_numvifs) {
2096 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2097 		    "%d ill %s viftable ill %s\n",
2098 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2099 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2100 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2101 		return (-1);
2102 	}
2103 	/*
2104 	 * Don't forward if it didn't arrive from the parent vif for its
2105 	 * origin. But do match on the groups as we nominate only one
2106 	 * ill in the group for receiving allmulti packets.
2107 	 */
2108 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
2109 	    (ill->ill_group == NULL ||
2110 	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
2111 		ill->ill_group)) ||
2112 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2113 		/* Came in the wrong interface */
2114 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2115 			"numvifs %d ill %s viftable ill %s\n",
2116 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2117 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2118 		if (ipst->ips_ip_mrtdebug > 1) {
2119 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2120 			    "ip_mdq: arrived wrong if, vifi %d ill "
2121 			    "%s viftable ill %s\n",
2122 			    (int)vifi, ill->ill_name,
2123 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2124 		}
2125 		ipst->ips_mrtstat->mrts_wrong_if++;
2126 		rt->mfc_wrong_if++;
2127 
2128 		/*
2129 		 * If we are doing PIM assert processing and we are forwarding
2130 		 * packets on this interface, and it is a broadcast medium
2131 		 * interface (and not a tunnel), send a message to the routing.
2132 		 *
2133 		 * We use the first ipif on the list, since it's all we have.
2134 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2135 		 */
2136 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2137 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2138 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2139 			mblk_t		*mp_copy;
2140 			struct igmpmsg	*im;
2141 
2142 			/* TODO could copy header and dup rest */
2143 			mp_copy = copymsg(mp);
2144 			if (mp_copy == NULL) {
2145 				ipst->ips_mrtstat->mrts_fwd_drop++;
2146 				ip1dbg(("ip_mdq: out of memory "
2147 				    "for mblk, mp_copy\n"));
2148 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2149 				return (-1);
2150 			}
2151 
2152 			im = (struct igmpmsg *)mp_copy->b_rptr;
2153 			im->im_msgtype = IGMPMSG_WRONGVIF;
2154 			im->im_mbz = 0;
2155 			im->im_vif = (ushort_t)vifi;
2156 			/* Pass to RAWIP */
2157 			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
2158 		}
2159 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2160 		if (tunnel_src != 0)
2161 			return (1);
2162 		else
2163 			return (0);
2164 	}
2165 	/*
2166 	 * If I sourced this packet, it counts as output, else it was input.
2167 	 */
2168 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2169 		ipst->ips_vifs[vifi].v_pkt_out++;
2170 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2171 	} else {
2172 		ipst->ips_vifs[vifi].v_pkt_in++;
2173 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2174 	}
2175 	mutex_enter(&rt->mfc_mutex);
2176 	rt->mfc_pkt_cnt++;
2177 	rt->mfc_byte_cnt += plen;
2178 	mutex_exit(&rt->mfc_mutex);
2179 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2180 	/*
2181 	 * For each vif, decide if a copy of the packet should be forwarded.
2182 	 * Forward if:
2183 	 *		- the vif threshold ttl is non-zero AND
2184 	 *		- the pkt ttl exceeds the vif's threshold
2185 	 * A non-zero mfc_ttl indicates that the vif is part of
2186 	 * the output set for the mfc entry.
2187 	 */
2188 	mutex_enter(&ipst->ips_numvifs_mutex);
2189 	num_of_vifs = ipst->ips_numvifs;
2190 	mutex_exit(&ipst->ips_numvifs_mutex);
2191 	for (vifp = ipst->ips_vifs, vifi = 0;
2192 	    vifi < num_of_vifs;
2193 	    vifp++, vifi++) {
2194 		if (!lock_good_vif(vifp))
2195 			continue;
2196 		if ((rt->mfc_ttls[vifi] > 0) &&
2197 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2198 			/*
2199 			 * lock_good_vif should not have succedded if
2200 			 * v_ipif is null.
2201 			 */
2202 			ASSERT(vifp->v_ipif != NULL);
2203 			vifp->v_pkt_out++;
2204 			vifp->v_bytes_out += plen;
2205 			MC_SEND(ipha, mp, vifp, dst);
2206 			ipst->ips_mrtstat->mrts_fwd_out++;
2207 		}
2208 		unlock_good_vif(vifp);
2209 	}
2210 	if (tunnel_src != 0)
2211 		return (1);
2212 	else
2213 		return (0);
2214 }
2215 
2216 /*
2217  * Send the packet on physical interface.
2218  * Caller assumes can continue to use mp on return.
2219  */
2220 /* ARGSUSED */
2221 static void
2222 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2223 {
2224 	mblk_t 	*mp_copy;
2225 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2226 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2227 
2228 	/* Make a new reference to the packet */
2229 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2230 	if (mp_copy == NULL) {
2231 		ipst->ips_mrtstat->mrts_fwd_drop++;
2232 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2233 		return;
2234 	}
2235 	if (vifp->v_rate_limit <= 0)
2236 		tbf_send_packet(vifp, mp_copy);
2237 	else  {
2238 		if (ipst->ips_ip_mrtdebug > 1) {
2239 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2240 			    "phyint_send: tbf_contr rate %d "
2241 			    "vifp 0x%p mp 0x%p dst 0x%x",
2242 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2243 		}
2244 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2245 	}
2246 }
2247 
2248 /*
2249  * Send the whole packet for REGISTER encapsulation to PIM daemon
2250  * Caller assumes it can continue to use mp on return.
2251  */
2252 /* ARGSUSED */
2253 static void
2254 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2255 {
2256 	struct igmpmsg	*im;
2257 	mblk_t		*mp_copy;
2258 	ipha_t		*ipha_copy;
2259 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2260 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2261 
2262 	if (ipst->ips_ip_mrtdebug > 1) {
2263 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2264 		    "register_send: src %x, dst %x\n",
2265 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2266 	}
2267 
2268 	/*
2269 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2270 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2271 	 * ethernet driver will.
2272 	 */
2273 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2274 	if (mp_copy == NULL) {
2275 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2276 		if (ipst->ips_ip_mrtdebug > 3) {
2277 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2278 			    "register_send: allocb failure.");
2279 		}
2280 		return;
2281 	}
2282 
2283 	/*
2284 	 * Bump write pointer to account for igmpmsg being added.
2285 	 */
2286 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2287 
2288 	/*
2289 	 * Chain packet to new mblk_t.
2290 	 */
2291 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2292 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2293 		if (ipst->ips_ip_mrtdebug > 3) {
2294 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2295 			    "register_send: copymsg failure.");
2296 		}
2297 		freeb(mp_copy);
2298 		return;
2299 	}
2300 
2301 	/*
2302 	 * icmp_input() asserts that IP version field is set to an
2303 	 * appropriate version. Hence, the struct igmpmsg that this really
2304 	 * becomes, needs to have the correct IP version field.
2305 	 */
2306 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2307 	*ipha_copy = multicast_encap_iphdr;
2308 
2309 	/*
2310 	 * The kernel uses the struct igmpmsg header to encode the messages to
2311 	 * the multicast routing daemon. Fill in the fields in the header
2312 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2313 	 */
2314 	im = (struct igmpmsg *)mp_copy->b_rptr;
2315 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2316 	im->im_src.s_addr = ipha->ipha_src;
2317 	im->im_dst.s_addr = ipha->ipha_dst;
2318 
2319 	/*
2320 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2321 	 * header with renamed fields and the multicast routing daemon uses
2322 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2323 	 */
2324 	im->im_mbz = 0;
2325 
2326 	++ipst->ips_mrtstat->mrts_upcalls;
2327 	if (!canputnext(mrouter->conn_rq)) {
2328 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2329 		if (ipst->ips_ip_mrtdebug > 3) {
2330 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2331 			    "register_send: register upcall failure.");
2332 		}
2333 		freemsg(mp_copy);
2334 	} else {
2335 		/* Pass to RAWIP */
2336 		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
2337 	}
2338 }
2339 
2340 /*
2341  * pim_validate_cksum handles verification of the checksum in the
2342  * pim header.  For PIM Register packets, the checksum is calculated
2343  * across the PIM header only.  For all other packets, the checksum
2344  * is for the PIM header and remainder of the packet.
2345  *
2346  * returns: B_TRUE, if checksum is okay.
2347  *          B_FALSE, if checksum is not valid.
2348  */
2349 static boolean_t
2350 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2351 {
2352 	mblk_t *mp_dup;
2353 
2354 	if ((mp_dup = dupmsg(mp)) == NULL)
2355 		return (B_FALSE);
2356 
2357 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2358 	if (pimp->pim_type == PIM_REGISTER)
2359 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2360 	if (IP_CSUM(mp_dup, 0, 0)) {
2361 		freemsg(mp_dup);
2362 		return (B_FALSE);
2363 	}
2364 	freemsg(mp_dup);
2365 	return (B_TRUE);
2366 }
2367 
2368 /*
2369  * int
2370  * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
2371  *	IP Protocol 103. Register messages are decapsulated and sent
2372  *	onto multicast forwarding.
2373  */
2374 int
2375 pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
2376 {
2377 	ipha_t		*eip, *ip;
2378 	int		iplen, pimlen, iphlen;
2379 	struct pim	*pimp;	/* pointer to a pim struct */
2380 	uint32_t	*reghdr;
2381 	ip_stack_t	*ipst = ill->ill_ipst;
2382 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2383 
2384 	/*
2385 	 * Pullup the msg for PIM protocol processing.
2386 	 */
2387 	if (pullupmsg(mp, -1) == 0) {
2388 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2389 		freemsg(mp);
2390 		return (-1);
2391 	}
2392 
2393 	ip = (ipha_t *)mp->b_rptr;
2394 	iplen = ip->ipha_length;
2395 	iphlen = IPH_HDR_LENGTH(ip);
2396 	pimlen = ntohs(iplen) - iphlen;
2397 
2398 	/*
2399 	 * Validate lengths
2400 	 */
2401 	if (pimlen < PIM_MINLEN) {
2402 		++ipst->ips_mrtstat->mrts_pim_malformed;
2403 		if (ipst->ips_ip_mrtdebug > 1) {
2404 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2405 			    "pim_input: length not at least minlen");
2406 		}
2407 		freemsg(mp);
2408 		return (-1);
2409 	}
2410 
2411 	/*
2412 	 * Point to the PIM header.
2413 	 */
2414 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2415 
2416 	/*
2417 	 * Check the version number.
2418 	 */
2419 	if (pimp->pim_vers != PIM_VERSION) {
2420 		++ipst->ips_mrtstat->mrts_pim_badversion;
2421 		if (ipst->ips_ip_mrtdebug > 1) {
2422 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2423 			    "pim_input: unknown version of PIM");
2424 		}
2425 		freemsg(mp);
2426 		return (-1);
2427 	}
2428 
2429 	/*
2430 	 * Validate the checksum
2431 	 */
2432 	if (!pim_validate_cksum(mp, ip, pimp)) {
2433 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2434 		if (ipst->ips_ip_mrtdebug > 1) {
2435 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2436 			    "pim_input: invalid checksum");
2437 		}
2438 		freemsg(mp);
2439 		return (-1);
2440 	}
2441 
2442 	if (pimp->pim_type != PIM_REGISTER)
2443 		return (0);
2444 
2445 	reghdr = (uint32_t *)(pimp + 1);
2446 	eip = (ipha_t *)(reghdr + 1);
2447 
2448 	/*
2449 	 * check if the inner packet is destined to mcast group
2450 	 */
2451 	if (!CLASSD(eip->ipha_dst)) {
2452 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2453 		if (ipst->ips_ip_mrtdebug > 1) {
2454 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2455 			    "pim_input: Inner pkt not mcast .. !");
2456 		}
2457 		freemsg(mp);
2458 		return (-1);
2459 	}
2460 	if (ipst->ips_ip_mrtdebug > 1) {
2461 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2462 		    "register from %x, to %x, len %d",
2463 		    ntohl(eip->ipha_src),
2464 		    ntohl(eip->ipha_dst),
2465 		    ntohs(eip->ipha_length));
2466 	}
2467 	/*
2468 	 * If the null register bit is not set, decapsulate
2469 	 * the packet before forwarding it.
2470 	 */
2471 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
2472 		mblk_t *mp_copy;
2473 
2474 		/* Copy the message */
2475 		if ((mp_copy = copymsg(mp)) == NULL) {
2476 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2477 			freemsg(mp);
2478 			return (-1);
2479 		}
2480 
2481 		/*
2482 		 * Decapsulate the packet and give it to
2483 		 * register_mforward.
2484 		 */
2485 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
2486 		    sizeof (*reghdr);
2487 		if (register_mforward(q, mp_copy, ill) != 0) {
2488 			freemsg(mp);
2489 			return (-1);
2490 		}
2491 	}
2492 
2493 	/*
2494 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2495 	 * PIM socket. For Solaris it is done right after pim_input() is
2496 	 * called.
2497 	 */
2498 	return (0);
2499 }
2500 
2501 /*
2502  * PIM sparse mode hook.  Called by pim_input after decapsulating
2503  * the packet. Loop back the packet, as if we have received it.
2504  * In pim_input() we have to check if the destination is a multicast address.
2505  */
2506 /* ARGSUSED */
2507 static int
2508 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
2509 {
2510 	ip_stack_t	*ipst = ill->ill_ipst;
2511 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2512 
2513 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2514 
2515 	if (ipst->ips_ip_mrtdebug > 3) {
2516 		ipha_t *ipha;
2517 
2518 		ipha = (ipha_t *)mp->b_rptr;
2519 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2520 		    "register_mforward: src %x, dst %x\n",
2521 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2522 	}
2523 	/*
2524 	 * Need to pass in to ip_mforward() the information that the
2525 	 * packet has arrived on the register_vif. We use the solution that
2526 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2527 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2528 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2529 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2530 	 * or passes it to ip_mforward(). We use
2531 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2532 	 * register vif. If in the future we have more than one register vifs,
2533 	 * then this will need re-examination.
2534 	 */
2535 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2536 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2537 	ip_rput(q, mp);
2538 	return (0);
2539 }
2540 
2541 /*
2542  * Send an encapsulated packet.
2543  * Caller assumes can continue to use mp when routine returns.
2544  */
2545 /* ARGSUSED */
2546 static void
2547 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2548 {
2549 	mblk_t 	*mp_copy;
2550 	ipha_t 	*ipha_copy;
2551 	size_t	len;
2552 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2553 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2554 
2555 	if (ipst->ips_ip_mrtdebug > 1) {
2556 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2557 		    "encap_send: vif %ld enter",
2558 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2559 	}
2560 	len = ntohs(ipha->ipha_length);
2561 
2562 	/*
2563 	 * Copy the old packet & pullup it's IP header into the
2564 	 * new mbuf so we can modify it.  Try to fill the new
2565 	 * mbuf since if we don't the ethernet driver will.
2566 	 */
2567 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2568 	if (mp_copy == NULL)
2569 		return;
2570 	mp_copy->b_rptr += 32;
2571 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2572 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2573 		freeb(mp_copy);
2574 		return;
2575 	}
2576 
2577 	/*
2578 	 * Fill in the encapsulating IP header.
2579 	 * Remote tunnel dst in rmt_addr, from add_vif().
2580 	 */
2581 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2582 	*ipha_copy = multicast_encap_iphdr;
2583 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2584 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2585 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2586 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2587 	ASSERT(ipha_copy->ipha_ident == 0);
2588 
2589 	/* Turn the encapsulated IP header back into a valid one. */
2590 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2591 	ipha->ipha_ttl--;
2592 	ipha->ipha_hdr_checksum = 0;
2593 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2594 
2595 	if (ipst->ips_ip_mrtdebug > 1) {
2596 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2597 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2598 	}
2599 	if (vifp->v_rate_limit <= 0)
2600 		tbf_send_packet(vifp, mp_copy);
2601 	else
2602 		/* ipha is from the original header */
2603 		tbf_control(vifp, mp_copy, ipha);
2604 }
2605 
2606 /*
2607  * De-encapsulate a packet and feed it back through IP input.
2608  * This routine is called whenever IP gets a packet with prototype
2609  * IPPROTO_ENCAP and a local destination address.
2610  */
2611 void
2612 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
2613 {
2614 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2615 	ipha_t		*ipha_encap;
2616 	int		hlen = IPH_HDR_LENGTH(ipha);
2617 	ipaddr_t	src;
2618 	struct vif	*vifp;
2619 	ip_stack_t	*ipst = ill->ill_ipst;
2620 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2621 
2622 	/*
2623 	 * Dump the packet if it's not to a multicast destination or if
2624 	 * we don't have an encapsulating tunnel with the source.
2625 	 * Note:  This code assumes that the remote site IP address
2626 	 * uniquely identifies the tunnel (i.e., that this site has
2627 	 * at most one tunnel with the remote site).
2628 	 */
2629 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2630 	if (!CLASSD(ipha_encap->ipha_dst)) {
2631 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2632 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2633 		freemsg(mp);
2634 		return;
2635 	}
2636 	src = (ipaddr_t)ipha->ipha_src;
2637 	mutex_enter(&ipst->ips_last_encap_lock);
2638 	if (src != ipst->ips_last_encap_src) {
2639 		struct vif *vife;
2640 
2641 		vifp = ipst->ips_vifs;
2642 		vife = vifp + ipst->ips_numvifs;
2643 		ipst->ips_last_encap_src = src;
2644 		ipst->ips_last_encap_vif = 0;
2645 		for (; vifp < vife; ++vifp) {
2646 			if (!lock_good_vif(vifp))
2647 				continue;
2648 			if (vifp->v_rmt_addr.s_addr == src) {
2649 				if (vifp->v_flags & VIFF_TUNNEL)
2650 					ipst->ips_last_encap_vif = vifp;
2651 				if (ipst->ips_ip_mrtdebug > 1) {
2652 					(void) mi_strlog(mrouter->conn_rq,
2653 					    1, SL_TRACE,
2654 					    "ip_mroute_decap: good tun "
2655 					    "vif %ld with %x",
2656 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2657 					    ntohl(src));
2658 				}
2659 				unlock_good_vif(vifp);
2660 				break;
2661 			}
2662 			unlock_good_vif(vifp);
2663 		}
2664 	}
2665 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2666 		mutex_exit(&ipst->ips_last_encap_lock);
2667 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2668 		freemsg(mp);
2669 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2670 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2671 		return;
2672 	}
2673 	mutex_exit(&ipst->ips_last_encap_lock);
2674 
2675 	/*
2676 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2677 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2678 	 * to pass this information. This is safe since the ip_rput either
2679 	 * frees the packet or passes it to ip_mforward.
2680 	 */
2681 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2682 	mp->b_rptr += hlen;
2683 	/* Feed back into ip_rput as an M_DATA. */
2684 	ip_rput(q, mp);
2685 }
2686 
2687 /*
2688  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2689  * (stream closed).  Called as writer.
2690  */
2691 void
2692 reset_mrt_vif_ipif(ipif_t *ipif)
2693 {
2694 	vifi_t vifi, tmp_vifi;
2695 	vifi_t num_of_vifs;
2696 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2697 
2698 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2699 
2700 	mutex_enter(&ipst->ips_numvifs_mutex);
2701 	num_of_vifs = ipst->ips_numvifs;
2702 	mutex_exit(&ipst->ips_numvifs_mutex);
2703 
2704 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2705 		tmp_vifi = vifi - 1;
2706 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2707 			(void) del_vif(&tmp_vifi, NULL, NULL, ipst);
2708 		}
2709 	}
2710 }
2711 
2712 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2713 void
2714 reset_mrt_ill(ill_t *ill)
2715 {
2716 	struct mfc		*rt;
2717 	struct rtdetq	*rte;
2718 	int			i;
2719 	ip_stack_t	*ipst = ill->ill_ipst;
2720 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2721 
2722 	for (i = 0; i < MFCTBLSIZ; i++) {
2723 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2724 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2725 			if (ipst->ips_ip_mrtdebug > 1) {
2726 				(void) mi_strlog(mrouter->conn_rq, 1,
2727 				    SL_TRACE,
2728 				    "reset_mrt_ill: mfctable [%d]", i);
2729 			}
2730 			while (rt != NULL) {
2731 				mutex_enter(&rt->mfc_mutex);
2732 				while ((rte = rt->mfc_rte) != NULL) {
2733 					if (rte->ill == ill) {
2734 						if (ipst->ips_ip_mrtdebug > 1) {
2735 						(void) mi_strlog(
2736 						    mrouter->conn_rq,
2737 						    1, SL_TRACE,
2738 						    "reset_mrt_ill: "
2739 						    "ill 0x%p", (void *)ill);
2740 						}
2741 						rt->mfc_rte = rte->rte_next;
2742 						freemsg(rte->mp);
2743 						mi_free((char *)rte);
2744 					}
2745 				}
2746 				mutex_exit(&rt->mfc_mutex);
2747 				rt = rt->mfc_next;
2748 			}
2749 		}
2750 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2751 	}
2752 }
2753 
2754 /*
2755  * Token bucket filter module.
2756  * The ipha is for mcastgrp destination for phyint and encap.
2757  */
2758 static void
2759 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2760 {
2761 	size_t 	p_len =  msgdsize(mp);
2762 	struct tbf	*t    = vifp->v_tbf;
2763 	timeout_id_t id = 0;
2764 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2765 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2766 
2767 	/* Drop if packet is too large */
2768 	if (p_len > MAX_BKT_SIZE) {
2769 		ipst->ips_mrtstat->mrts_pkt2large++;
2770 		freemsg(mp);
2771 		return;
2772 	}
2773 	if (ipst->ips_ip_mrtdebug > 1) {
2774 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2775 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2776 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2777 		    ntohl(ipha->ipha_dst));
2778 	}
2779 
2780 	mutex_enter(&t->tbf_lock);
2781 
2782 	tbf_update_tokens(vifp);
2783 
2784 	/*
2785 	 * If there are enough tokens,
2786 	 * and the queue is empty, send this packet out.
2787 	 */
2788 	if (ipst->ips_ip_mrtdebug > 1) {
2789 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2790 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2791 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2792 		    t->tbf_q_len);
2793 	}
2794 	/* No packets are queued */
2795 	if (t->tbf_q_len == 0) {
2796 		/* queue empty, send packet if enough tokens */
2797 		if (p_len <= t->tbf_n_tok) {
2798 			t->tbf_n_tok -= p_len;
2799 			mutex_exit(&t->tbf_lock);
2800 			tbf_send_packet(vifp, mp);
2801 			return;
2802 		} else {
2803 			/* Queue packet and timeout till later */
2804 			tbf_queue(vifp, mp);
2805 			ASSERT(vifp->v_timeout_id == 0);
2806 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2807 			    TBF_REPROCESS);
2808 		}
2809 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2810 		/* Finite queue length, so queue pkts and process queue */
2811 		tbf_queue(vifp, mp);
2812 		tbf_process_q(vifp);
2813 	} else {
2814 		/* Check that we have UDP header with IP header */
2815 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2816 		    sizeof (struct udphdr);
2817 
2818 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2819 			if (!pullupmsg(mp, hdr_length)) {
2820 				freemsg(mp);
2821 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2822 				    "vif %ld src 0x%x dst 0x%x\n",
2823 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2824 				    ntohl(ipha->ipha_src),
2825 				    ntohl(ipha->ipha_dst)));
2826 				mutex_exit(&vifp->v_tbf->tbf_lock);
2827 				return;
2828 			} else
2829 				/* Have to reassign ipha after pullupmsg */
2830 				ipha = (ipha_t *)mp->b_rptr;
2831 		}
2832 		/*
2833 		 * Queue length too much,
2834 		 * try to selectively dq, or queue and process
2835 		 */
2836 		if (!tbf_dq_sel(vifp, ipha)) {
2837 			ipst->ips_mrtstat->mrts_q_overflow++;
2838 			freemsg(mp);
2839 		} else {
2840 			tbf_queue(vifp, mp);
2841 			tbf_process_q(vifp);
2842 		}
2843 	}
2844 	if (t->tbf_q_len == 0) {
2845 		id = vifp->v_timeout_id;
2846 		vifp->v_timeout_id = 0;
2847 	}
2848 	mutex_exit(&vifp->v_tbf->tbf_lock);
2849 	if (id != 0)
2850 		(void) untimeout(id);
2851 }
2852 
2853 /*
2854  * Adds a packet to the tbf queue at the interface.
2855  * The ipha is for mcastgrp destination for phyint and encap.
2856  */
2857 static void
2858 tbf_queue(struct vif *vifp, mblk_t *mp)
2859 {
2860 	struct tbf	*t = vifp->v_tbf;
2861 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2862 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2863 
2864 	if (ipst->ips_ip_mrtdebug > 1) {
2865 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2866 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2867 	}
2868 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2869 
2870 	if (t->tbf_t == NULL) {
2871 		/* Queue was empty */
2872 		t->tbf_q = mp;
2873 	} else {
2874 		/* Insert at tail */
2875 		t->tbf_t->b_next = mp;
2876 	}
2877 	/* set new tail pointer */
2878 	t->tbf_t = mp;
2879 
2880 	mp->b_next = mp->b_prev = NULL;
2881 
2882 	t->tbf_q_len++;
2883 }
2884 
2885 /*
2886  * Process the queue at the vif interface.
2887  * Drops the tbf_lock when sending packets.
2888  *
2889  * NOTE : The caller should quntimeout if the queue length is 0.
2890  */
2891 static void
2892 tbf_process_q(struct vif *vifp)
2893 {
2894 	mblk_t	*mp;
2895 	struct tbf	*t = vifp->v_tbf;
2896 	size_t	len;
2897 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2898 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2899 
2900 	if (ipst->ips_ip_mrtdebug > 1) {
2901 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2902 		    "tbf_process_q 1: vif %ld qlen = %d",
2903 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2904 	}
2905 
2906 	/*
2907 	 * Loop through the queue at the interface and send
2908 	 * as many packets as possible.
2909 	 */
2910 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2911 
2912 	while (t->tbf_q_len > 0) {
2913 		mp = t->tbf_q;
2914 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2915 
2916 		/* Determine if the packet can be sent */
2917 		if (len <= t->tbf_n_tok) {
2918 			/*
2919 			 * If so, reduce no. of tokens, dequeue the packet,
2920 			 * send the packet.
2921 			 */
2922 			t->tbf_n_tok -= len;
2923 
2924 			t->tbf_q = mp->b_next;
2925 			if (--t->tbf_q_len == 0) {
2926 				t->tbf_t = NULL;
2927 			}
2928 			mp->b_next = NULL;
2929 			/* Exit mutex before sending packet, then re-enter */
2930 			mutex_exit(&t->tbf_lock);
2931 			tbf_send_packet(vifp, mp);
2932 			mutex_enter(&t->tbf_lock);
2933 		} else
2934 			break;
2935 	}
2936 }
2937 
2938 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2939 static void
2940 tbf_reprocess_q(void *arg)
2941 {
2942 	struct vif *vifp = arg;
2943 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2944 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2945 
2946 	mutex_enter(&vifp->v_tbf->tbf_lock);
2947 	vifp->v_timeout_id = 0;
2948 	tbf_update_tokens(vifp);
2949 
2950 	tbf_process_q(vifp);
2951 
2952 	if (vifp->v_tbf->tbf_q_len > 0) {
2953 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2954 		    TBF_REPROCESS);
2955 	}
2956 	mutex_exit(&vifp->v_tbf->tbf_lock);
2957 
2958 	if (ipst->ips_ip_mrtdebug > 1) {
2959 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2960 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2961 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
2962 	}
2963 }
2964 
2965 /*
2966  * Function that will selectively discard a member of the tbf queue,
2967  * based on the precedence value and the priority.
2968  *
2969  * NOTE : The caller should quntimeout if the queue length is 0.
2970  */
2971 static int
2972 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
2973 {
2974 	uint_t		p;
2975 	struct tbf		*t = vifp->v_tbf;
2976 	mblk_t		**np;
2977 	mblk_t		*last, *mp;
2978 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2979 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2980 
2981 	if (ipst->ips_ip_mrtdebug > 1) {
2982 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2983 		    "dq_sel: vif %ld dst 0x%x",
2984 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
2985 	}
2986 
2987 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2988 	p = priority(vifp, ipha);
2989 
2990 	np = &t->tbf_q;
2991 	last = NULL;
2992 	while ((mp = *np) != NULL) {
2993 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
2994 			*np = mp->b_next;
2995 			/* If removing the last packet, fix the tail pointer */
2996 			if (mp == t->tbf_t)
2997 				t->tbf_t = last;
2998 			mp->b_prev = mp->b_next = NULL;
2999 			freemsg(mp);
3000 			/*
3001 			 * It's impossible for the queue to be empty, but
3002 			 * we check anyway.
3003 			 */
3004 			if (--t->tbf_q_len == 0) {
3005 				t->tbf_t = NULL;
3006 			}
3007 			ipst->ips_mrtstat->mrts_drop_sel++;
3008 			return (1);
3009 		}
3010 		np = &mp->b_next;
3011 		last = mp;
3012 	}
3013 	return (0);
3014 }
3015 
3016 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3017 static void
3018 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3019 {
3020 	ipif_t  *ipif;
3021 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3022 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3023 
3024 	/* If encap tunnel options */
3025 	if (vifp->v_flags & VIFF_TUNNEL)  {
3026 		if (ipst->ips_ip_mrtdebug > 1) {
3027 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3028 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
3029 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3030 		}
3031 
3032 		/*
3033 		 * Feed into ip_wput which will set the ident field and
3034 		 * checksum the encapsulating header.
3035 		 * BSD gets the cached route vifp->v_route from ip_output()
3036 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3037 		 */
3038 		put(vifp->v_ipif->ipif_wq, mp);
3039 		return;
3040 
3041 		/* phyint */
3042 	} else {
3043 		/* Need to loop back to members on the outgoing interface. */
3044 		ipha_t  *ipha;
3045 		ipaddr_t    dst;
3046 		ipha  = (ipha_t *)mp->b_rptr;
3047 		dst  = ipha->ipha_dst;
3048 		ipif = vifp->v_ipif;
3049 
3050 		mutex_enter(&ipif->ipif_ill->ill_lock);
3051 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
3052 			/*
3053 			 * The packet is not yet reassembled, thus we need to
3054 			 * pass it to ip_rput_local for checksum verification
3055 			 * and reassembly (and fanout the user stream).
3056 			 */
3057 			mblk_t 	*mp_loop;
3058 			ire_t	*ire;
3059 
3060 			mutex_exit(&ipif->ipif_ill->ill_lock);
3061 			if (ipst->ips_ip_mrtdebug > 1) {
3062 				(void) mi_strlog(mrouter->conn_rq, 1,
3063 				    SL_TRACE,
3064 				    "tbf_send_pkt: loopback vif %ld",
3065 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3066 			}
3067 			mp_loop = copymsg(mp);
3068 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
3069 			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3070 
3071 			if (mp_loop != NULL && ire != NULL) {
3072 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
3073 				    ((ipha_t *)mp_loop->b_rptr),
3074 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
3075 			} else {
3076 				/* Either copymsg failed or no ire */
3077 				(void) mi_strlog(mrouter->conn_rq, 1,
3078 				    SL_TRACE,
3079 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
3080 				    "vif %ld\n", (void *)mp_loop, (void *)ire,
3081 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3082 			}
3083 			if (ire != NULL)
3084 				ire_refrele(ire);
3085 		} else {
3086 			mutex_exit(&ipif->ipif_ill->ill_lock);
3087 		}
3088 		if (ipst->ips_ip_mrtdebug > 1) {
3089 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3090 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3091 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3092 		}
3093 		ip_rput_forward_multicast(dst, mp, ipif);
3094 	}
3095 }
3096 
3097 /*
3098  * Determine the current time and then the elapsed time (between the last time
3099  * and time now).  Update the no. of tokens in the bucket.
3100  */
3101 static void
3102 tbf_update_tokens(struct vif *vifp)
3103 {
3104 	timespec_t	tp;
3105 	hrtime_t	tm;
3106 	struct tbf	*t = vifp->v_tbf;
3107 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3108 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3109 
3110 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3111 
3112 	/* Time in secs and nsecs, rate limit in kbits/sec */
3113 	gethrestime(&tp);
3114 
3115 	/*LINTED*/
3116 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3117 
3118 	/*
3119 	 * This formula is actually
3120 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3121 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3122 	 *
3123 	 * The (1000/1024) was introduced in add_vif to optimize
3124 	 * this divide into a shift.
3125 	 */
3126 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3127 	t->tbf_last_pkt_t = tp;
3128 
3129 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3130 		t->tbf_n_tok = MAX_BKT_SIZE;
3131 	if (ipst->ips_ip_mrtdebug > 1) {
3132 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3133 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3134 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3135 	}
3136 }
3137 
3138 /*
3139  * Priority currently is based on port nos.
3140  * Different forwarding mechanisms have different ways
3141  * of obtaining the port no. Hence, the vif must be
3142  * given along with the packet itself.
3143  *
3144  */
3145 static int
3146 priority(struct vif *vifp, ipha_t *ipha)
3147 {
3148 	int prio;
3149 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3150 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3151 
3152 	/* Temporary hack; may add general packet classifier some day */
3153 
3154 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3155 
3156 	/*
3157 	 * The UDP port space is divided up into four priority ranges:
3158 	 * [0, 16384)	: unclassified - lowest priority
3159 	 * [16384, 32768)	: audio - highest priority
3160 	 * [32768, 49152)	: whiteboard - medium priority
3161 	 * [49152, 65536)	: video - low priority
3162 	 */
3163 
3164 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3165 		struct udphdr *udp =
3166 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3167 		switch (ntohs(udp->uh_dport) & 0xc000) {
3168 		case 0x4000:
3169 			prio = 70;
3170 			break;
3171 		case 0x8000:
3172 			prio = 60;
3173 			break;
3174 		case 0xc000:
3175 			prio = 55;
3176 			break;
3177 		default:
3178 			prio = 50;
3179 			break;
3180 		}
3181 		if (ipst->ips_ip_mrtdebug > 1) {
3182 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3183 			    "priority: port %x prio %d\n",
3184 			    ntohs(udp->uh_dport), prio);
3185 		}
3186 	} else
3187 		prio = 50;  /* default priority */
3188 	return (prio);
3189 }
3190 
3191 /*
3192  * End of token bucket filter modifications
3193  */
3194 
3195 
3196 
3197 /*
3198  * Produces data for netstat -M.
3199  */
3200 int
3201 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3202 {
3203 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3204 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3205 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3206 		sizeof (struct mrtstat))) {
3207 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3208 		    (size_t)sizeof (struct mrtstat)));
3209 		return (0);
3210 	}
3211 	return (1);
3212 }
3213 
3214 /*
3215  * Sends info for SNMP's MIB.
3216  */
3217 int
3218 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3219 {
3220 	struct vifctl 	vi;
3221 	vifi_t		vifi;
3222 
3223 	mutex_enter(&ipst->ips_numvifs_mutex);
3224 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3225 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3226 			continue;
3227 		/*
3228 		 * No locks here, an approximation is fine.
3229 		 */
3230 		vi.vifc_vifi = vifi;
3231 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3232 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3233 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3234 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3235 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3236 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3237 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3238 
3239 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3240 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3241 			    (size_t)sizeof (vi)));
3242 			return (0);
3243 		}
3244 	}
3245 	mutex_exit(&ipst->ips_numvifs_mutex);
3246 	return (1);
3247 }
3248 
3249 /*
3250  * Called by ip_snmp_get to send up multicast routing table.
3251  */
3252 int
3253 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3254 {
3255 	int			i, j;
3256 	struct mfc		*rt;
3257 	struct mfcctl	mfcc;
3258 
3259 	/*
3260 	 * Make sure multicast has not been turned off.
3261 	 */
3262 	if (is_mrouter_off(ipst))
3263 		return (1);
3264 
3265 	/* Loop over all hash buckets and their chains */
3266 	for (i = 0; i < MFCTBLSIZ; i++) {
3267 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3268 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3269 			mutex_enter(&rt->mfc_mutex);
3270 			if (rt->mfc_rte != NULL ||
3271 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3272 				mutex_exit(&rt->mfc_mutex);
3273 				continue;
3274 			}
3275 			mfcc.mfcc_origin = rt->mfc_origin;
3276 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3277 			mfcc.mfcc_parent = rt->mfc_parent;
3278 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3279 			mutex_enter(&ipst->ips_numvifs_mutex);
3280 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3281 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3282 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3283 				mfcc.mfcc_ttls[j] = 0;
3284 			mutex_exit(&ipst->ips_numvifs_mutex);
3285 
3286 			mutex_exit(&rt->mfc_mutex);
3287 			if (!snmp_append_data(mp, (char *)&mfcc,
3288 			    sizeof (mfcc))) {
3289 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3290 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3291 				    (size_t)sizeof (mfcc)));
3292 				return (0);
3293 			}
3294 		}
3295 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3296 	}
3297 	return (1);
3298 }
3299