xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision bea83d026ee1bd1b2a2419e1d0232f107a5d7d9b)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.
23  * All rights reserved.  Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Procedures for the kernel part of DVMRP,
31  * a Distance-Vector Multicast Routing Protocol.
32  * (See RFC-1075)
33  * Written by David Waitzman, BBN Labs, August 1988.
34  * Modified by Steve Deering, Stanford, February 1989.
35  * Modified by Mark J. Steiglitz, Stanford, May, 1991
36  * Modified by Van Jacobson, LBL, January 1993
37  * Modified by Ajit Thyagarajan, PARC, August 1993
38  * Modified by Bill Fenner, PARC, April 1995
39  *
40  * MROUTING 3.5
41  */
42 
43 /*
44  * TODO
45  * - function pointer field in vif, void *vif_sendit()
46  */
47 
48 #include <sys/types.h>
49 #include <sys/stream.h>
50 #include <sys/stropts.h>
51 #include <sys/strlog.h>
52 #include <sys/systm.h>
53 #include <sys/ddi.h>
54 #include <sys/cmn_err.h>
55 #include <sys/zone.h>
56 
57 #include <sys/param.h>
58 #include <sys/socket.h>
59 #include <sys/vtrace.h>
60 #include <sys/debug.h>
61 #include <net/if.h>
62 #include <sys/sockio.h>
63 #include <netinet/in.h>
64 #include <net/if_dl.h>
65 
66 #include <inet/common.h>
67 #include <inet/mi.h>
68 #include <inet/nd.h>
69 #include <inet/mib2.h>
70 #include <netinet/ip6.h>
71 #include <inet/ip.h>
72 #include <inet/snmpcom.h>
73 
74 #include <netinet/igmp.h>
75 #include <netinet/igmp_var.h>
76 #include <netinet/udp.h>
77 #include <netinet/ip_mroute.h>
78 #include <inet/ip_multi.h>
79 #include <inet/ip_ire.h>
80 #include <inet/ip_if.h>
81 #include <inet/ipclassifier.h>
82 
83 #include <netinet/pim.h>
84 
85 
86 /*
87  * MT Design:
88  *
89  * There are three main data structures viftable, mfctable and tbftable that
90  * need to be protected against MT races.
91  *
92  * viftable is a fixed length array of vif structs. There is no lock to protect
93  * the whole array, instead each struct is protected by its own individual lock.
94  * The value of v_marks in conjunction with the value of v_refcnt determines the
95  * current state of a vif structure. One special state that needs mention
96  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
97  * that vif is being initialized.
98  * Each structure is freed when the refcnt goes down to zero. If a delete comes
99  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
100  * which prevents the struct from further use.  When the refcnt goes to zero
101  * the struct is freed and is marked VIF_MARK_NOTINUSE.
102  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
103  * from  going away a refhold is put on the ipif before using it. see
104  * lock_good_vif() and unlock_good_vif().
105  *
106  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
107  * of the vif struct.
108  *
109  * tbftable is also a fixed length array of tbf structs and is only accessed
110  * via v_tbf.  It is protected by its own lock tbf_lock.
111  *
112  * Lock Ordering is
113  * v_lock --> tbf_lock
114  * v_lock --> ill_lock
115  *
116  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
117  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
118  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
119  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
120  * protect the struct elements.
121  *
122  * mfc structs are dynamically allocated and are singly linked
123  * at the head of the chain. When an mfc structure is to be deleted
124  * it is marked condemned and so is the state in the bucket struct.
125  * When the last walker of the hash bucket exits all the mfc structs
126  * marked condemned are freed.
127  *
128  * Locking Hierarchy:
129  * The bucket lock should be acquired before the mfc struct lock.
130  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
131  * operations on the bucket struct.
132  *
133  * last_encap_lock and numvifs_mutex should be acquired after
134  * acquiring vif or mfc locks. These locks protect some global variables.
135  *
136  * The statistics are not currently protected by a lock
137  * causing the stats to be approximate, not exact.
138  */
139 
140 #define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */
141 
142 /*
143  * Timeouts:
144  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
145  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
146  *	SunOS 5.x uses mfc->timeout for each mfc.
147  *	Some Unixes are limited in the number of simultaneous timeouts
148  * 	that can be run, SunOS 5.x does not have this restriction.
149  */
150 
151 /*
152  * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
153  * UPCALL_EXPIRE is the number of timeouts before a particular upcall
154  * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
155  */
156 #define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
157 #define		UPCALL_EXPIRE	6	/* number of timeouts	*/
158 
159 /*
160  * Hash function for a source, group entry
161  */
162 #define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
163 	((g) >> 20) ^ ((g) >> 10) ^ (g))
164 
165 #define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/
166 
167 /* Identify PIM packet that came on a Register interface */
168 #define	PIM_REGISTER_MARKER	0xffffffff
169 
170 /* Function declarations */
171 static int	add_mfc(struct mfcctl *, ip_stack_t *);
172 static int	add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
173 static int	del_mfc(struct mfcctl *, ip_stack_t *);
174 static int	del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
175 static void	del_vifp(struct vif *);
176 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
177 static void	expire_upcalls(void *);
178 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
179 static void	free_queue(struct mfc *);
180 static int	get_assert(uchar_t *, ip_stack_t *);
181 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
182 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
183 static int	get_version(uchar_t *);
184 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
185 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
186 		    ipaddr_t, struct mfc *);
187 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
188 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
189 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
190 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
191 static int	set_assert(int *, ip_stack_t *);
192 
193 /*
194  * Token Bucket Filter functions
195  */
196 static int  priority(struct vif *, ipha_t *);
197 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
198 static int  tbf_dq_sel(struct vif *, ipha_t *);
199 static void tbf_process_q(struct vif *);
200 static void tbf_queue(struct vif *, mblk_t *);
201 static void tbf_reprocess_q(void *);
202 static void tbf_send_packet(struct vif *, mblk_t *);
203 static void tbf_update_tokens(struct vif *);
204 static void release_mfc(struct mfcb *);
205 
206 static boolean_t is_mrouter_off(ip_stack_t *);
207 /*
208  * Encapsulation packets
209  */
210 
211 #define	ENCAP_TTL	64
212 
/*
 * Prototype IP hdr for encapsulated packets.  NOTE(review): the zero
 * fields (id, checksum) and the addresses are presumably filled in per
 * packet by the encapsulation path (encap_send) — confirm there.
 */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,	/* ttl, protocol */
	0,				/* checksum */
};
223 
224 /*
225  * Rate limit for assert notification messages, in nsec.
226  */
227 #define	ASSERT_MSG_TIME		3000000000
228 
229 
230 #define	VIF_REFHOLD(vifp) {			\
231 	mutex_enter(&(vifp)->v_lock);		\
232 	(vifp)->v_refcnt++;			\
233 	mutex_exit(&(vifp)->v_lock);		\
234 }
235 
236 #define	VIF_REFRELE_LOCKED(vifp) {				\
237 	(vifp)->v_refcnt--;					\
238 	if ((vifp)->v_refcnt == 0 &&				\
239 		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
240 			del_vifp(vifp);				\
241 	} else {						\
242 		mutex_exit(&(vifp)->v_lock);			\
243 	}							\
244 }
245 
/*
 * Drop a reference on a vif, acquiring v_lock first.  Defined in terms
 * of VIF_REFRELE_LOCKED so the release/condemned-cleanup logic exists
 * in exactly one place instead of being duplicated.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
256 
257 #define	MFCB_REFHOLD(mfcb) {				\
258 	mutex_enter(&(mfcb)->mfcb_lock);		\
259 	(mfcb)->mfcb_refcnt++;				\
260 	ASSERT((mfcb)->mfcb_refcnt != 0);		\
261 	mutex_exit(&(mfcb)->mfcb_lock);			\
262 }
263 
264 #define	MFCB_REFRELE(mfcb) {					\
265 	mutex_enter(&(mfcb)->mfcb_lock);			\
266 	ASSERT((mfcb)->mfcb_refcnt != 0);			\
267 	if (--(mfcb)->mfcb_refcnt == 0 &&			\
268 		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
269 			release_mfc(mfcb);			\
270 	}							\
271 	mutex_exit(&(mfcb)->mfcb_lock);				\
272 }
273 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries already
 * marked condemned.  Caller is expected to hold a refcnt on the bucket
 * (MFCB_REFHOLD) so the chain cannot disappear mid-walk — see get_sg_cnt().
 * Type of service parameter to be added in the future!
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt = NULL; \
	rt = NULL; \
	_mb_rt = mfcbp->mfcb_mfc; \
	while (_mb_rt) { \
		if ((_mb_rt->mfc_origin.s_addr == o) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == g) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) {        \
		    rt = _mb_rt; \
		    break; \
		} \
	_mb_rt = _mb_rt->mfc_next; \
	} \
}
295 
296 /*
297  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
298  * are inefficient. We use gethrestime() which returns a timespec_t with
299  * sec and nsec, the resolution is machine dependent.
300  * The following 2 macros have been changed to use nsec instead of usec.
301  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 * Delta should be a hrtime_t.
 *
 * TV_DELTA(a, b, delta): delta = (a - b) in nanoseconds, where a and b
 * are timespec-style values (tv_sec/tv_nsec) with a >= b.  The switch
 * avoids a multiply for the common 1- and 2-second differences.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	delta = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    delta += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    delta += 1000000000; \
		    break; \
		default: \
		    delta += (1000000000 * xxs); \
		} \
	} \
}

/* TV_LT(a, b): true when time a is strictly earlier than time b. */
#define	TV_LT(a, b) (((a).tv_nsec < (b).tv_nsec && \
	(a).tv_sec <= (b).tv_sec) || (a).tv_sec < (b).tv_sec)
327 
328 /*
329  * Handle MRT setsockopt commands to modify the multicast routing tables.
330  */
331 int
332 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
333     int datalen, mblk_t *first_mp)
334 {
335 	conn_t		*connp = Q_TO_CONN(q);
336 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
337 
338 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
339 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
340 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
341 		return (EACCES);
342 	}
343 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
344 
345 	if (checkonly) {
346 		/*
347 		 * do not do operation, just pretend to - new T_CHECK
348 		 * Note: Even routines further on can probably fail but
349 		 * this T_CHECK stuff is only to please XTI so it not
350 		 * necessary to be perfect.
351 		 */
352 		switch (cmd) {
353 		case MRT_INIT:
354 		case MRT_DONE:
355 		case MRT_ADD_VIF:
356 		case MRT_DEL_VIF:
357 		case MRT_ADD_MFC:
358 		case MRT_DEL_MFC:
359 		case MRT_ASSERT:
360 			return (0);
361 		default:
362 			return (EOPNOTSUPP);
363 		}
364 	}
365 
366 	/*
367 	 * make sure no command is issued after multicast routing has been
368 	 * turned off.
369 	 */
370 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
371 		if (is_mrouter_off(ipst))
372 			return (EINVAL);
373 	}
374 
375 	switch (cmd) {
376 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
377 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
378 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
379 			    first_mp, ipst));
380 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
381 			    ipst));
382 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
383 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
384 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
385 	default:	   return (EOPNOTSUPP);
386 	}
387 }
388 
389 /*
390  * Handle MRT getsockopt commands
391  */
392 int
393 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
394 {
395 	conn_t		*connp = Q_TO_CONN(q);
396 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
397 
398 	if (connp != ipst->ips_ip_g_mrouter)
399 		return (EACCES);
400 
401 	switch (cmd) {
402 	case MRT_VERSION:	return (get_version((uchar_t *)data));
403 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
404 	default:		return (EOPNOTSUPP);
405 	}
406 }
407 
408 /*
409  * Handle ioctl commands to obtain information from the cache.
410  * Called with shared access to IP. These are read_only ioctls.
411  */
412 /* ARGSUSED */
413 int
414 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
415     ip_ioctl_cmd_t *ipip, void *if_req)
416 {
417 	mblk_t	*mp1;
418 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
419 	conn_t		*connp = Q_TO_CONN(q);
420 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
421 
422 	/* Existence verified in ip_wput_nondata */
423 	mp1 = mp->b_cont->b_cont;
424 
425 	switch (iocp->ioc_cmd) {
426 	case (SIOCGETVIFCNT):
427 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
428 	case (SIOCGETSGCNT):
429 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
430 	case (SIOCGETLSGCNT):
431 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
432 	default:
433 		return (EINVAL);
434 	}
435 }
436 
437 /*
438  * Returns the packet, byte, rpf-failure count for the source, group provided.
439  */
440 static int
441 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
442 {
443 	struct mfc *rt;
444 	struct mfcb *mfcbp;
445 
446 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
447 	MFCB_REFHOLD(mfcbp);
448 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
449 
450 	if (rt != NULL) {
451 		mutex_enter(&rt->mfc_mutex);
452 		req->pktcnt   = rt->mfc_pkt_cnt;
453 		req->bytecnt  = rt->mfc_byte_cnt;
454 		req->wrong_if = rt->mfc_wrong_if;
455 		mutex_exit(&rt->mfc_mutex);
456 	} else
457 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
458 
459 	MFCB_REFRELE(mfcbp);
460 	return (0);
461 }
462 
463 /*
464  * Returns the packet, byte, rpf-failure count for the source, group provided.
465  * Uses larger counters and IPv6 addresses.
466  */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
{
	/* XXX TODO SIOCGETLSGCNT - not implemented; always fails. */
	return (ENXIO);
}
474 
475 /*
476  * Returns the input and output packet and byte counts on the vif provided.
477  */
478 static int
479 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
480 {
481 	vifi_t vifi = req->vifi;
482 
483 	if (vifi >= ipst->ips_numvifs)
484 		return (EINVAL);
485 
486 	/*
487 	 * No locks here, an approximation is fine.
488 	 */
489 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
490 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
491 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
492 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
493 
494 	return (0);
495 }
496 
497 static int
498 get_version(uchar_t *data)
499 {
500 	int *v = (int *)data;
501 
502 	*v = 0x0305;	/* XXX !!!! */
503 
504 	return (0);
505 }
506 
507 /*
508  * Set PIM assert processing global.
509  */
510 static int
511 set_assert(int *i, ip_stack_t *ipst)
512 {
513 	if ((*i != 1) && (*i != 0))
514 		return (EINVAL);
515 
516 	ipst->ips_pim_assert = *i;
517 
518 	return (0);
519 }
520 
521 /*
522  * Get PIM assert processing global.
523  */
524 static int
525 get_assert(uchar_t *data, ip_stack_t *ipst)
526 {
527 	int *i = (int *)data;
528 
529 	*i = ipst->ips_pim_assert;
530 
531 	return (0);
532 }
533 
534 /*
535  * Enable multicast routing.
536  */
static int
ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
{
	int	*v;

	/* Option value must be a single int with the value 1. */
	if (data == NULL || (datalen != sizeof (int)))
		return (ENOPROTOOPT);

	v = (int *)data;
	if (*v != 1)
		return (ENOPROTOOPT);

	/* Only one multicast routing daemon per IP stack. */
	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
	if (ipst->ips_ip_g_mrouter != NULL) {
		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
		return (EADDRINUSE);
	}

	/*
	 * MRT_INIT should only be allowed for RAW sockets, but we double
	 * check.
	 */
	if (!IPCL_IS_RAWIP(connp)) {
		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
		return (EINVAL);
	}

	ipst->ips_ip_g_mrouter = connp;
	connp->conn_multi_router = 1;
	/* In order for tunnels to work we have to turn ip_g_forward on */
	if (!WE_ARE_FORWARDING(ipst)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
			    "ip_mrouter_init: turning on forwarding");
		}
		/* Save the old setting so ip_mrouter_done() can restore it. */
		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
	}

	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
	return (0);
}
579 
/*
 * Per-stack setup: allocate the vif, mfc and tbf tables and their locks.
 * Mirrored by ip_mrouter_stack_destroy(); allocation sizes must match
 * the kmem_free() calls there.
 */
void
ip_mrouter_stack_init(ip_stack_t *ipst)
{
	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);

	/* NOTE(review): MAXVIFS+1 entries — one extra slot; reason not evident here. */
	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
	    KM_SLEEP);
	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
	/*
	 * mfctable:
	 * Includes all mfcs, including waiting upcalls.
	 * Multiple mfcs per bucket.
	 */
	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
	    KM_SLEEP);
	/*
	 * Define the token bucket filter structures.
	 * tbftable -> each vif has one of these for storing info.
	 */
	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);

	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Record structure sizes in the stats block. */
	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
}
606 
607 /*
608  * Disable multicast routing.
609  * Didn't use global timeout_val (BSD version), instead check the mfctable.
610  */
611 int
612 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
613 {
614 	conn_t		*mrouter;
615 	vifi_t 		vifi;
616 	struct mfc	*mfc_rt;
617 	int		i;
618 
619 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
620 	if (ipst->ips_ip_g_mrouter == NULL) {
621 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
622 		return (EINVAL);
623 	}
624 
625 	mrouter = ipst->ips_ip_g_mrouter;
626 
627 	if (ipst->ips_saved_ip_g_forward != -1) {
628 		if (ipst->ips_ip_mrtdebug > 1) {
629 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
630 			    "ip_mrouter_done: turning off forwarding");
631 		}
632 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
633 		ipst->ips_saved_ip_g_forward = -1;
634 	}
635 
636 	/*
637 	 * Always clear cache when vifs change.
638 	 * No need to get ipst->ips_last_encap_lock since we are running as
639 	 * a writer.
640 	 */
641 	mutex_enter(&ipst->ips_last_encap_lock);
642 	ipst->ips_last_encap_src = 0;
643 	ipst->ips_last_encap_vif = NULL;
644 	mutex_exit(&ipst->ips_last_encap_lock);
645 	mrouter->conn_multi_router = 0;
646 
647 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
648 
649 	/*
650 	 * For each phyint in use,
651 	 * disable promiscuous reception of all IP multicasts.
652 	 */
653 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
654 		struct vif *vifp = ipst->ips_vifs + vifi;
655 
656 		mutex_enter(&vifp->v_lock);
657 		/*
658 		 * if the vif is active mark it condemned.
659 		 */
660 		if (vifp->v_marks & VIF_MARK_GOOD) {
661 			ASSERT(vifp->v_ipif != NULL);
662 			ipif_refhold(vifp->v_ipif);
663 			/* Phyint only */
664 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
665 				ipif_t *ipif = vifp->v_ipif;
666 				ipsq_t  *ipsq;
667 				boolean_t suc;
668 				ill_t *ill;
669 
670 				ill = ipif->ipif_ill;
671 				suc = B_FALSE;
672 				if (mp == NULL) {
673 					/*
674 					 * being called from ip_close,
675 					 * lets do it synchronously.
676 					 * Clear VIF_MARK_GOOD and
677 					 * set VIF_MARK_CONDEMNED.
678 					 */
679 					vifp->v_marks &= ~VIF_MARK_GOOD;
680 					vifp->v_marks |= VIF_MARK_CONDEMNED;
681 					mutex_exit(&(vifp)->v_lock);
682 					suc = ipsq_enter(ill, B_FALSE);
683 					ipsq = ill->ill_phyint->phyint_ipsq;
684 				} else {
685 					ipsq = ipsq_try_enter(ipif, NULL,
686 					    mrouter->conn_wq, mp,
687 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
688 					if (ipsq == NULL) {
689 						mutex_exit(&(vifp)->v_lock);
690 						ipif_refrele(ipif);
691 						return (EINPROGRESS);
692 					}
693 					/*
694 					 * Clear VIF_MARK_GOOD and
695 					 * set VIF_MARK_CONDEMNED.
696 					 */
697 					vifp->v_marks &= ~VIF_MARK_GOOD;
698 					vifp->v_marks |= VIF_MARK_CONDEMNED;
699 					mutex_exit(&(vifp)->v_lock);
700 					suc = B_TRUE;
701 				}
702 
703 				if (suc) {
704 					(void) ip_delmulti(INADDR_ANY, ipif,
705 					    B_TRUE, B_TRUE);
706 					ipsq_exit(ipsq, B_TRUE, B_TRUE);
707 				}
708 				mutex_enter(&vifp->v_lock);
709 			}
710 			/*
711 			 * decreases the refcnt added in add_vif.
712 			 * and release v_lock.
713 			 */
714 			VIF_REFRELE_LOCKED(vifp);
715 		} else {
716 			mutex_exit(&vifp->v_lock);
717 			continue;
718 		}
719 	}
720 
721 	mutex_enter(&ipst->ips_numvifs_mutex);
722 	ipst->ips_numvifs = 0;
723 	ipst->ips_pim_assert = 0;
724 	ipst->ips_reg_vif_num = ALL_VIFS;
725 	mutex_exit(&ipst->ips_numvifs_mutex);
726 
727 	/*
728 	 * Free upcall msgs.
729 	 * Go through mfctable and stop any outstanding upcall
730 	 * timeouts remaining on mfcs.
731 	 */
732 	for (i = 0; i < MFCTBLSIZ; i++) {
733 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
734 		ipst->ips_mfcs[i].mfcb_refcnt++;
735 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
736 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
737 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
738 		while (mfc_rt) {
739 			/* Free upcalls */
740 			mutex_enter(&mfc_rt->mfc_mutex);
741 			if (mfc_rt->mfc_rte != NULL) {
742 				if (mfc_rt->mfc_timeout_id != 0) {
743 					/*
744 					 * OK to drop the lock as we have
745 					 * a refcnt on the bucket. timeout
746 					 * can fire but it will see that
747 					 * mfc_timeout_id == 0 and not do
748 					 * anything. see expire_upcalls().
749 					 */
750 					mfc_rt->mfc_timeout_id = 0;
751 					mutex_exit(&mfc_rt->mfc_mutex);
752 					(void) untimeout(
753 					    mfc_rt->mfc_timeout_id);
754 						mfc_rt->mfc_timeout_id = 0;
755 					mutex_enter(&mfc_rt->mfc_mutex);
756 
757 					/*
758 					 * all queued upcall packets
759 					 * and mblk will be freed in
760 					 * release_mfc().
761 					 */
762 				}
763 			}
764 
765 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
766 
767 			mutex_exit(&mfc_rt->mfc_mutex);
768 			mfc_rt = mfc_rt->mfc_next;
769 		}
770 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
771 	}
772 
773 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
774 	ipst->ips_ip_g_mrouter = NULL;
775 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
776 	return (0);
777 }
778 
/*
 * Per-stack teardown: free any mfc entries still chained in the hash
 * table, then release the tables allocated in ip_mrouter_stack_init().
 */
void
ip_mrouter_stack_destroy(ip_stack_t *ipst)
{
	struct mfcb *mfcbp;
	struct mfc  *rt;
	int i;

	for (i = 0; i < MFCTBLSIZ; i++) {
		mfcbp = &ipst->ips_mfcs[i];

		while ((rt = mfcbp->mfcb_mfc) != NULL) {
			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
			    i);

			/* Unlink, then free queued packets and the mfc. */
			mfcbp->mfcb_mfc = rt->mfc_next;
			free_queue(rt);
			mi_free(rt);
		}
	}
	/* Sizes must match the kmem_zalloc calls in ip_mrouter_stack_init(). */
	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
	ipst->ips_vifs = NULL;
	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
	ipst->ips_mrtstat = NULL;
	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
	ipst->ips_mfcs = NULL;
	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
	ipst->ips_tbfs = NULL;

	mutex_destroy(&ipst->ips_last_encap_lock);
	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
}
810 
811 static boolean_t
812 is_mrouter_off(ip_stack_t *ipst)
813 {
814 	conn_t	*mrouter;
815 
816 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
817 	if (ipst->ips_ip_g_mrouter == NULL) {
818 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
819 		return (B_TRUE);
820 	}
821 
822 	mrouter = ipst->ips_ip_g_mrouter;
823 	if (mrouter->conn_multi_router == 0) {
824 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
825 		return (B_TRUE);
826 	}
827 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
828 	return (B_FALSE);
829 }
830 
/*
 * Undo lock_good_vif(): drop the ipif reference first, then the vif
 * refcnt (which may free the vif if it has been condemned meanwhile).
 */
static void
unlock_good_vif(struct vif *vifp)
{
	ASSERT(vifp->v_ipif != NULL);
	ipif_refrele(vifp->v_ipif);
	VIF_REFRELE(vifp);
}
838 
/*
 * Take a reference on an in-use vif and on its underlying ipif.
 * Returns B_TRUE with both references held, B_FALSE if the vif is not
 * currently usable.  Pair with unlock_good_vif().
 */
static boolean_t
lock_good_vif(struct vif *vifp)
{
	mutex_enter(&vifp->v_lock);
	/* Only vifs fully set up by add_vif() are usable. */
	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
		mutex_exit(&vifp->v_lock);
		return (B_FALSE);
	}

	ASSERT(vifp->v_ipif != NULL);
	/* Lock order: v_lock before ill_lock (see file header comment). */
	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
		mutex_exit(&vifp->v_lock);
		return (B_FALSE);
	}
	ipif_refhold_locked(vifp->v_ipif);
	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
	vifp->v_refcnt++;
	mutex_exit(&vifp->v_lock);
	return (B_TRUE);
}
861 
862 /*
863  * Add a vif to the vif table.
864  */
static int
add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
{
	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
	ipif_t		*ipif;
	int		error;
	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
	ipsq_t  	*ipsq;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(connp != NULL);

	if (vifcp->vifc_vifi >= MAXVIFS)
		return (EINVAL);

	if (is_mrouter_off(ipst))
		return (EINVAL);

	mutex_enter(&vifp->v_lock);
	/*
	 * Viftable entry should be 0.
	 * if v_marks == 0 but v_refcnt != 0 means struct is being
	 * initialized.
	 *
	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
	 * request while the delete is in progress, mrouted only sends add
	 * requests when a new interface is added and the new interface cannot
	 * have the same vifi as an existing interface. We make sure that
	 * ill_delete will block till the vif is deleted by adding a refcnt
	 * to ipif in del_vif().
	 */
	if (vifp->v_lcl_addr.s_addr != 0 ||
	    vifp->v_marks != 0 ||
	    vifp->v_refcnt != 0) {
		mutex_exit(&vifp->v_lock);
		return (EADDRINUSE);
	}

	/* Incoming vif should not be 0 */
	if (vifcp->vifc_lcl_addr.s_addr == 0) {
		mutex_exit(&vifp->v_lock);
		return (EINVAL);
	}

	/* Hold the slot so a concurrent add sees it as being initialized. */
	vifp->v_refcnt++;
	mutex_exit(&vifp->v_lock);
	/* Find the interface with the local address */
	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
	    ip_restart_optmgmt, &error, ipst);
	if (ipif == NULL) {
		VIF_REFRELE(vifp);
		if (error == EINPROGRESS)
			return (error);
		return (EADDRNOTAVAIL);
	}

	/*
	 * We have to be exclusive as we have to call ip_addmulti()
	 * This is the best position to try to be exclusive in case
	 * we have to wait.
	 */
	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
	    ip_restart_optmgmt, NEW_OP, B_TRUE);
	if ((ipsq) == NULL) {
		VIF_REFRELE(vifp);
		ipif_refrele(ipif);
		return (EINPROGRESS);
	}

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "add_vif: src 0x%x enter",
		    vifcp->vifc_lcl_addr.s_addr);
	}

	mutex_enter(&vifp->v_lock);
	/*
	 * Always clear cache when vifs change.
	 * Needed to ensure that src isn't left over from before vif was added.
	 * No need to get last_encap_lock, since we are running as a writer.
	 */

	mutex_enter(&ipst->ips_last_encap_lock);
	ipst->ips_last_encap_src = 0;
	ipst->ips_last_encap_vif = NULL;
	mutex_exit(&ipst->ips_last_encap_lock);

	if (vifcp->vifc_flags & VIFF_TUNNEL) {
		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
			cmn_err(CE_WARN,
			    "add_vif: source route tunnels not supported\n");
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (EOPNOTSUPP);
		}
		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;

	} else {
		/* Phyint or Register vif */
		if (vifcp->vifc_flags & VIFF_REGISTER) {
			/*
			 * Note: Since all IPPROTO_IP level options (including
			 * MRT_ADD_VIF) are done exclusively via
			 * ip_optmgmt_writer(), a lock is not necessary to
			 * protect reg_vif_num.
			 */
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (ipst->ips_reg_vif_num == ALL_VIFS) {
				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/* A register vif already exists. */
				mutex_exit(&ipst->ips_numvifs_mutex);
				VIF_REFRELE_LOCKED(vifp);
				ipif_refrele(ipif);
				ipsq_exit(ipsq, B_TRUE, B_TRUE);
				return (EADDRINUSE);
			}
		}

		/* Make sure the interface supports multicast */
		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			/* Undo the reg_vif_num claim made above. */
			if (vifcp->vifc_flags & VIFF_REGISTER) {
				mutex_enter(&ipst->ips_numvifs_mutex);
				ipst->ips_reg_vif_num = ALL_VIFS;
				mutex_exit(&ipst->ips_numvifs_mutex);
			}
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (EOPNOTSUPP);
		}
		/* Enable promiscuous reception of all IP mcasts from the if */
		mutex_exit(&vifp->v_lock);
		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
		    MODE_IS_EXCLUDE, NULL);
		mutex_enter(&vifp->v_lock);
		/*
		 * since we released the lock lets make sure that
		 * ip_mrouter_done() has not been called.
		 */
		if (error != 0 || is_mrouter_off(ipst)) {
			if (error == 0)
				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
				    B_TRUE);
			if (vifcp->vifc_flags & VIFF_REGISTER) {
				mutex_enter(&ipst->ips_numvifs_mutex);
				ipst->ips_reg_vif_num = ALL_VIFS;
				mutex_exit(&ipst->ips_numvifs_mutex);
			}
			VIF_REFRELE_LOCKED(vifp);
			ipif_refrele(ipif);
			ipsq_exit(ipsq, B_TRUE, B_TRUE);
			return (error?error:EINVAL);
		}
	}
	/* Define parameters for the tbf structure */
	vifp->v_tbf = v_tbf;
	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
	vifp->v_tbf->tbf_n_tok = 0;
	vifp->v_tbf->tbf_q_len = 0;
	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;

	vifp->v_flags = vifcp->vifc_flags;
	vifp->v_threshold = vifcp->vifc_threshold;
	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
	vifp->v_ipif = ipif;
	ipif_refrele(ipif);
	/* Scaling up here, allows division by 1024 in critical code.	*/
	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
	vifp->v_timeout_id = 0;
	/* initialize per vif pkt counters */
	vifp->v_pkt_in = 0;
	vifp->v_pkt_out = 0;
	vifp->v_bytes_in = 0;
	vifp->v_bytes_out = 0;
	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);

	/* Adjust numvifs up, if the vifi is higher than numvifs */
	mutex_enter(&ipst->ips_numvifs_mutex);
	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
	mutex_exit(&ipst->ips_numvifs_mutex);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
		    vifcp->vifc_vifi,
		    ntohl(vifcp->vifc_lcl_addr.s_addr),
		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
		    ntohl(vifcp->vifc_rmt_addr.s_addr),
		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
	}

	/* Fully initialized; make the vif visible to lookups. */
	vifp->v_marks = VIF_MARK_GOOD;
	mutex_exit(&vifp->v_lock);
	ipsq_exit(ipsq, B_TRUE, B_TRUE);
	return (0);
}
1066 
1067 
/*
 * Delete a vif from the vif table: drop the ipif reference taken in
 * del_vif(), free any packets still queued in the token bucket, purge
 * the encap cache, and zero the slot so it can be reused.
 * The vif must already be marked VIF_MARK_CONDEMNED by the caller.
 */
static void
del_vifp(struct vif *vifp)
{
	struct tbf	*t = vifp->v_tbf;
	mblk_t  *mp0;
	vifi_t  vifi;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
	ASSERT(t != NULL);

	/*
	 * release the ref we put in vif_del.
	 */
	ASSERT(vifp->v_ipif != NULL);
	ipif_refrele(vifp->v_ipif);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
	}

	/* Cancel any pending token-bucket drain timeout. */
	if (vifp->v_timeout_id != 0) {
		(void) untimeout(vifp->v_timeout_id);
		vifp->v_timeout_id = 0;
	}

	/*
	 * Free packets queued at the interface.
	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
	 */
	mutex_enter(&t->tbf_lock);
	while (t->tbf_q != NULL) {
		mp0 = t->tbf_q;
		t->tbf_q = t->tbf_q->b_next;
		mp0->b_prev = mp0->b_next = NULL;
		freemsg(mp0);
	}
	mutex_exit(&t->tbf_lock);

	/*
	 * Always clear cache when vifs change.
	 * No need to get last_encap_lock since we are running as a writer.
	 */
	mutex_enter(&ipst->ips_last_encap_lock);
	if (vifp == ipst->ips_last_encap_vif) {
		ipst->ips_last_encap_vif = NULL;
		ipst->ips_last_encap_src = 0;
	}
	mutex_exit(&ipst->ips_last_encap_lock);

	mutex_destroy(&t->tbf_lock);

	/* Zero the token-bucket state so the slot is clean for reuse. */
	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));

	/* Adjust numvifs down */
	mutex_enter(&ipst->ips_numvifs_mutex);
	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
			break;
	ipst->ips_numvifs = vifi;
	mutex_exit(&ipst->ips_numvifs_mutex);

	bzero(vifp, sizeof (*vifp));
}
1135 
1136 static int
1137 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
1138 {
1139 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1140 	ipsq_t  	*ipsq;
1141 
1142 	if (*vifip >= ipst->ips_numvifs)
1143 		return (EINVAL);
1144 
1145 
1146 	mutex_enter(&vifp->v_lock);
1147 	/*
1148 	 * Not initialized
1149 	 * Here we are not looking at the vif that is being initialized
1150 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1151 	 */
1152 	if (vifp->v_lcl_addr.s_addr == 0 ||
1153 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1154 		mutex_exit(&vifp->v_lock);
1155 		return (EADDRNOTAVAIL);
1156 	}
1157 
1158 	/*
1159 	 * This is an optimization, if first_mp == NULL
1160 	 * than we are being called from reset_mrt_vif_ipif()
1161 	 * so we already have exclusive access to the ipsq.
1162 	 * the ASSERT below is a check for this condition.
1163 	 */
1164 	if (first_mp != NULL &&
1165 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1166 		ASSERT(connp != NULL);
1167 		/*
1168 		 * We have to be exclusive as we have to call ip_delmulti()
1169 		 * This is the best position to try to be exclusive in case
1170 		 * we have to wait.
1171 		 */
1172 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1173 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1174 		if ((ipsq) == NULL) {
1175 			mutex_exit(&vifp->v_lock);
1176 			return (EINPROGRESS);
1177 		}
1178 		/* recheck after being exclusive */
1179 		if (vifp->v_lcl_addr.s_addr == 0 ||
1180 		    !vifp->v_marks & VIF_MARK_GOOD) {
1181 			/*
1182 			 * someone beat us.
1183 			 */
1184 			mutex_exit(&vifp->v_lock);
1185 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1186 			return (EADDRNOTAVAIL);
1187 		}
1188 	}
1189 
1190 
1191 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1192 
1193 
1194 	/*
1195 	 * add a refhold so that ipif does not go away while
1196 	 * there are still users, this will be released in del_vifp
1197 	 * when we free the vif.
1198 	 */
1199 	ipif_refhold(vifp->v_ipif);
1200 
1201 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1202 	vifp->v_marks &= ~VIF_MARK_GOOD;
1203 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1204 
1205 	/* Phyint only */
1206 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1207 		ipif_t *ipif = vifp->v_ipif;
1208 		ASSERT(ipif != NULL);
1209 		/*
1210 		 * should be OK to drop the lock as we
1211 		 * have marked this as CONDEMNED.
1212 		 */
1213 		mutex_exit(&(vifp)->v_lock);
1214 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1215 		if (first_mp != NULL)
1216 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1217 		mutex_enter(&(vifp)->v_lock);
1218 	}
1219 
1220 	/*
1221 	 * decreases the refcnt added in add_vif.
1222 	 */
1223 	VIF_REFRELE_LOCKED(vifp);
1224 	return (0);
1225 }
1226 
1227 /*
1228  * Add an mfc entry.
1229  */
1230 static int
1231 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1232 {
1233 	struct mfc *rt;
1234 	struct rtdetq *rte;
1235 	ushort_t nstl;
1236 	int i;
1237 	struct mfcb *mfcbp;
1238 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1239 
1240 	/*
1241 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1242 	 * did not have a real route for pkt.
1243 	 * We want this pkt without rt installed in the mfctable to prevent
1244 	 * multiiple tries, so go ahead and put it in mfctable, it will
1245 	 * be discarded later in ip_mdq() because the child is NULL.
1246 	 */
1247 
1248 	/* Error checking, out of bounds? */
1249 	if (mfccp->mfcc_parent > MAXVIFS) {
1250 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1251 		    (int)mfccp->mfcc_parent));
1252 		return (EINVAL);
1253 	}
1254 
1255 	if ((mfccp->mfcc_parent != NO_VIF) &&
1256 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1257 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1258 		    (int)mfccp->mfcc_parent));
1259 		return (EINVAL);
1260 	}
1261 
1262 	if (is_mrouter_off(ipst)) {
1263 		return (EINVAL);
1264 	}
1265 
1266 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1267 	    mfccp->mfcc_mcastgrp.s_addr)];
1268 	MFCB_REFHOLD(mfcbp);
1269 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1270 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1271 
1272 	/* If an entry already exists, just update the fields */
1273 	if (rt) {
1274 		if (ipst->ips_ip_mrtdebug > 1) {
1275 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1276 			    "add_mfc: update o %x grp %x parent %x",
1277 			    ntohl(mfccp->mfcc_origin.s_addr),
1278 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1279 			    mfccp->mfcc_parent);
1280 		}
1281 		mutex_enter(&rt->mfc_mutex);
1282 		rt->mfc_parent = mfccp->mfcc_parent;
1283 
1284 		mutex_enter(&ipst->ips_numvifs_mutex);
1285 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
1286 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1287 		mutex_exit(&ipst->ips_numvifs_mutex);
1288 		mutex_exit(&rt->mfc_mutex);
1289 
1290 		MFCB_REFRELE(mfcbp);
1291 		return (0);
1292 	}
1293 
1294 	/*
1295 	 * Find the entry for which the upcall was made and update.
1296 	 */
1297 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1298 		mutex_enter(&rt->mfc_mutex);
1299 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1300 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1301 		    (rt->mfc_rte != NULL) &&
1302 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1303 			if (nstl++ != 0)
1304 				cmn_err(CE_WARN,
1305 				    "add_mfc: %s o %x g %x p %x",
1306 				    "multiple kernel entries",
1307 				    ntohl(mfccp->mfcc_origin.s_addr),
1308 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1309 				    mfccp->mfcc_parent);
1310 
1311 			if (ipst->ips_ip_mrtdebug > 1) {
1312 				(void) mi_strlog(mrouter->conn_rq, 1,
1313 				    SL_TRACE,
1314 				    "add_mfc: o %x g %x p %x",
1315 				    ntohl(mfccp->mfcc_origin.s_addr),
1316 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1317 				    mfccp->mfcc_parent);
1318 			}
1319 			fill_route(rt, mfccp, ipst);
1320 
1321 			/*
1322 			 * Prevent cleanup of cache entry.
1323 			 * Timer starts in ip_mforward.
1324 			 */
1325 			if (rt->mfc_timeout_id != 0) {
1326 				timeout_id_t id;
1327 				id = rt->mfc_timeout_id;
1328 				/*
1329 				 * setting id to zero will avoid this
1330 				 * entry from being cleaned up in
1331 				 * expire_up_calls().
1332 				 */
1333 				rt->mfc_timeout_id = 0;
1334 				/*
1335 				 * dropping the lock is fine as we
1336 				 * have a refhold on the bucket.
1337 				 * so mfc cannot be freed.
1338 				 * The timeout can fire but it will see
1339 				 * that mfc_timeout_id == 0 and not cleanup.
1340 				 */
1341 				mutex_exit(&rt->mfc_mutex);
1342 				(void) untimeout(id);
1343 				mutex_enter(&rt->mfc_mutex);
1344 			}
1345 
1346 			/*
1347 			 * Send all pkts that are queued waiting for the upcall.
1348 			 * ip_mdq param tun set to 0 -
1349 			 * the return value of ip_mdq() isn't used here,
1350 			 * so value we send doesn't matter.
1351 			 */
1352 			while (rt->mfc_rte != NULL) {
1353 				rte = rt->mfc_rte;
1354 				rt->mfc_rte = rte->rte_next;
1355 				mutex_exit(&rt->mfc_mutex);
1356 				(void) ip_mdq(rte->mp, (ipha_t *)
1357 				    rte->mp->b_rptr, rte->ill, 0, rt);
1358 				freemsg(rte->mp);
1359 				mi_free((char *)rte);
1360 				mutex_enter(&rt->mfc_mutex);
1361 			}
1362 		}
1363 		mutex_exit(&rt->mfc_mutex);
1364 	}
1365 
1366 
1367 	/*
1368 	 * It is possible that an entry is being inserted without an upcall
1369 	 */
1370 	if (nstl == 0) {
1371 		mutex_enter(&(mfcbp->mfcb_lock));
1372 		if (ipst->ips_ip_mrtdebug > 1) {
1373 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1374 			    "add_mfc: no upcall o %x g %x p %x",
1375 			    ntohl(mfccp->mfcc_origin.s_addr),
1376 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1377 			    mfccp->mfcc_parent);
1378 		}
1379 		if (is_mrouter_off(ipst)) {
1380 			mutex_exit(&mfcbp->mfcb_lock);
1381 			MFCB_REFRELE(mfcbp);
1382 			return (EINVAL);
1383 		}
1384 
1385 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1386 
1387 			mutex_enter(&rt->mfc_mutex);
1388 			if ((rt->mfc_origin.s_addr ==
1389 			    mfccp->mfcc_origin.s_addr) &&
1390 			    (rt->mfc_mcastgrp.s_addr ==
1391 			    mfccp->mfcc_mcastgrp.s_addr) &&
1392 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1393 				fill_route(rt, mfccp, ipst);
1394 				mutex_exit(&rt->mfc_mutex);
1395 				break;
1396 			}
1397 			mutex_exit(&rt->mfc_mutex);
1398 		}
1399 
1400 		/* No upcall, so make a new entry into mfctable */
1401 		if (rt == NULL) {
1402 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1403 			if (rt == NULL) {
1404 				ip1dbg(("add_mfc: out of memory\n"));
1405 				mutex_exit(&mfcbp->mfcb_lock);
1406 				MFCB_REFRELE(mfcbp);
1407 				return (ENOBUFS);
1408 			}
1409 
1410 			/* Insert new entry at head of hash chain */
1411 			mutex_enter(&rt->mfc_mutex);
1412 			fill_route(rt, mfccp, ipst);
1413 
1414 			/* Link into table */
1415 			rt->mfc_next   = mfcbp->mfcb_mfc;
1416 			mfcbp->mfcb_mfc = rt;
1417 			mutex_exit(&rt->mfc_mutex);
1418 		}
1419 		mutex_exit(&mfcbp->mfcb_lock);
1420 	}
1421 
1422 	MFCB_REFRELE(mfcbp);
1423 	return (0);
1424 }
1425 
1426 /*
1427  * Fills in mfc structure from mrouted mfcctl.
1428  */
1429 static void
1430 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1431 {
1432 	int i;
1433 
1434 	rt->mfc_origin		= mfccp->mfcc_origin;
1435 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1436 	rt->mfc_parent		= mfccp->mfcc_parent;
1437 	mutex_enter(&ipst->ips_numvifs_mutex);
1438 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1439 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1440 	}
1441 	mutex_exit(&ipst->ips_numvifs_mutex);
1442 	/* Initialize pkt counters per src-grp */
1443 	rt->mfc_pkt_cnt	= 0;
1444 	rt->mfc_byte_cnt	= 0;
1445 	rt->mfc_wrong_if	= 0;
1446 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1447 
1448 }
1449 
1450 static void
1451 free_queue(struct mfc *mfcp)
1452 {
1453 	struct rtdetq *rte0;
1454 
1455 	/*
1456 	 * Drop all queued upcall packets.
1457 	 * Free the mbuf with the pkt.
1458 	 */
1459 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1460 		mfcp->mfc_rte = rte0->rte_next;
1461 		freemsg(rte0->mp);
1462 		mi_free((char *)rte0);
1463 	}
1464 }
1465 /*
1466  * go thorugh the hash bucket and free all the entries marked condemned.
1467  */
1468 void
1469 release_mfc(struct mfcb *mfcbp)
1470 {
1471 	struct mfc *current_mfcp;
1472 	struct mfc *prev_mfcp;
1473 
1474 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1475 
1476 	while (current_mfcp != NULL) {
1477 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1478 			if (current_mfcp == mfcbp->mfcb_mfc) {
1479 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1480 				free_queue(current_mfcp);
1481 				mi_free(current_mfcp);
1482 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1483 				continue;
1484 			}
1485 			ASSERT(prev_mfcp != NULL);
1486 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1487 			free_queue(current_mfcp);
1488 			mi_free(current_mfcp);
1489 			current_mfcp = NULL;
1490 		} else {
1491 			prev_mfcp = current_mfcp;
1492 		}
1493 
1494 		current_mfcp = prev_mfcp->mfc_next;
1495 
1496 	}
1497 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1498 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1499 }
1500 
1501 /*
1502  * Delete an mfc entry.
1503  */
1504 static int
1505 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1506 {
1507 	struct in_addr	origin;
1508 	struct in_addr	mcastgrp;
1509 	struct mfc 	*rt;
1510 	uint_t		hash;
1511 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1512 
1513 	origin = mfccp->mfcc_origin;
1514 	mcastgrp = mfccp->mfcc_mcastgrp;
1515 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1516 
1517 	if (ipst->ips_ip_mrtdebug > 1) {
1518 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1519 		    "del_mfc: o %x g %x",
1520 		    ntohl(origin.s_addr),
1521 		    ntohl(mcastgrp.s_addr));
1522 	}
1523 
1524 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1525 
1526 	/* Find mfc in mfctable, finds only entries without upcalls */
1527 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1528 		mutex_enter(&rt->mfc_mutex);
1529 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1530 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1531 		    rt->mfc_rte == NULL &&
1532 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1533 			break;
1534 		mutex_exit(&rt->mfc_mutex);
1535 	}
1536 
1537 	/*
1538 	 * Return if there was an upcall (mfc_rte != NULL,
1539 	 * or rt not in mfctable.
1540 	 */
1541 	if (rt == NULL) {
1542 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1543 		return (EADDRNOTAVAIL);
1544 	}
1545 
1546 
1547 	/*
1548 	 * no need to hold lock as we have a reference.
1549 	 */
1550 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1551 	/* error checking */
1552 	if (rt->mfc_timeout_id != 0) {
1553 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1554 		/*
1555 		 * Its ok to drop the lock,  the struct cannot be freed
1556 		 * since we have a ref on the hash bucket.
1557 		 */
1558 		rt->mfc_timeout_id = 0;
1559 		mutex_exit(&rt->mfc_mutex);
1560 		(void) untimeout(rt->mfc_timeout_id);
1561 		mutex_enter(&rt->mfc_mutex);
1562 	}
1563 
1564 	ASSERT(rt->mfc_rte == NULL);
1565 
1566 
1567 	/*
1568 	 * Delete the entry from the cache
1569 	 */
1570 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1571 	mutex_exit(&rt->mfc_mutex);
1572 
1573 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1574 
1575 	return (0);
1576 }
1577 
1578 #define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1579 
1580 /*
1581  * IP multicast forwarding function. This function assumes that the packet
1582  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1583  * pointed to by "ill", and the packet is to be relayed to other networks
1584  * that have members of the packet's destination IP multicast group.
1585  *
1586  * The packet is returned unscathed to the caller, unless it is
1587  * erroneous, in which case a -1 value tells the caller (IP)
1588  * to discard it.
1589  *
1590  * Unlike BSD, SunOS 5.x needs to return to IP info about
1591  * whether pkt came in thru a tunnel, so it can be discarded, unless
1592  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1593  * to be delivered.
1594  * Return values are 0 - pkt is okay and phyint
1595  *		    -1 - pkt is malformed and to be tossed
1596  *                   1 - pkt came in on tunnel
1597  */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/*
	 * mp->b_prev is overloaded by the callers: pim_input() stores
	 * PIM_REGISTER_MARKER, ip_mroute_decap() stores the encapsulating
	 * tunnel's source address.
	 */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ipst->ips_ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	ipst->ips_mrtstat->mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * Lock the mfctable against changes made by ip_mforward.
	 * Note that only add_mfc and del_mfc can remove entries and
	 * they run with exclusive access to IP. So we do not need to
	 * guard against the rt being deleted, so release lock after reading.
	 */

	if (is_mrouter_off(ipst))
		return (-1);

	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		ipst->ips_mrtstat->mrts_mfc_hits++;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill,
			    0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		ipst->ips_mrtstat->mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. */
		hash = MFCHASH(src, dst);
		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off(ipst)) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/* Is there an upcall waiting for this packet? */
		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				/* Note: we break with mfc_mutex still held */
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			ipst->ips_mrtstat->mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
			rte->ill =
			    ipst->ips_vifs[ipst->ips_reg_vif_num].
			    v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			ipst->ips_mrtstat->mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&ipst->ips_numvifs_mutex);
			for (i = 0; i < (int)ipst->ips_numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&ipst->ips_numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ipst->ips_ip_mrtdebug > 1) {
				(void) mi_strlog(mrouter->conn_rq, 1,
				    SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next)
				;
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&ipst->ips_numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
				mutex_exit(&ipst->ips_numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * NOTE(review): if no vif matches ill,
				 * im_vif is left unset and the ASSERT
				 * below fires on DEBUG kernels - confirm
				 * callers guarantee a matching vif.
				 */
				for (vifi = 0;
				    vifi < ipst->ips_numvifs;
				    vifi++) {
					if (ipst->ips_vifs[vifi].v_ipif == NULL)
						continue;
					if (ipst->ips_vifs[vifi].
					    v_ipif->ipif_ill == ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&ipst->ips_numvifs_mutex);
				ASSERT(vifi < ipst->ips_numvifs);
			}

			ipst->ips_mrtstat->mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	error_return:
		/* Unwind: mfc_mutex is not held on any path reaching here. */
		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1967 
1968 /*
1969  * Clean up the mfctable cache entry if upcall is not serviced.
1970  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1971  */
1972 static void
1973 expire_upcalls(void *arg)
1974 {
1975 	struct mfc *mfc_rt = arg;
1976 	uint_t hash;
1977 	struct mfc *prev_mfc, *mfc0;
1978 	ip_stack_t	*ipst;
1979 	conn_t		*mrouter;
1980 
1981 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1982 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1983 		return;
1984 	}
1985 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1986 	mrouter = ipst->ips_ip_g_mrouter;
1987 
1988 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1989 	if (ipst->ips_ip_mrtdebug > 1) {
1990 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1991 		    "expire_upcalls: hash %d s %x g %x",
1992 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1993 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1994 	}
1995 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1996 	mutex_enter(&mfc_rt->mfc_mutex);
1997 	/*
1998 	 * if timeout has been set to zero, than the
1999 	 * entry has been filled, no need to delete it.
2000 	 */
2001 	if (mfc_rt->mfc_timeout_id == 0)
2002 		goto done;
2003 	ipst->ips_mrtstat->mrts_cache_cleanups++;
2004 	mfc_rt->mfc_timeout_id = 0;
2005 
2006 	/* Determine entry to be cleaned up in cache table. */
2007 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
2008 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
2009 		if (mfc0 == mfc_rt)
2010 			break;
2011 
2012 	/* del_mfc takes care of gone mfcs */
2013 	ASSERT(prev_mfc != NULL);
2014 	ASSERT(mfc0 != NULL);
2015 
2016 	/*
2017 	 * Delete the entry from the cache
2018 	 */
2019 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
2020 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
2021 
2022 	/*
2023 	 * release_mfc will drop all queued upcall packets.
2024 	 * and will free the mbuf with the pkt, if, timing info.
2025 	 */
2026 done:
2027 	mutex_exit(&mfc_rt->mfc_mutex);
2028 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
2029 }
2030 
2031 /*
2032  * Packet forwarding routine once entry in the cache is made.
2033  */
static int
ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
    struct mfc *rt)
{
	vifi_t vifi;
	struct vif *vifp;
	ipaddr_t dst = ipha->ipha_dst;
	size_t  plen = msgdsize(mp);
	vifi_t num_of_vifs;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Returns 1 if the packet arrived over a tunnel (tunnel_src != 0),
	 * 0 on normal completion, -1 to tell the caller to drop the packet.
	 */
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	/* Macro to send packet on vif */
#define	MC_SEND(ipha, mp, vifp, dst) { \
	if ((vifp)->v_flags & VIFF_TUNNEL) \
		encap_send((ipha), (mp), (vifp), (dst)); \
	else if ((vifp)->v_flags & VIFF_REGISTER) \
		register_send((ipha), (mp), (vifp), (dst)); \
	else \
		phyint_send((ipha), (mp), (vifp), (dst)); \
}

	/* Incoming (parent) vif as recorded by mrouted via add_mfc. */
	vifi = rt->mfc_parent;

	/*
	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
	 * Mrouted had no route.
	 * We wanted the route installed in the mfctable to prevent multiple
	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
	 * NULL so we don't want to check the ill. Still needed as of Mrouted
	 * 3.6.
	 */
	if (vifi == NO_VIF) {
		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
		    ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
		}
		return (-1);	/* drop pkt */
	}

	/* Holds a reference on the vif; every return below must unlock. */
	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
		return (-1);
	/*
	 * The MFC entries are not cleaned up when an ipif goes
	 * away thus this code has to guard against an MFC referencing
	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
	 * sets the v_ipif to NULL when the ipif disappears.
	 */
	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);

	if (vifi >= ipst->ips_numvifs) {
		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
		    "%d ill %s viftable ill %s\n",
		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		return (-1);
	}
	/*
	 * Don't forward if it didn't arrive from the parent vif for its
	 * origin. But do match on the groups as we nominate only one
	 * ill in the group for receiving allmulti packets.
	 */
	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
	    (ill->ill_group == NULL ||
	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
		ill->ill_group)) ||
	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
		/* Came in the wrong interface */
		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
			"numvifs %d ill %s viftable ill %s\n",
			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "ip_mdq: arrived wrong if, vifi %d ill "
			    "%s viftable ill %s\n",
			    (int)vifi, ill->ill_name,
			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
		}
		ipst->ips_mrtstat->mrts_wrong_if++;
		rt->mfc_wrong_if++;

		/*
		 * If we are doing PIM assert processing and we are forwarding
		 * packets on this interface, and it is a broadcast medium
		 * interface (and not a tunnel), send a message to the routing.
		 *
		 * We use the first ipif on the list, since it's all we have.
		 * Chances are the ipif_flags are the same for ipifs on the ill.
		 */
		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
			mblk_t		*mp_copy;
			struct igmpmsg	*im;

			/* TODO could copy header and dup rest */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				ipst->ips_mrtstat->mrts_fwd_drop++;
				ip1dbg(("ip_mdq: out of memory "
				    "for mblk, mp_copy\n"));
				unlock_good_vif(&ipst->ips_vifs[vifi]);
				return (-1);
			}

			/*
			 * Overlay an IGMPMSG_WRONGVIF upcall on the copied
			 * IP header (struct igmpmsg is an IP header with
			 * renamed fields) and hand it to mrouted.
			 */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype = IGMPMSG_WRONGVIF;
			im->im_mbz = 0;
			im->im_vif = (ushort_t)vifi;
			/* Pass to RAWIP */
			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
		}
		unlock_good_vif(&ipst->ips_vifs[vifi]);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}
	/*
	 * If I sourced this packet, it counts as output, else it was input.
	 */
	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
		ipst->ips_vifs[vifi].v_pkt_out++;
		ipst->ips_vifs[vifi].v_bytes_out += plen;
	} else {
		ipst->ips_vifs[vifi].v_pkt_in++;
		ipst->ips_vifs[vifi].v_bytes_in += plen;
	}
	mutex_enter(&rt->mfc_mutex);
	rt->mfc_pkt_cnt++;
	rt->mfc_byte_cnt += plen;
	mutex_exit(&rt->mfc_mutex);
	unlock_good_vif(&ipst->ips_vifs[vifi]);
	/*
	 * For each vif, decide if a copy of the packet should be forwarded.
	 * Forward if:
	 *		- the vif threshold ttl is non-zero AND
	 *		- the pkt ttl exceeds the vif's threshold
	 * A non-zero mfc_ttl indicates that the vif is part of
	 * the output set for the mfc entry.
	 */
	mutex_enter(&ipst->ips_numvifs_mutex);
	num_of_vifs = ipst->ips_numvifs;
	mutex_exit(&ipst->ips_numvifs_mutex);
	for (vifp = ipst->ips_vifs, vifi = 0;
	    vifi < num_of_vifs;
	    vifp++, vifi++) {
		if (!lock_good_vif(vifp))
			continue;
		if ((rt->mfc_ttls[vifi] > 0) &&
		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
			/*
			 * lock_good_vif should not have succedded if
			 * v_ipif is null.
			 */
			ASSERT(vifp->v_ipif != NULL);
			vifp->v_pkt_out++;
			vifp->v_bytes_out += plen;
			/* MC_SEND copies mp; we still own mp afterwards. */
			MC_SEND(ipha, mp, vifp, dst);
			ipst->ips_mrtstat->mrts_fwd_out++;
		}
		unlock_good_vif(vifp);
	}
	if (tunnel_src != 0)
		return (1);
	else
		return (0);
}
2213 
2214 /*
2215  * Send the packet on physical interface.
2216  * Caller assumes can continue to use mp on return.
2217  */
2218 /* ARGSUSED */
2219 static void
2220 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2221 {
2222 	mblk_t 	*mp_copy;
2223 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2224 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2225 
2226 	/* Make a new reference to the packet */
2227 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2228 	if (mp_copy == NULL) {
2229 		ipst->ips_mrtstat->mrts_fwd_drop++;
2230 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2231 		return;
2232 	}
2233 	if (vifp->v_rate_limit <= 0)
2234 		tbf_send_packet(vifp, mp_copy);
2235 	else  {
2236 		if (ipst->ips_ip_mrtdebug > 1) {
2237 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2238 			    "phyint_send: tbf_contr rate %d "
2239 			    "vifp 0x%p mp 0x%p dst 0x%x",
2240 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2241 		}
2242 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2243 	}
2244 }
2245 
2246 /*
2247  * Send the whole packet for REGISTER encapsulation to PIM daemon
2248  * Caller assumes it can continue to use mp on return.
2249  */
2250 /* ARGSUSED */
static void
register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	struct igmpmsg	*im;
	mblk_t		*mp_copy;
	ipha_t		*ipha_copy;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register_send: src %x, dst %x\n",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
	}

	/*
	 * Copy the old packet & pullup its IP header into the new mblk_t so we
	 * can modify it.  Try to fill the new mblk_t since if we don't the
	 * ethernet driver will.
	 * (The extra sizeof (ipha_t) of headroom is allocated but the
	 * write pointer below only advances past the igmpmsg header.)
	 */
	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
	if (mp_copy == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: allocb failure.");
		}
		return;
	}

	/*
	 * Bump write pointer to account for igmpmsg being added.
	 */
	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);

	/*
	 * Chain packet to new mblk_t.
	 */
	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: copymsg failure.");
		}
		freeb(mp_copy);
		return;
	}

	/*
	 * icmp_input() asserts that IP version field is set to an
	 * appropriate version. Hence, the struct igmpmsg that this really
	 * becomes, needs to have the correct IP version field.
	 */
	ipha_copy = (ipha_t *)mp_copy->b_rptr;
	*ipha_copy = multicast_encap_iphdr;

	/*
	 * The kernel uses the struct igmpmsg header to encode the messages to
	 * the multicast routing daemon. Fill in the fields in the header
	 * starting with the message type which is IGMPMSG_WHOLEPKT
	 */
	im = (struct igmpmsg *)mp_copy->b_rptr;
	im->im_msgtype = IGMPMSG_WHOLEPKT;
	im->im_src.s_addr = ipha->ipha_src;
	im->im_dst.s_addr = ipha->ipha_dst;

	/*
	 * Must Be Zero. This is because the struct igmpmsg is really an IP
	 * header with renamed fields and the multicast routing daemon uses
	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
	 */
	im->im_mbz = 0;

	/* Flow-control: drop the upcall rather than block if the daemon
	 * queue is full. */
	++ipst->ips_mrtstat->mrts_upcalls;
	if (!canputnext(mrouter->conn_rq)) {
		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
		if (ipst->ips_ip_mrtdebug > 3) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "register_send: register upcall failure.");
		}
		freemsg(mp_copy);
	} else {
		/* Pass to RAWIP */
		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
	}
}
2337 
2338 /*
2339  * pim_validate_cksum handles verification of the checksum in the
2340  * pim header.  For PIM Register packets, the checksum is calculated
2341  * across the PIM header only.  For all other packets, the checksum
2342  * is for the PIM header and remainder of the packet.
2343  *
2344  * returns: B_TRUE, if checksum is okay.
2345  *          B_FALSE, if checksum is not valid.
2346  */
2347 static boolean_t
2348 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2349 {
2350 	mblk_t *mp_dup;
2351 
2352 	if ((mp_dup = dupmsg(mp)) == NULL)
2353 		return (B_FALSE);
2354 
2355 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2356 	if (pimp->pim_type == PIM_REGISTER)
2357 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2358 	if (IP_CSUM(mp_dup, 0, 0)) {
2359 		freemsg(mp_dup);
2360 		return (B_FALSE);
2361 	}
2362 	freemsg(mp_dup);
2363 	return (B_TRUE);
2364 }
2365 
2366 /*
2367  * int
2368  * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
2369  *	IP Protocol 103. Register messages are decapsulated and sent
2370  *	onto multicast forwarding.
2371  */
int
pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ipha_t		*eip, *ip;
	int		iplen, pimlen, iphlen;
	struct pim	*pimp;	/* pointer to a pim struct */
	uint32_t	*reghdr;
	ip_stack_t	*ipst = ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Returns 0 when the packet should continue up to any raw PIM
	 * listener, -1 when it has been consumed (freed) here.
	 */

	/*
	 * Pullup the msg for PIM protocol processing.
	 * (-1 linearizes the whole message into one contiguous block.)
	 */
	if (pullupmsg(mp, -1) == 0) {
		++ipst->ips_mrtstat->mrts_pim_nomemory;
		freemsg(mp);
		return (-1);
	}

	ip = (ipha_t *)mp->b_rptr;
	iplen = ip->ipha_length;
	iphlen = IPH_HDR_LENGTH(ip);
	/* PIM payload length = total IP length minus the IP header. */
	pimlen = ntohs(iplen) - iphlen;

	/*
	 * Validate lengths
	 */
	if (pimlen < PIM_MINLEN) {
		++ipst->ips_mrtstat->mrts_pim_malformed;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: length not at least minlen");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Point to the PIM header.
	 */
	pimp = (struct pim *)((caddr_t)ip + iphlen);

	/*
	 * Check the version number.
	 */
	if (pimp->pim_vers != PIM_VERSION) {
		++ipst->ips_mrtstat->mrts_pim_badversion;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: unknown version of PIM");
		}
		freemsg(mp);
		return (-1);
	}

	/*
	 * Validate the checksum
	 */
	if (!pim_validate_cksum(mp, ip, pimp)) {
		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: invalid checksum");
		}
		freemsg(mp);
		return (-1);
	}

	/* Only Register messages need decapsulation; pass others up. */
	if (pimp->pim_type != PIM_REGISTER)
		return (0);

	/* Register layout: PIM header, 32-bit register header, inner IP. */
	reghdr = (uint32_t *)(pimp + 1);
	eip = (ipha_t *)(reghdr + 1);

	/*
	 * check if the inner packet is destined to mcast group
	 */
	if (!CLASSD(eip->ipha_dst)) {
		++ipst->ips_mrtstat->mrts_pim_badregisters;
		if (ipst->ips_ip_mrtdebug > 1) {
			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
			    "pim_input: Inner pkt not mcast .. !");
		}
		freemsg(mp);
		return (-1);
	}
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "register from %x, to %x, len %d",
		    ntohl(eip->ipha_src),
		    ntohl(eip->ipha_dst),
		    ntohs(eip->ipha_length));
	}
	/*
	 * If the null register bit is not set, decapsulate
	 * the packet before forwarding it.
	 */
	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
		mblk_t *mp_copy;

		/* Copy the message */
		if ((mp_copy = copymsg(mp)) == NULL) {
			++ipst->ips_mrtstat->mrts_pim_nomemory;
			freemsg(mp);
			return (-1);
		}

		/*
		 * Decapsulate the packet and give it to
		 * register_mforward.  (register_mforward consumes
		 * mp_copy; mp is still ours.)
		 */
		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
		    sizeof (*reghdr);
		if (register_mforward(q, mp_copy, ill) != 0) {
			freemsg(mp);
			return (-1);
		}
	}

	/*
	 * Pass all valid PIM packets up to any process(es) listening on a raw
	 * PIM socket. For Solaris it is done right after pim_input() is
	 * called.
	 */
	return (0);
}
2498 
2499 /*
2500  * PIM sparse mode hook.  Called by pim_input after decapsulating
2501  * the packet. Loop back the packet, as if we have received it.
2502  * In pim_input() we have to check if the destination is a multicast address.
2503  */
2504 /* ARGSUSED */
2505 static int
2506 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
2507 {
2508 	ip_stack_t	*ipst = ill->ill_ipst;
2509 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2510 
2511 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2512 
2513 	if (ipst->ips_ip_mrtdebug > 3) {
2514 		ipha_t *ipha;
2515 
2516 		ipha = (ipha_t *)mp->b_rptr;
2517 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2518 		    "register_mforward: src %x, dst %x\n",
2519 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2520 	}
2521 	/*
2522 	 * Need to pass in to ip_mforward() the information that the
2523 	 * packet has arrived on the register_vif. We use the solution that
2524 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2525 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2526 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2527 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2528 	 * or passes it to ip_mforward(). We use
2529 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2530 	 * register vif. If in the future we have more than one register vifs,
2531 	 * then this will need re-examination.
2532 	 */
2533 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2534 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2535 	ip_rput(q, mp);
2536 	return (0);
2537 }
2538 
2539 /*
2540  * Send an encapsulated packet.
2541  * Caller assumes can continue to use mp when routine returns.
2542  */
2543 /* ARGSUSED */
static void
encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
{
	mblk_t 	*mp_copy;
	ipha_t 	*ipha_copy;
	size_t	len;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "encap_send: vif %ld enter",
		    (ptrdiff_t)(vifp - ipst->ips_vifs));
	}
	/* Length of the packet being encapsulated, in host byte order. */
	len = ntohs(ipha->ipha_length);

	/*
	 * Copy the old packet & pullup it's IP header into the
	 * new mbuf so we can modify it.  Try to fill the new
	 * mbuf since if we don't the ethernet driver will.
	 * The 32 bytes of headroom are presumably reserved for
	 * lower-layer headers -- NOTE(review): confirm.
	 */
	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
	if (mp_copy == NULL)
		return;
	mp_copy->b_rptr += 32;
	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
		freeb(mp_copy);
		return;
	}

	/*
	 * Fill in the encapsulating IP header.
	 * Remote tunnel dst in rmt_addr, from add_vif().
	 */
	ipha_copy = (ipha_t *)mp_copy->b_rptr;
	*ipha_copy = multicast_encap_iphdr;
	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
	/* ident is filled in later by ip_wput (see tbf_send_packet). */
	ASSERT(ipha_copy->ipha_ident == 0);

	/* Turn the encapsulated IP header back into a valid one. */
	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
	/* Account for the hop through this router on the inner header. */
	ipha->ipha_ttl--;
	/* Checksum field must be zero while recomputing the checksum. */
	ipha->ipha_hdr_checksum = 0;
	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
	}
	if (vifp->v_rate_limit <= 0)
		tbf_send_packet(vifp, mp_copy);
	else
		/* ipha is from the original header */
		tbf_control(vifp, mp_copy, ipha);
}
2603 
2604 /*
2605  * De-encapsulate a packet and feed it back through IP input.
2606  * This routine is called whenever IP gets a packet with prototype
2607  * IPPROTO_ENCAP and a local destination address.
2608  */
2609 void
2610 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
2611 {
2612 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2613 	ipha_t		*ipha_encap;
2614 	int		hlen = IPH_HDR_LENGTH(ipha);
2615 	ipaddr_t	src;
2616 	struct vif	*vifp;
2617 	ip_stack_t	*ipst = ill->ill_ipst;
2618 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2619 
2620 	/*
2621 	 * Dump the packet if it's not to a multicast destination or if
2622 	 * we don't have an encapsulating tunnel with the source.
2623 	 * Note:  This code assumes that the remote site IP address
2624 	 * uniquely identifies the tunnel (i.e., that this site has
2625 	 * at most one tunnel with the remote site).
2626 	 */
2627 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2628 	if (!CLASSD(ipha_encap->ipha_dst)) {
2629 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2630 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2631 		freemsg(mp);
2632 		return;
2633 	}
2634 	src = (ipaddr_t)ipha->ipha_src;
2635 	mutex_enter(&ipst->ips_last_encap_lock);
2636 	if (src != ipst->ips_last_encap_src) {
2637 		struct vif *vife;
2638 
2639 		vifp = ipst->ips_vifs;
2640 		vife = vifp + ipst->ips_numvifs;
2641 		ipst->ips_last_encap_src = src;
2642 		ipst->ips_last_encap_vif = 0;
2643 		for (; vifp < vife; ++vifp) {
2644 			if (!lock_good_vif(vifp))
2645 				continue;
2646 			if (vifp->v_rmt_addr.s_addr == src) {
2647 				if (vifp->v_flags & VIFF_TUNNEL)
2648 					ipst->ips_last_encap_vif = vifp;
2649 				if (ipst->ips_ip_mrtdebug > 1) {
2650 					(void) mi_strlog(mrouter->conn_rq,
2651 					    1, SL_TRACE,
2652 					    "ip_mroute_decap: good tun "
2653 					    "vif %ld with %x",
2654 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2655 					    ntohl(src));
2656 				}
2657 				unlock_good_vif(vifp);
2658 				break;
2659 			}
2660 			unlock_good_vif(vifp);
2661 		}
2662 	}
2663 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2664 		mutex_exit(&ipst->ips_last_encap_lock);
2665 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2666 		freemsg(mp);
2667 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2668 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2669 		return;
2670 	}
2671 	mutex_exit(&ipst->ips_last_encap_lock);
2672 
2673 	/*
2674 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2675 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2676 	 * to pass this information. This is safe since the ip_rput either
2677 	 * frees the packet or passes it to ip_mforward.
2678 	 */
2679 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2680 	mp->b_rptr += hlen;
2681 	/* Feed back into ip_rput as an M_DATA. */
2682 	ip_rput(q, mp);
2683 }
2684 
2685 /*
2686  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2687  * (stream closed).  Called as writer.
2688  */
2689 void
2690 reset_mrt_vif_ipif(ipif_t *ipif)
2691 {
2692 	vifi_t vifi, tmp_vifi;
2693 	vifi_t num_of_vifs;
2694 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2695 
2696 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2697 
2698 	mutex_enter(&ipst->ips_numvifs_mutex);
2699 	num_of_vifs = ipst->ips_numvifs;
2700 	mutex_exit(&ipst->ips_numvifs_mutex);
2701 
2702 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2703 		tmp_vifi = vifi - 1;
2704 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2705 			(void) del_vif(&tmp_vifi, NULL, NULL, ipst);
2706 		}
2707 	}
2708 }
2709 
2710 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2711 void
2712 reset_mrt_ill(ill_t *ill)
2713 {
2714 	struct mfc		*rt;
2715 	struct rtdetq	*rte;
2716 	int			i;
2717 	ip_stack_t	*ipst = ill->ill_ipst;
2718 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2719 
2720 	for (i = 0; i < MFCTBLSIZ; i++) {
2721 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2722 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2723 			if (ipst->ips_ip_mrtdebug > 1) {
2724 				(void) mi_strlog(mrouter->conn_rq, 1,
2725 				    SL_TRACE,
2726 				    "reset_mrt_ill: mfctable [%d]", i);
2727 			}
2728 			while (rt != NULL) {
2729 				mutex_enter(&rt->mfc_mutex);
2730 				while ((rte = rt->mfc_rte) != NULL) {
2731 					if (rte->ill == ill) {
2732 						if (ipst->ips_ip_mrtdebug > 1) {
2733 						(void) mi_strlog(
2734 						    mrouter->conn_rq,
2735 						    1, SL_TRACE,
2736 						    "reset_mrt_ill: "
2737 						    "ill 0x%p", ill);
2738 						}
2739 						rt->mfc_rte = rte->rte_next;
2740 						freemsg(rte->mp);
2741 						mi_free((char *)rte);
2742 					}
2743 				}
2744 				mutex_exit(&rt->mfc_mutex);
2745 				rt = rt->mfc_next;
2746 			}
2747 		}
2748 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2749 	}
2750 }
2751 
2752 /*
2753  * Token bucket filter module.
2754  * The ipha is for mcastgrp destination for phyint and encap.
2755  */
static void
tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
{
	size_t 	p_len =  msgdsize(mp);
	struct tbf	*t    = vifp->v_tbf;
	timeout_id_t id = 0;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/* Drop if packet is too large */
	if (p_len > MAX_BKT_SIZE) {
		ipst->ips_mrtstat->mrts_pkt2large++;
		freemsg(mp);
		return;
	}
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
		    ntohl(ipha->ipha_dst));
	}

	mutex_enter(&t->tbf_lock);

	/* Credit tokens accumulated since the last update. */
	tbf_update_tokens(vifp);

	/*
	 * If there are enough tokens,
	 * and the queue is empty, send this packet out.
	 */
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
		    t->tbf_q_len);
	}
	/* No packets are queued */
	if (t->tbf_q_len == 0) {
		/* queue empty, send packet if enough tokens */
		if (p_len <= t->tbf_n_tok) {
			t->tbf_n_tok -= p_len;
			/* tbf_send_packet is called without tbf_lock held. */
			mutex_exit(&t->tbf_lock);
			tbf_send_packet(vifp, mp);
			return;
		} else {
			/* Queue packet and timeout till later */
			tbf_queue(vifp, mp);
			ASSERT(vifp->v_timeout_id == 0);
			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
			    TBF_REPROCESS);
		}
	} else if (t->tbf_q_len < t->tbf_max_q_len) {
		/* Finite queue length, so queue pkts and process queue */
		tbf_queue(vifp, mp);
		tbf_process_q(vifp);
	} else {
		/* Check that we have UDP header with IP header */
		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
		    sizeof (struct udphdr);

		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
			if (!pullupmsg(mp, hdr_length)) {
				freemsg(mp);
				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
				    "vif %ld src 0x%x dst 0x%x\n",
				    (ptrdiff_t)(vifp - ipst->ips_vifs),
				    ntohl(ipha->ipha_src),
				    ntohl(ipha->ipha_dst)));
				mutex_exit(&vifp->v_tbf->tbf_lock);
				return;
			} else
				/* Have to reassign ipha after pullupmsg */
				ipha = (ipha_t *)mp->b_rptr;
		}
		/*
		 * Queue length too much,
		 * try to selectively dq, or queue and process
		 */
		if (!tbf_dq_sel(vifp, ipha)) {
			/* Nothing lower priority to evict: drop this one. */
			ipst->ips_mrtstat->mrts_q_overflow++;
			freemsg(mp);
		} else {
			tbf_queue(vifp, mp);
			tbf_process_q(vifp);
		}
	}
	/*
	 * Queue drained: grab the pending timeout id under the lock
	 * and cancel it after dropping the lock (untimeout can block).
	 */
	if (t->tbf_q_len == 0) {
		id = vifp->v_timeout_id;
		vifp->v_timeout_id = 0;
	}
	mutex_exit(&vifp->v_tbf->tbf_lock);
	if (id != 0)
		(void) untimeout(id);
}
2850 
2851 /*
2852  * Adds a packet to the tbf queue at the interface.
2853  * The ipha is for mcastgrp destination for phyint and encap.
2854  */
2855 static void
2856 tbf_queue(struct vif *vifp, mblk_t *mp)
2857 {
2858 	struct tbf	*t = vifp->v_tbf;
2859 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2860 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2861 
2862 	if (ipst->ips_ip_mrtdebug > 1) {
2863 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2864 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2865 	}
2866 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2867 
2868 	if (t->tbf_t == NULL) {
2869 		/* Queue was empty */
2870 		t->tbf_q = mp;
2871 	} else {
2872 		/* Insert at tail */
2873 		t->tbf_t->b_next = mp;
2874 	}
2875 	/* set new tail pointer */
2876 	t->tbf_t = mp;
2877 
2878 	mp->b_next = mp->b_prev = NULL;
2879 
2880 	t->tbf_q_len++;
2881 }
2882 
2883 /*
2884  * Process the queue at the vif interface.
2885  * Drops the tbf_lock when sending packets.
2886  *
2887  * NOTE : The caller should quntimeout if the queue length is 0.
2888  */
static void
tbf_process_q(struct vif *vifp)
{
	mblk_t	*mp;
	struct tbf	*t = vifp->v_tbf;
	size_t	len;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "tbf_process_q 1: vif %ld qlen = %d",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
	}

	/*
	 * Loop through the queue at the interface and send
	 * as many packets as possible.
	 */
	ASSERT(MUTEX_HELD(&t->tbf_lock));

	while (t->tbf_q_len > 0) {
		/* Always look at the head of the queue (FIFO order). */
		mp = t->tbf_q;
		len = (size_t)msgdsize(mp); /* length of ip pkt */

		/* Determine if the packet can be sent */
		if (len <= t->tbf_n_tok) {
			/*
			 * If so, reduce no. of tokens, dequeue the packet,
			 * send the packet.
			 */
			t->tbf_n_tok -= len;

			t->tbf_q = mp->b_next;
			if (--t->tbf_q_len == 0) {
				/* Queue now empty: clear the tail pointer. */
				t->tbf_t = NULL;
			}
			mp->b_next = NULL;
			/* Exit mutex before sending packet, then re-enter */
			mutex_exit(&t->tbf_lock);
			tbf_send_packet(vifp, mp);
			mutex_enter(&t->tbf_lock);
		} else
			/* Head won't fit; stop (FIFO, no reordering). */
			break;
	}
}
2935 
2936 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2937 static void
2938 tbf_reprocess_q(void *arg)
2939 {
2940 	struct vif *vifp = arg;
2941 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2942 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2943 
2944 	mutex_enter(&vifp->v_tbf->tbf_lock);
2945 	vifp->v_timeout_id = 0;
2946 	tbf_update_tokens(vifp);
2947 
2948 	tbf_process_q(vifp);
2949 
2950 	if (vifp->v_tbf->tbf_q_len > 0) {
2951 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2952 		    TBF_REPROCESS);
2953 	}
2954 	mutex_exit(&vifp->v_tbf->tbf_lock);
2955 
2956 	if (ipst->ips_ip_mrtdebug > 1) {
2957 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2958 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2959 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
2960 	}
2961 }
2962 
2963 /*
2964  * Function that will selectively discard a member of the tbf queue,
2965  * based on the precedence value and the priority.
2966  *
2967  * NOTE : The caller should quntimeout if the queue length is 0.
2968  */
static int
tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
{
	uint_t		p;
	struct tbf		*t = vifp->v_tbf;
	mblk_t		**np;
	mblk_t		*last, *mp;
	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
	conn_t		*mrouter = ipst->ips_ip_g_mrouter;

	/*
	 * Returns 1 if a queued packet of lower priority than the
	 * incoming one (ipha) was found and freed, 0 otherwise.
	 */
	if (ipst->ips_ip_mrtdebug > 1) {
		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
		    "dq_sel: vif %ld dst 0x%x",
		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
	}

	ASSERT(MUTEX_HELD(&t->tbf_lock));
	p = priority(vifp, ipha);

	/*
	 * Walk the queue with an indirect pointer (np) so the match
	 * can be unlinked in place; last trails for tail-pointer fixup.
	 */
	np = &t->tbf_q;
	last = NULL;
	while ((mp = *np) != NULL) {
		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
			*np = mp->b_next;
			/* If removing the last packet, fix the tail pointer */
			if (mp == t->tbf_t)
				t->tbf_t = last;
			mp->b_prev = mp->b_next = NULL;
			freemsg(mp);
			/*
			 * It's impossible for the queue to be empty, but
			 * we check anyway.
			 */
			if (--t->tbf_q_len == 0) {
				t->tbf_t = NULL;
			}
			ipst->ips_mrtstat->mrts_drop_sel++;
			return (1);
		}
		np = &mp->b_next;
		last = mp;
	}
	return (0);
}
3013 
3014 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3015 static void
3016 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3017 {
3018 	ipif_t  *ipif;
3019 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3020 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3021 
3022 	/* If encap tunnel options */
3023 	if (vifp->v_flags & VIFF_TUNNEL)  {
3024 		if (ipst->ips_ip_mrtdebug > 1) {
3025 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3026 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
3027 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3028 		}
3029 
3030 		/*
3031 		 * Feed into ip_wput which will set the ident field and
3032 		 * checksum the encapsulating header.
3033 		 * BSD gets the cached route vifp->v_route from ip_output()
3034 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3035 		 */
3036 		put(vifp->v_ipif->ipif_wq, mp);
3037 		return;
3038 
3039 		/* phyint */
3040 	} else {
3041 		/* Need to loop back to members on the outgoing interface. */
3042 		ipha_t  *ipha;
3043 		ipaddr_t    dst;
3044 		ipha  = (ipha_t *)mp->b_rptr;
3045 		dst  = ipha->ipha_dst;
3046 		ipif = vifp->v_ipif;
3047 
3048 		mutex_enter(&ipif->ipif_ill->ill_lock);
3049 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
3050 			/*
3051 			 * The packet is not yet reassembled, thus we need to
3052 			 * pass it to ip_rput_local for checksum verification
3053 			 * and reassembly (and fanout the user stream).
3054 			 */
3055 			mblk_t 	*mp_loop;
3056 			ire_t	*ire;
3057 
3058 			mutex_exit(&ipif->ipif_ill->ill_lock);
3059 			if (ipst->ips_ip_mrtdebug > 1) {
3060 				(void) mi_strlog(mrouter->conn_rq, 1,
3061 				    SL_TRACE,
3062 				    "tbf_send_pkt: loopback vif %ld",
3063 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3064 			}
3065 			mp_loop = copymsg(mp);
3066 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
3067 			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3068 
3069 			if (mp_loop != NULL && ire != NULL) {
3070 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
3071 				    ((ipha_t *)mp_loop->b_rptr),
3072 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
3073 			} else {
3074 				/* Either copymsg failed or no ire */
3075 				(void) mi_strlog(mrouter->conn_rq, 1,
3076 				    SL_TRACE,
3077 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
3078 				    "vif %ld\n", mp_loop, ire,
3079 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3080 			}
3081 			if (ire != NULL)
3082 				ire_refrele(ire);
3083 		} else {
3084 			mutex_exit(&ipif->ipif_ill->ill_lock);
3085 		}
3086 		if (ipst->ips_ip_mrtdebug > 1) {
3087 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3088 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3089 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3090 		}
3091 		ip_rput_forward_multicast(dst, mp, ipif);
3092 	}
3093 }
3094 
3095 /*
3096  * Determine the current time and then the elapsed time (between the last time
3097  * and time now).  Update the no. of tokens in the bucket.
3098  */
3099 static void
3100 tbf_update_tokens(struct vif *vifp)
3101 {
3102 	timespec_t	tp;
3103 	hrtime_t	tm;
3104 	struct tbf	*t = vifp->v_tbf;
3105 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3106 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3107 
3108 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3109 
3110 	/* Time in secs and nsecs, rate limit in kbits/sec */
3111 	gethrestime(&tp);
3112 
3113 	/*LINTED*/
3114 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3115 
3116 	/*
3117 	 * This formula is actually
3118 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3119 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3120 	 *
3121 	 * The (1000/1024) was introduced in add_vif to optimize
3122 	 * this divide into a shift.
3123 	 */
3124 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3125 	t->tbf_last_pkt_t = tp;
3126 
3127 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3128 		t->tbf_n_tok = MAX_BKT_SIZE;
3129 	if (ipst->ips_ip_mrtdebug > 1) {
3130 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3131 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3132 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3133 	}
3134 }
3135 
3136 /*
3137  * Priority currently is based on port nos.
3138  * Different forwarding mechanisms have different ways
3139  * of obtaining the port no. Hence, the vif must be
3140  * given along with the packet itself.
3141  *
3142  */
3143 static int
3144 priority(struct vif *vifp, ipha_t *ipha)
3145 {
3146 	int prio;
3147 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3148 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3149 
3150 	/* Temporary hack; may add general packet classifier some day */
3151 
3152 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3153 
3154 	/*
3155 	 * The UDP port space is divided up into four priority ranges:
3156 	 * [0, 16384)	: unclassified - lowest priority
3157 	 * [16384, 32768)	: audio - highest priority
3158 	 * [32768, 49152)	: whiteboard - medium priority
3159 	 * [49152, 65536)	: video - low priority
3160 	 */
3161 
3162 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3163 		struct udphdr *udp =
3164 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3165 		switch (ntohs(udp->uh_dport) & 0xc000) {
3166 		case 0x4000:
3167 			prio = 70;
3168 			break;
3169 		case 0x8000:
3170 			prio = 60;
3171 			break;
3172 		case 0xc000:
3173 			prio = 55;
3174 			break;
3175 		default:
3176 			prio = 50;
3177 			break;
3178 		}
3179 		if (ipst->ips_ip_mrtdebug > 1) {
3180 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3181 			    "priority: port %x prio %d\n",
3182 			    ntohs(udp->uh_dport), prio);
3183 		}
3184 	} else
3185 		prio = 50;  /* default priority */
3186 	return (prio);
3187 }
3188 
3189 /*
3190  * End of token bucket filter modifications
3191  */
3192 
3193 
3194 
3195 /*
3196  * Produces data for netstat -M.
3197  */
3198 int
3199 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3200 {
3201 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3202 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3203 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3204 		sizeof (struct mrtstat))) {
3205 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3206 		    (size_t)sizeof (struct mrtstat)));
3207 		return (0);
3208 	}
3209 	return (1);
3210 }
3211 
3212 /*
3213  * Sends info for SNMP's MIB.
3214  */
3215 int
3216 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3217 {
3218 	struct vifctl 	vi;
3219 	vifi_t		vifi;
3220 
3221 	mutex_enter(&ipst->ips_numvifs_mutex);
3222 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3223 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3224 			continue;
3225 		/*
3226 		 * No locks here, an approximation is fine.
3227 		 */
3228 		vi.vifc_vifi = vifi;
3229 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3230 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3231 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3232 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3233 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3234 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3235 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3236 
3237 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3238 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3239 			    (size_t)sizeof (vi)));
3240 			return (0);
3241 		}
3242 	}
3243 	mutex_exit(&ipst->ips_numvifs_mutex);
3244 	return (1);
3245 }
3246 
3247 /*
3248  * Called by ip_snmp_get to send up multicast routing table.
3249  */
3250 int
3251 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3252 {
3253 	int			i, j;
3254 	struct mfc		*rt;
3255 	struct mfcctl	mfcc;
3256 
3257 	/*
3258 	 * Make sure multicast has not been turned off.
3259 	 */
3260 	if (is_mrouter_off(ipst))
3261 		return (1);
3262 
3263 	/* Loop over all hash buckets and their chains */
3264 	for (i = 0; i < MFCTBLSIZ; i++) {
3265 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3266 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3267 			mutex_enter(&rt->mfc_mutex);
3268 			if (rt->mfc_rte != NULL ||
3269 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3270 				mutex_exit(&rt->mfc_mutex);
3271 				continue;
3272 			}
3273 			mfcc.mfcc_origin = rt->mfc_origin;
3274 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3275 			mfcc.mfcc_parent = rt->mfc_parent;
3276 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3277 			mutex_enter(&ipst->ips_numvifs_mutex);
3278 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3279 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3280 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3281 				mfcc.mfcc_ttls[j] = 0;
3282 			mutex_exit(&ipst->ips_numvifs_mutex);
3283 
3284 			mutex_exit(&rt->mfc_mutex);
3285 			if (!snmp_append_data(mp, (char *)&mfcc,
3286 			    sizeof (mfcc))) {
3287 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3288 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3289 				    (size_t)sizeof (mfcc)));
3290 				return (0);
3291 			}
3292 		}
3293 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3294 	}
3295 	return (1);
3296 }
3297