xref: /illumos-gate/usr/src/uts/common/inet/ip/ip_mroute.c (revision 328c7d1fa5cb168d2356ae60c27a09bd3f6f35e8)
1 /*
2  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
3  * Use is subject to license terms.
4  */
5 /*
6  * CDDL HEADER START
7  *
8  * The contents of this file are subject to the terms of the
9  * Common Development and Distribution License (the "License").
10  * You may not use this file except in compliance with the License.
11  *
12  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
13  * or http://www.opensolaris.org/os/licensing.
14  * See the License for the specific language governing permissions
15  * and limitations under the License.
16  *
17  * When distributing Covered Code, include this CDDL HEADER in each
18  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
19  * If applicable, add the following below this CDDL HEADER, with the
20  * fields enclosed by brackets "[]" replaced with your own identifying
21  * information: Portions Copyright [yyyy] [name of copyright owner]
22  *
23  * CDDL HEADER END
24  */
25 /*
26  * Copyright 2007 Sun Microsystems, Inc.
27  * All rights reserved.  Use is subject to license terms.
28  */
29 /* Copyright (c) 1990 Mentat Inc. */
30 
31 #pragma ident	"%Z%%M%	%I%	%E% SMI"
32 
33 /*
34  * Procedures for the kernel part of DVMRP,
35  * a Distance-Vector Multicast Routing Protocol.
36  * (See RFC-1075)
37  * Written by David Waitzman, BBN Labs, August 1988.
38  * Modified by Steve Deering, Stanford, February 1989.
39  * Modified by Mark J. Steiglitz, Stanford, May, 1991
40  * Modified by Van Jacobson, LBL, January 1993
41  * Modified by Ajit Thyagarajan, PARC, August 1993
42  * Modified by Bill Fenner, PARC, April 1995
43  *
44  * MROUTING 3.5
45  */
46 
47 /*
48  * TODO
49  * - function pointer field in vif, void *vif_sendit()
50  */
51 
52 #include <sys/types.h>
53 #include <sys/stream.h>
54 #include <sys/stropts.h>
55 #include <sys/strlog.h>
56 #include <sys/systm.h>
57 #include <sys/ddi.h>
58 #include <sys/cmn_err.h>
59 #include <sys/zone.h>
60 
61 #include <sys/param.h>
62 #include <sys/socket.h>
63 #include <sys/vtrace.h>
64 #include <sys/debug.h>
65 #include <net/if.h>
66 #include <sys/sockio.h>
67 #include <netinet/in.h>
68 #include <net/if_dl.h>
69 
70 #include <inet/common.h>
71 #include <inet/mi.h>
72 #include <inet/nd.h>
73 #include <inet/mib2.h>
74 #include <netinet/ip6.h>
75 #include <inet/ip.h>
76 #include <inet/snmpcom.h>
77 
78 #include <netinet/igmp.h>
79 #include <netinet/igmp_var.h>
80 #include <netinet/udp.h>
81 #include <netinet/ip_mroute.h>
82 #include <inet/ip_multi.h>
83 #include <inet/ip_ire.h>
84 #include <inet/ip_if.h>
85 #include <inet/ipclassifier.h>
86 
87 #include <netinet/pim.h>
88 
89 
90 /*
91  * MT Design:
92  *
93  * There are three main data structures viftable, mfctable and tbftable that
94  * need to be protected against MT races.
95  *
 * viftable is a fixed length array of vif structs. There is no lock to
 * protect the whole array; instead each struct is protected by its own
 * individual lock. The value of v_marks in conjunction with the value of
 * v_refcnt determines the current state of a vif structure. One special
 * state that needs mention is when the vif is marked VIF_MARK_NOTINUSE but
 * refcnt != 0. This indicates that the vif is being initialized.
 * Each structure is freed when the refcnt goes down to zero. If a delete
 * comes in when the refcnt is > 1, the vif structure is marked
 * VIF_MARK_CONDEMNED, which prevents the struct from further use.  When the
 * refcnt goes to zero the struct is freed and is marked VIF_MARK_NOTINUSE.
 * The vif struct stores a pointer to the ipif in v_ipif; to prevent the
 * ipif/ill from going away a refhold is put on the ipif before using it.
 * See lock_good_vif() and unlock_good_vif().
109  *
110  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
111  * of the vif struct.
112  *
113  * tbftable is also a fixed length array of tbf structs and is only accessed
114  * via v_tbf.  It is protected by its own lock tbf_lock.
115  *
116  * Lock Ordering is
117  * v_lock --> tbf_lock
 * v_lock --> ill_lock
119  *
 * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
 * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker;
 * it also maintains a state. These fields are protected by a lock (mfcb_lock).
 * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
 * protect the struct elements.
125  *
 * mfc structs are dynamically allocated and are singly linked
 * at the head of the chain. When an mfc structure is to be deleted
 * it is marked condemned and so is the state in the bucket struct.
 * When the last walker of the hash bucket exits, all the mfc structs
 * marked condemned are freed.
131  *
132  * Locking Hierarchy:
133  * The bucket lock should be acquired before the mfc struct lock.
134  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
135  * operations on the bucket struct.
136  *
 * last_encap_lock and numvifs_mutex should be acquired after
 * acquiring vif or mfc locks. These locks protect some global variables.
139  *
 * The statistics are not currently protected by a lock,
 * causing the stats to be approximate, not exact.
142  */
143 
/* Vif index meaning "no route for src" (value borrowed from mrouted). */
#define	NO_VIF	MAXVIFS
145 
146 /*
147  * Timeouts:
148  * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
149  *	nexpire[MFCTBLSIZE], the number of times expire has been called.
150  *	SunOS 5.x uses mfc->timeout for each mfc.
151  *	Some Unixes are limited in the number of simultaneous timeouts
152  * 	that can be run, SunOS 5.x does not have this restriction.
153  */
154 
/*
 * In BSD, EXPIRE_TIMEOUT is the interval at which expire_upcalls() is
 * called and UPCALL_EXPIRE is the number of such timeouts before a
 * particular upcall expires.  The time until expiration is therefore
 * EXPIRE_TIMEOUT * UPCALL_EXPIRE.
 */
#define	EXPIRE_TIMEOUT	(hz / 4)	/* 4x / second */
#define	UPCALL_EXPIRE	6		/* number of timeouts */
162 
/*
 * Hash a (source, group) address pair to an mfctable bucket index.
 * Each address is folded with itself shifted right by 10 and by 20 bits;
 * the two folds are XORed together and reduced by MFCHASHMOD().
 */
#define	MFCHASH(a, g)	MFCHASHMOD(			\
	(a) ^ ((a) >> 10) ^ ((a) >> 20) ^		\
	(g) ^ ((g) >> 10) ^ ((g) >> 20))
168 
/* Interval, in clock ticks, used for token bucket reprocessing. */
#define	TBF_REPROCESS	(hz / 100)	/* 100x / second */
170 
/* Marker identifying a PIM packet that arrived on a Register interface. */
#define	PIM_REGISTER_MARKER	0xffffffff
173 
174 /* Function declarations */
175 static int	add_mfc(struct mfcctl *, ip_stack_t *);
176 static int	add_vif(struct vifctl *, conn_t *, mblk_t *, ip_stack_t *);
177 static int	del_mfc(struct mfcctl *, ip_stack_t *);
178 static int	del_vif(vifi_t *, conn_t *, mblk_t *, ip_stack_t *);
179 static void	del_vifp(struct vif *);
180 static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
181 static void	expire_upcalls(void *);
182 static void	fill_route(struct mfc *, struct mfcctl *, ip_stack_t *);
183 static void	free_queue(struct mfc *);
184 static int	get_assert(uchar_t *, ip_stack_t *);
185 static int	get_lsg_cnt(struct sioc_lsg_req *, ip_stack_t *);
186 static int	get_sg_cnt(struct sioc_sg_req *, ip_stack_t *);
187 static int	get_version(uchar_t *);
188 static int	get_vif_cnt(struct sioc_vif_req *, ip_stack_t *);
189 static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
190 		    ipaddr_t, struct mfc *);
191 static int	ip_mrouter_init(conn_t *, uchar_t *, int, ip_stack_t *);
192 static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
193 static int	register_mforward(queue_t *, mblk_t *, ill_t *);
194 static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
195 static int	set_assert(int *, ip_stack_t *);
196 
197 /*
198  * Token Bucket Filter functions
199  */
200 static int  priority(struct vif *, ipha_t *);
201 static void tbf_control(struct vif *, mblk_t *, ipha_t *);
202 static int  tbf_dq_sel(struct vif *, ipha_t *);
203 static void tbf_process_q(struct vif *);
204 static void tbf_queue(struct vif *, mblk_t *);
205 static void tbf_reprocess_q(void *);
206 static void tbf_send_packet(struct vif *, mblk_t *);
207 static void tbf_update_tokens(struct vif *);
208 static void release_mfc(struct mfcb *);
209 
210 static boolean_t is_mrouter_off(ip_stack_t *);
211 /*
212  * Encapsulation packets
213  */
214 
/* TTL placed in the prototype header for encapsulated packets. */
#define	ENCAP_TTL	64
216 
217 /* prototype IP hdr for encapsulated packets */
218 static ipha_t multicast_encap_iphdr = {
219 	IP_SIMPLE_HDR_VERSION,
220 	0,				/* tos */
221 	sizeof (ipha_t),		/* total length */
222 	0,				/* id */
223 	0,				/* frag offset */
224 	ENCAP_TTL, IPPROTO_ENCAP,
225 	0,				/* checksum */
226 };
227 
/*
 * Rate limit for assert notification messages, in nsec.
 *
 * The value does not fit in a 32-bit int, so it carries an explicit LL
 * suffix instead of leaving the choice of type to the compiler mode.
 */
#define	ASSERT_MSG_TIME		3000000000LL
232 
233 
/* Take a reference on a vif; v_refcnt is updated while holding v_lock. */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	++(vifp)->v_refcnt;			\
	mutex_exit(&(vifp)->v_lock);		\
}
239 
/*
 * Drop a vif reference; the caller must already hold v_lock.
 * If this was the last reference on a condemned vif, del_vifp() is called
 * with v_lock still held.  Otherwise v_lock is released here.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	if (--(vifp)->v_refcnt != 0 ||				\
	    !((vifp)->v_marks & VIF_MARK_CONDEMNED)) {		\
		mutex_exit(&(vifp)->v_lock);			\
	} else {						\
		del_vifp(vifp);					\
	}							\
}
249 
/*
 * Drop a vif reference when v_lock is not held: acquire v_lock and then
 * release the reference exactly as VIF_REFRELE_LOCKED() does.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
260 
/*
 * Register a walker on an mfc bucket.  The ASSERT after the increment
 * trips if the reference count wrapped around to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	++(mfcb)->mfcb_refcnt;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
267 
/*
 * Unregister a walker from an mfc bucket.  When the last walker leaves a
 * bucket marked MFCB_MARK_CONDEMNED, release_mfc() is called with
 * mfcb_lock held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	(mfcb)->mfcb_refcnt--;					\
	if ((mfcb)->mfcb_refcnt == 0 &&				\
	    ((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
		release_mfc(mfcb);				\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
277 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that
 * are marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	pointer to the hash bucket (struct mfcb) to search
 *	o, g	origin and group addresses to match
 *	rt	lvalue that receives the matching struct mfc *, or NULL
 *
 * Every argument is parenthesized in the expansion so that arbitrary
 * expressions (e.g. &table[i]) can be passed safely.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	for (_mb_rt = (mfcbp)->mfcb_mfc; _mb_rt != NULL; \
	    _mb_rt = _mb_rt->mfc_next) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
	} \
}
299 
/*
 * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
 * are inefficient. We use gethrestime() which returns a timespec_t with
 * sec and nsec, the resolution is machine dependent.
 * The following 2 macros have been changed to use nsec instead of usec.
 */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b), in nanoseconds, into delta.
 * Delta should be a hrtime_t.  Differences of one or two seconds are
 * handled by additions; any other difference uses a multiplication,
 * which is done in 64 bits so that a gap of three or more seconds does
 * not overflow int arithmetic.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
			(delta) += 1000000000; \
			/*FALLTHROUGH*/ \
		case 1: \
			(delta) += 1000000000; \
			break; \
		default: \
			(delta) += (1000000000LL * xxs); \
		} \
	} \
}
328 
/* True when time a is strictly earlier than time b (sec, then nsec). */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
331 
332 /*
333  * Handle MRT setsockopt commands to modify the multicast routing tables.
334  */
335 int
336 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
337     int datalen, mblk_t *first_mp)
338 {
339 	conn_t		*connp = Q_TO_CONN(q);
340 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
341 
342 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
343 	if (cmd != MRT_INIT && connp != ipst->ips_ip_g_mrouter) {
344 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
345 		return (EACCES);
346 	}
347 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
348 
349 	if (checkonly) {
350 		/*
351 		 * do not do operation, just pretend to - new T_CHECK
352 		 * Note: Even routines further on can probably fail but
353 		 * this T_CHECK stuff is only to please XTI so it not
354 		 * necessary to be perfect.
355 		 */
356 		switch (cmd) {
357 		case MRT_INIT:
358 		case MRT_DONE:
359 		case MRT_ADD_VIF:
360 		case MRT_DEL_VIF:
361 		case MRT_ADD_MFC:
362 		case MRT_DEL_MFC:
363 		case MRT_ASSERT:
364 			return (0);
365 		default:
366 			return (EOPNOTSUPP);
367 		}
368 	}
369 
370 	/*
371 	 * make sure no command is issued after multicast routing has been
372 	 * turned off.
373 	 */
374 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
375 		if (is_mrouter_off(ipst))
376 			return (EINVAL);
377 	}
378 
379 	switch (cmd) {
380 	case MRT_INIT:	return (ip_mrouter_init(connp, data, datalen, ipst));
381 	case MRT_DONE:	return (ip_mrouter_done(first_mp, ipst));
382 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, connp,
383 			    first_mp, ipst));
384 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, connp, first_mp,
385 			    ipst));
386 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data, ipst));
387 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data, ipst));
388 	case MRT_ASSERT:   return (set_assert((int *)data, ipst));
389 	default:	   return (EOPNOTSUPP);
390 	}
391 }
392 
393 /*
394  * Handle MRT getsockopt commands
395  */
396 int
397 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
398 {
399 	conn_t		*connp = Q_TO_CONN(q);
400 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
401 
402 	if (connp != ipst->ips_ip_g_mrouter)
403 		return (EACCES);
404 
405 	switch (cmd) {
406 	case MRT_VERSION:	return (get_version((uchar_t *)data));
407 	case MRT_ASSERT:	return (get_assert((uchar_t *)data, ipst));
408 	default:		return (EOPNOTSUPP);
409 	}
410 }
411 
412 /*
413  * Handle ioctl commands to obtain information from the cache.
414  * Called with shared access to IP. These are read_only ioctls.
415  */
416 /* ARGSUSED */
417 int
418 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
419     ip_ioctl_cmd_t *ipip, void *if_req)
420 {
421 	mblk_t	*mp1;
422 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
423 	conn_t		*connp = Q_TO_CONN(q);
424 	ip_stack_t	*ipst = connp->conn_netstack->netstack_ip;
425 
426 	/* Existence verified in ip_wput_nondata */
427 	mp1 = mp->b_cont->b_cont;
428 
429 	switch (iocp->ioc_cmd) {
430 	case (SIOCGETVIFCNT):
431 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr, ipst));
432 	case (SIOCGETSGCNT):
433 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr, ipst));
434 	case (SIOCGETLSGCNT):
435 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr, ipst));
436 	default:
437 		return (EINVAL);
438 	}
439 }
440 
441 /*
442  * Returns the packet, byte, rpf-failure count for the source, group provided.
443  */
444 static int
445 get_sg_cnt(struct sioc_sg_req *req, ip_stack_t *ipst)
446 {
447 	struct mfc *rt;
448 	struct mfcb *mfcbp;
449 
450 	mfcbp = &ipst->ips_mfcs[MFCHASH(req->src.s_addr, req->grp.s_addr)];
451 	MFCB_REFHOLD(mfcbp);
452 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
453 
454 	if (rt != NULL) {
455 		mutex_enter(&rt->mfc_mutex);
456 		req->pktcnt   = rt->mfc_pkt_cnt;
457 		req->bytecnt  = rt->mfc_byte_cnt;
458 		req->wrong_if = rt->mfc_wrong_if;
459 		mutex_exit(&rt->mfc_mutex);
460 	} else
461 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
462 
463 	MFCB_REFRELE(mfcbp);
464 	return (0);
465 }
466 
467 /*
468  * Returns the packet, byte, rpf-failure count for the source, group provided.
469  * Uses larger counters and IPv6 addresses.
470  */
471 /* ARGSUSED XXX until implemented */
472 static int
473 get_lsg_cnt(struct sioc_lsg_req *req, ip_stack_t *ipst)
474 {
475 	/* XXX TODO SIOCGETLSGCNT */
476 	return (ENXIO);
477 }
478 
479 /*
480  * Returns the input and output packet and byte counts on the vif provided.
481  */
482 static int
483 get_vif_cnt(struct sioc_vif_req *req, ip_stack_t *ipst)
484 {
485 	vifi_t vifi = req->vifi;
486 
487 	if (vifi >= ipst->ips_numvifs)
488 		return (EINVAL);
489 
490 	/*
491 	 * No locks here, an approximation is fine.
492 	 */
493 	req->icount = ipst->ips_vifs[vifi].v_pkt_in;
494 	req->ocount = ipst->ips_vifs[vifi].v_pkt_out;
495 	req->ibytes = ipst->ips_vifs[vifi].v_bytes_in;
496 	req->obytes = ipst->ips_vifs[vifi].v_bytes_out;
497 
498 	return (0);
499 }
500 
501 static int
502 get_version(uchar_t *data)
503 {
504 	int *v = (int *)data;
505 
506 	*v = 0x0305;	/* XXX !!!! */
507 
508 	return (0);
509 }
510 
511 /*
512  * Set PIM assert processing global.
513  */
514 static int
515 set_assert(int *i, ip_stack_t *ipst)
516 {
517 	if ((*i != 1) && (*i != 0))
518 		return (EINVAL);
519 
520 	ipst->ips_pim_assert = *i;
521 
522 	return (0);
523 }
524 
525 /*
526  * Get PIM assert processing global.
527  */
528 static int
529 get_assert(uchar_t *data, ip_stack_t *ipst)
530 {
531 	int *i = (int *)data;
532 
533 	*i = ipst->ips_pim_assert;
534 
535 	return (0);
536 }
537 
538 /*
539  * Enable multicast routing.
540  */
541 static int
542 ip_mrouter_init(conn_t *connp, uchar_t *data, int datalen, ip_stack_t *ipst)
543 {
544 	int	*v;
545 
546 	if (data == NULL || (datalen != sizeof (int)))
547 		return (ENOPROTOOPT);
548 
549 	v = (int *)data;
550 	if (*v != 1)
551 		return (ENOPROTOOPT);
552 
553 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
554 	if (ipst->ips_ip_g_mrouter != NULL) {
555 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
556 		return (EADDRINUSE);
557 	}
558 
559 	/*
560 	 * MRT_INIT should only be allowed for RAW sockets, but we double
561 	 * check.
562 	 */
563 	if (!IPCL_IS_RAWIP(connp)) {
564 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
565 		return (EINVAL);
566 	}
567 
568 	ipst->ips_ip_g_mrouter = connp;
569 	connp->conn_multi_router = 1;
570 	/* In order for tunnels to work we have to turn ip_g_forward on */
571 	if (!WE_ARE_FORWARDING(ipst)) {
572 		if (ipst->ips_ip_mrtdebug > 1) {
573 			(void) mi_strlog(connp->conn_rq, 1, SL_TRACE,
574 			    "ip_mrouter_init: turning on forwarding");
575 		}
576 		ipst->ips_saved_ip_g_forward = ipst->ips_ip_g_forward;
577 		ipst->ips_ip_g_forward = IP_FORWARD_ALWAYS;
578 	}
579 
580 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
581 	return (0);
582 }
583 
584 void
585 ip_mrouter_stack_init(ip_stack_t *ipst)
586 {
587 	mutex_init(&ipst->ips_ip_g_mrouter_mutex, NULL, MUTEX_DEFAULT, NULL);
588 
589 	ipst->ips_vifs = kmem_zalloc(sizeof (struct vif) * (MAXVIFS+1),
590 	    KM_SLEEP);
591 	ipst->ips_mrtstat = kmem_zalloc(sizeof (struct mrtstat), KM_SLEEP);
592 	/*
593 	 * mfctable:
594 	 * Includes all mfcs, including waiting upcalls.
595 	 * Multiple mfcs per bucket.
596 	 */
597 	ipst->ips_mfcs = kmem_zalloc(sizeof (struct mfcb) * MFCTBLSIZ,
598 	    KM_SLEEP);
599 	/*
600 	 * Define the token bucket filter structures.
601 	 * tbftable -> each vif has one of these for storing info.
602 	 */
603 	ipst->ips_tbfs = kmem_zalloc(sizeof (struct tbf) * MAXVIFS, KM_SLEEP);
604 
605 	mutex_init(&ipst->ips_last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
606 
607 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
608 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
609 }
610 
611 /*
612  * Disable multicast routing.
613  * Didn't use global timeout_val (BSD version), instead check the mfctable.
614  */
615 int
616 ip_mrouter_done(mblk_t *mp, ip_stack_t *ipst)
617 {
618 	conn_t		*mrouter;
619 	vifi_t 		vifi;
620 	struct mfc	*mfc_rt;
621 	int		i;
622 
623 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
624 	if (ipst->ips_ip_g_mrouter == NULL) {
625 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
626 		return (EINVAL);
627 	}
628 
629 	mrouter = ipst->ips_ip_g_mrouter;
630 
631 	if (ipst->ips_saved_ip_g_forward != -1) {
632 		if (ipst->ips_ip_mrtdebug > 1) {
633 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
634 			    "ip_mrouter_done: turning off forwarding");
635 		}
636 		ipst->ips_ip_g_forward = ipst->ips_saved_ip_g_forward;
637 		ipst->ips_saved_ip_g_forward = -1;
638 	}
639 
640 	/*
641 	 * Always clear cache when vifs change.
642 	 * No need to get ipst->ips_last_encap_lock since we are running as
643 	 * a writer.
644 	 */
645 	mutex_enter(&ipst->ips_last_encap_lock);
646 	ipst->ips_last_encap_src = 0;
647 	ipst->ips_last_encap_vif = NULL;
648 	mutex_exit(&ipst->ips_last_encap_lock);
649 	mrouter->conn_multi_router = 0;
650 
651 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
652 
653 	/*
654 	 * For each phyint in use,
655 	 * disable promiscuous reception of all IP multicasts.
656 	 */
657 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
658 		struct vif *vifp = ipst->ips_vifs + vifi;
659 
660 		mutex_enter(&vifp->v_lock);
661 		/*
662 		 * if the vif is active mark it condemned.
663 		 */
664 		if (vifp->v_marks & VIF_MARK_GOOD) {
665 			ASSERT(vifp->v_ipif != NULL);
666 			ipif_refhold(vifp->v_ipif);
667 			/* Phyint only */
668 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
669 				ipif_t *ipif = vifp->v_ipif;
670 				ipsq_t  *ipsq;
671 				boolean_t suc;
672 				ill_t *ill;
673 
674 				ill = ipif->ipif_ill;
675 				suc = B_FALSE;
676 				if (mp == NULL) {
677 					/*
678 					 * being called from ip_close,
679 					 * lets do it synchronously.
680 					 * Clear VIF_MARK_GOOD and
681 					 * set VIF_MARK_CONDEMNED.
682 					 */
683 					vifp->v_marks &= ~VIF_MARK_GOOD;
684 					vifp->v_marks |= VIF_MARK_CONDEMNED;
685 					mutex_exit(&(vifp)->v_lock);
686 					suc = ipsq_enter(ill, B_FALSE);
687 					ipsq = ill->ill_phyint->phyint_ipsq;
688 				} else {
689 					ipsq = ipsq_try_enter(ipif, NULL,
690 					    mrouter->conn_wq, mp,
691 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
692 					if (ipsq == NULL) {
693 						mutex_exit(&(vifp)->v_lock);
694 						ipif_refrele(ipif);
695 						return (EINPROGRESS);
696 					}
697 					/*
698 					 * Clear VIF_MARK_GOOD and
699 					 * set VIF_MARK_CONDEMNED.
700 					 */
701 					vifp->v_marks &= ~VIF_MARK_GOOD;
702 					vifp->v_marks |= VIF_MARK_CONDEMNED;
703 					mutex_exit(&(vifp)->v_lock);
704 					suc = B_TRUE;
705 				}
706 
707 				if (suc) {
708 					(void) ip_delmulti(INADDR_ANY, ipif,
709 					    B_TRUE, B_TRUE);
710 					ipsq_exit(ipsq);
711 				}
712 				mutex_enter(&vifp->v_lock);
713 			}
714 			/*
715 			 * decreases the refcnt added in add_vif.
716 			 * and release v_lock.
717 			 */
718 			VIF_REFRELE_LOCKED(vifp);
719 		} else {
720 			mutex_exit(&vifp->v_lock);
721 			continue;
722 		}
723 	}
724 
725 	mutex_enter(&ipst->ips_numvifs_mutex);
726 	ipst->ips_numvifs = 0;
727 	ipst->ips_pim_assert = 0;
728 	ipst->ips_reg_vif_num = ALL_VIFS;
729 	mutex_exit(&ipst->ips_numvifs_mutex);
730 
731 	/*
732 	 * Free upcall msgs.
733 	 * Go through mfctable and stop any outstanding upcall
734 	 * timeouts remaining on mfcs.
735 	 */
736 	for (i = 0; i < MFCTBLSIZ; i++) {
737 		mutex_enter(&ipst->ips_mfcs[i].mfcb_lock);
738 		ipst->ips_mfcs[i].mfcb_refcnt++;
739 		ipst->ips_mfcs[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
740 		mutex_exit(&ipst->ips_mfcs[i].mfcb_lock);
741 		mfc_rt = ipst->ips_mfcs[i].mfcb_mfc;
742 		while (mfc_rt) {
743 			/* Free upcalls */
744 			mutex_enter(&mfc_rt->mfc_mutex);
745 			if (mfc_rt->mfc_rte != NULL) {
746 				if (mfc_rt->mfc_timeout_id != 0) {
747 					/*
748 					 * OK to drop the lock as we have
749 					 * a refcnt on the bucket. timeout
750 					 * can fire but it will see that
751 					 * mfc_timeout_id == 0 and not do
752 					 * anything. see expire_upcalls().
753 					 */
754 					mfc_rt->mfc_timeout_id = 0;
755 					mutex_exit(&mfc_rt->mfc_mutex);
756 					(void) untimeout(
757 					    mfc_rt->mfc_timeout_id);
758 						mfc_rt->mfc_timeout_id = 0;
759 					mutex_enter(&mfc_rt->mfc_mutex);
760 
761 					/*
762 					 * all queued upcall packets
763 					 * and mblk will be freed in
764 					 * release_mfc().
765 					 */
766 				}
767 			}
768 
769 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
770 
771 			mutex_exit(&mfc_rt->mfc_mutex);
772 			mfc_rt = mfc_rt->mfc_next;
773 		}
774 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
775 	}
776 
777 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
778 	ipst->ips_ip_g_mrouter = NULL;
779 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
780 	return (0);
781 }
782 
783 void
784 ip_mrouter_stack_destroy(ip_stack_t *ipst)
785 {
786 	struct mfcb *mfcbp;
787 	struct mfc  *rt;
788 	int i;
789 
790 	for (i = 0; i < MFCTBLSIZ; i++) {
791 		mfcbp = &ipst->ips_mfcs[i];
792 
793 		while ((rt = mfcbp->mfcb_mfc) != NULL) {
794 			(void) printf("ip_mrouter_stack_destroy: free for %d\n",
795 			    i);
796 
797 			mfcbp->mfcb_mfc = rt->mfc_next;
798 			free_queue(rt);
799 			mi_free(rt);
800 		}
801 	}
802 	kmem_free(ipst->ips_vifs, sizeof (struct vif) * (MAXVIFS+1));
803 	ipst->ips_vifs = NULL;
804 	kmem_free(ipst->ips_mrtstat, sizeof (struct mrtstat));
805 	ipst->ips_mrtstat = NULL;
806 	kmem_free(ipst->ips_mfcs, sizeof (struct mfcb) * MFCTBLSIZ);
807 	ipst->ips_mfcs = NULL;
808 	kmem_free(ipst->ips_tbfs, sizeof (struct tbf) * MAXVIFS);
809 	ipst->ips_tbfs = NULL;
810 
811 	mutex_destroy(&ipst->ips_last_encap_lock);
812 	mutex_destroy(&ipst->ips_ip_g_mrouter_mutex);
813 }
814 
815 static boolean_t
816 is_mrouter_off(ip_stack_t *ipst)
817 {
818 	conn_t	*mrouter;
819 
820 	mutex_enter(&ipst->ips_ip_g_mrouter_mutex);
821 	if (ipst->ips_ip_g_mrouter == NULL) {
822 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
823 		return (B_TRUE);
824 	}
825 
826 	mrouter = ipst->ips_ip_g_mrouter;
827 	if (mrouter->conn_multi_router == 0) {
828 		mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
829 		return (B_TRUE);
830 	}
831 	mutex_exit(&ipst->ips_ip_g_mrouter_mutex);
832 	return (B_FALSE);
833 }
834 
835 static void
836 unlock_good_vif(struct vif *vifp)
837 {
838 	ASSERT(vifp->v_ipif != NULL);
839 	ipif_refrele(vifp->v_ipif);
840 	VIF_REFRELE(vifp);
841 }
842 
843 static boolean_t
844 lock_good_vif(struct vif *vifp)
845 {
846 	mutex_enter(&vifp->v_lock);
847 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
848 		mutex_exit(&vifp->v_lock);
849 		return (B_FALSE);
850 	}
851 
852 	ASSERT(vifp->v_ipif != NULL);
853 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
854 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
855 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
856 		mutex_exit(&vifp->v_lock);
857 		return (B_FALSE);
858 	}
859 	ipif_refhold_locked(vifp->v_ipif);
860 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
861 	vifp->v_refcnt++;
862 	mutex_exit(&vifp->v_lock);
863 	return (B_TRUE);
864 }
865 
866 /*
867  * Add a vif to the vif table.
868  */
869 static int
870 add_vif(struct vifctl *vifcp, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
871 {
872 	struct vif	*vifp = ipst->ips_vifs + vifcp->vifc_vifi;
873 	ipif_t		*ipif;
874 	int		error;
875 	struct tbf	*v_tbf = ipst->ips_tbfs + vifcp->vifc_vifi;
876 	ipsq_t  	*ipsq;
877 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
878 
879 	ASSERT(connp != NULL);
880 
881 	if (vifcp->vifc_vifi >= MAXVIFS)
882 		return (EINVAL);
883 
884 	if (is_mrouter_off(ipst))
885 		return (EINVAL);
886 
887 	mutex_enter(&vifp->v_lock);
888 	/*
889 	 * Viftable entry should be 0.
890 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
891 	 * initialized.
892 	 *
893 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
894 	 * request while the delete is in progress, mrouted only sends add
895 	 * requests when a new interface is added and the new interface cannot
896 	 * have the same vifi as an existing interface. We make sure that
897 	 * ill_delete will block till the vif is deleted by adding a refcnt
898 	 * to ipif in del_vif().
899 	 */
900 	if (vifp->v_lcl_addr.s_addr != 0 ||
901 	    vifp->v_marks != 0 ||
902 	    vifp->v_refcnt != 0) {
903 		mutex_exit(&vifp->v_lock);
904 		return (EADDRINUSE);
905 	}
906 
907 	/* Incoming vif should not be 0 */
908 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
909 		mutex_exit(&vifp->v_lock);
910 		return (EINVAL);
911 	}
912 
913 	vifp->v_refcnt++;
914 	mutex_exit(&vifp->v_lock);
915 	/* Find the interface with the local address */
916 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
917 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
918 	    ip_restart_optmgmt, &error, ipst);
919 	if (ipif == NULL) {
920 		VIF_REFRELE(vifp);
921 		if (error == EINPROGRESS)
922 			return (error);
923 		return (EADDRNOTAVAIL);
924 	}
925 
926 	/*
927 	 * We have to be exclusive as we have to call ip_addmulti()
928 	 * This is the best position to try to be exclusive in case
929 	 * we have to wait.
930 	 */
931 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
932 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
933 	if ((ipsq) == NULL) {
934 		VIF_REFRELE(vifp);
935 		ipif_refrele(ipif);
936 		return (EINPROGRESS);
937 	}
938 
939 	if (ipst->ips_ip_mrtdebug > 1) {
940 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
941 		    "add_vif: src 0x%x enter",
942 		    vifcp->vifc_lcl_addr.s_addr);
943 	}
944 
945 	mutex_enter(&vifp->v_lock);
946 	/*
947 	 * Always clear cache when vifs change.
948 	 * Needed to ensure that src isn't left over from before vif was added.
949 	 * No need to get last_encap_lock, since we are running as a writer.
950 	 */
951 
952 	mutex_enter(&ipst->ips_last_encap_lock);
953 	ipst->ips_last_encap_src = 0;
954 	ipst->ips_last_encap_vif = NULL;
955 	mutex_exit(&ipst->ips_last_encap_lock);
956 
957 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
958 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
959 			cmn_err(CE_WARN,
960 			    "add_vif: source route tunnels not supported\n");
961 			VIF_REFRELE_LOCKED(vifp);
962 			ipif_refrele(ipif);
963 			ipsq_exit(ipsq);
964 			return (EOPNOTSUPP);
965 		}
966 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
967 
968 	} else {
969 		/* Phyint or Register vif */
970 		if (vifcp->vifc_flags & VIFF_REGISTER) {
971 			/*
972 			 * Note: Since all IPPROTO_IP level options (including
973 			 * MRT_ADD_VIF) are done exclusively via
974 			 * ip_optmgmt_writer(), a lock is not necessary to
975 			 * protect reg_vif_num.
976 			 */
977 			mutex_enter(&ipst->ips_numvifs_mutex);
978 			if (ipst->ips_reg_vif_num == ALL_VIFS) {
979 				ipst->ips_reg_vif_num = vifcp->vifc_vifi;
980 				mutex_exit(&ipst->ips_numvifs_mutex);
981 			} else {
982 				mutex_exit(&ipst->ips_numvifs_mutex);
983 				VIF_REFRELE_LOCKED(vifp);
984 				ipif_refrele(ipif);
985 				ipsq_exit(ipsq);
986 				return (EADDRINUSE);
987 			}
988 		}
989 
990 		/* Make sure the interface supports multicast */
991 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
992 			VIF_REFRELE_LOCKED(vifp);
993 			ipif_refrele(ipif);
994 			if (vifcp->vifc_flags & VIFF_REGISTER) {
995 				mutex_enter(&ipst->ips_numvifs_mutex);
996 				ipst->ips_reg_vif_num = ALL_VIFS;
997 				mutex_exit(&ipst->ips_numvifs_mutex);
998 			}
999 			ipsq_exit(ipsq);
1000 			return (EOPNOTSUPP);
1001 		}
1002 		/* Enable promiscuous reception of all IP mcasts from the if */
1003 		mutex_exit(&vifp->v_lock);
1004 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
1005 		    MODE_IS_EXCLUDE, NULL);
1006 		mutex_enter(&vifp->v_lock);
1007 		/*
1008 		 * since we released the lock lets make sure that
1009 		 * ip_mrouter_done() has not been called.
1010 		 */
1011 		if (error != 0 || is_mrouter_off(ipst)) {
1012 			if (error == 0)
1013 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
1014 				    B_TRUE);
1015 			if (vifcp->vifc_flags & VIFF_REGISTER) {
1016 				mutex_enter(&ipst->ips_numvifs_mutex);
1017 				ipst->ips_reg_vif_num = ALL_VIFS;
1018 				mutex_exit(&ipst->ips_numvifs_mutex);
1019 			}
1020 			VIF_REFRELE_LOCKED(vifp);
1021 			ipif_refrele(ipif);
1022 			ipsq_exit(ipsq);
1023 			return (error?error:EINVAL);
1024 		}
1025 	}
1026 	/* Define parameters for the tbf structure */
1027 	vifp->v_tbf = v_tbf;
1028 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
1029 	vifp->v_tbf->tbf_n_tok = 0;
1030 	vifp->v_tbf->tbf_q_len = 0;
1031 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1032 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1033 
1034 	vifp->v_flags = vifcp->vifc_flags;
1035 	vifp->v_threshold = vifcp->vifc_threshold;
1036 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1037 	vifp->v_ipif = ipif;
1038 	ipif_refrele(ipif);
1039 	/* Scaling up here, allows division by 1024 in critical code.	*/
1040 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1041 	vifp->v_timeout_id = 0;
1042 	/* initialize per vif pkt counters */
1043 	vifp->v_pkt_in = 0;
1044 	vifp->v_pkt_out = 0;
1045 	vifp->v_bytes_in = 0;
1046 	vifp->v_bytes_out = 0;
1047 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1048 
1049 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1050 	mutex_enter(&ipst->ips_numvifs_mutex);
1051 	if (ipst->ips_numvifs <= vifcp->vifc_vifi)
1052 		ipst->ips_numvifs = vifcp->vifc_vifi + 1;
1053 	mutex_exit(&ipst->ips_numvifs_mutex);
1054 
1055 	if (ipst->ips_ip_mrtdebug > 1) {
1056 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1057 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1058 		    vifcp->vifc_vifi,
1059 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1060 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1061 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1062 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1063 	}
1064 
1065 	vifp->v_marks = VIF_MARK_GOOD;
1066 	mutex_exit(&vifp->v_lock);
1067 	ipsq_exit(ipsq);
1068 	return (0);
1069 }
1070 
1071 
1072 /* Delete a vif from the vif table. */
1073 static void
1074 del_vifp(struct vif *vifp)
1075 {
1076 	struct tbf	*t = vifp->v_tbf;
1077 	mblk_t  *mp0;
1078 	vifi_t  vifi;
1079 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
1080 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1081 
1082 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1083 	ASSERT(t != NULL);
1084 
1085 	/*
1086 	 * release the ref we put in vif_del.
1087 	 */
1088 	ASSERT(vifp->v_ipif != NULL);
1089 	ipif_refrele(vifp->v_ipif);
1090 
1091 	if (ipst->ips_ip_mrtdebug > 1) {
1092 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1093 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1094 	}
1095 
1096 	if (vifp->v_timeout_id != 0) {
1097 		(void) untimeout(vifp->v_timeout_id);
1098 		vifp->v_timeout_id = 0;
1099 	}
1100 
1101 	/*
1102 	 * Free packets queued at the interface.
1103 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1104 	 */
1105 	mutex_enter(&t->tbf_lock);
1106 	while (t->tbf_q != NULL) {
1107 		mp0 = t->tbf_q;
1108 		t->tbf_q = t->tbf_q->b_next;
1109 		mp0->b_prev = mp0->b_next = NULL;
1110 		freemsg(mp0);
1111 	}
1112 	mutex_exit(&t->tbf_lock);
1113 
1114 	/*
1115 	 * Always clear cache when vifs change.
1116 	 * No need to get last_encap_lock since we are running as a writer.
1117 	 */
1118 	mutex_enter(&ipst->ips_last_encap_lock);
1119 	if (vifp == ipst->ips_last_encap_vif) {
1120 		ipst->ips_last_encap_vif = NULL;
1121 		ipst->ips_last_encap_src = 0;
1122 	}
1123 	mutex_exit(&ipst->ips_last_encap_lock);
1124 
1125 	mutex_destroy(&t->tbf_lock);
1126 
1127 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1128 
1129 	/* Adjust numvifs down */
1130 	mutex_enter(&ipst->ips_numvifs_mutex);
1131 	for (vifi = ipst->ips_numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1132 		if (ipst->ips_vifs[vifi - 1].v_lcl_addr.s_addr != 0)
1133 			break;
1134 	ipst->ips_numvifs = vifi;
1135 	mutex_exit(&ipst->ips_numvifs_mutex);
1136 
1137 	bzero(vifp, sizeof (*vifp));
1138 }
1139 
1140 static int
1141 del_vif(vifi_t *vifip, conn_t *connp, mblk_t *first_mp, ip_stack_t *ipst)
1142 {
1143 	struct vif	*vifp = ipst->ips_vifs + *vifip;
1144 	ipsq_t  	*ipsq;
1145 
1146 	if (*vifip >= ipst->ips_numvifs)
1147 		return (EINVAL);
1148 
1149 
1150 	mutex_enter(&vifp->v_lock);
1151 	/*
1152 	 * Not initialized
1153 	 * Here we are not looking at the vif that is being initialized
1154 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1155 	 */
1156 	if (vifp->v_lcl_addr.s_addr == 0 ||
1157 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1158 		mutex_exit(&vifp->v_lock);
1159 		return (EADDRNOTAVAIL);
1160 	}
1161 
1162 	/*
1163 	 * This is an optimization, if first_mp == NULL
1164 	 * than we are being called from reset_mrt_vif_ipif()
1165 	 * so we already have exclusive access to the ipsq.
1166 	 * the ASSERT below is a check for this condition.
1167 	 */
1168 	if (first_mp != NULL &&
1169 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1170 		ASSERT(connp != NULL);
1171 		/*
1172 		 * We have to be exclusive as we have to call ip_delmulti()
1173 		 * This is the best position to try to be exclusive in case
1174 		 * we have to wait.
1175 		 */
1176 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1177 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1178 		if ((ipsq) == NULL) {
1179 			mutex_exit(&vifp->v_lock);
1180 			return (EINPROGRESS);
1181 		}
1182 		/* recheck after being exclusive */
1183 		if (vifp->v_lcl_addr.s_addr == 0 ||
1184 		    !vifp->v_marks & VIF_MARK_GOOD) {
1185 			/*
1186 			 * someone beat us.
1187 			 */
1188 			mutex_exit(&vifp->v_lock);
1189 			ipsq_exit(ipsq);
1190 			return (EADDRNOTAVAIL);
1191 		}
1192 	}
1193 
1194 
1195 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1196 
1197 
1198 	/*
1199 	 * add a refhold so that ipif does not go away while
1200 	 * there are still users, this will be released in del_vifp
1201 	 * when we free the vif.
1202 	 */
1203 	ipif_refhold(vifp->v_ipif);
1204 
1205 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1206 	vifp->v_marks &= ~VIF_MARK_GOOD;
1207 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1208 
1209 	/* Phyint only */
1210 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1211 		ipif_t *ipif = vifp->v_ipif;
1212 		ASSERT(ipif != NULL);
1213 		/*
1214 		 * should be OK to drop the lock as we
1215 		 * have marked this as CONDEMNED.
1216 		 */
1217 		mutex_exit(&(vifp)->v_lock);
1218 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1219 		if (first_mp != NULL)
1220 			ipsq_exit(ipsq);
1221 		mutex_enter(&(vifp)->v_lock);
1222 	}
1223 
1224 	/*
1225 	 * decreases the refcnt added in add_vif.
1226 	 */
1227 	VIF_REFRELE_LOCKED(vifp);
1228 	return (0);
1229 }
1230 
1231 /*
1232  * Add an mfc entry.
1233  */
1234 static int
1235 add_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1236 {
1237 	struct mfc *rt;
1238 	struct rtdetq *rte;
1239 	ushort_t nstl;
1240 	int i;
1241 	struct mfcb *mfcbp;
1242 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1243 
1244 	/*
1245 	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
1246 	 * did not have a real route for pkt.
1247 	 * We want this pkt without rt installed in the mfctable to prevent
1248 	 * multiiple tries, so go ahead and put it in mfctable, it will
1249 	 * be discarded later in ip_mdq() because the child is NULL.
1250 	 */
1251 
1252 	/* Error checking, out of bounds? */
1253 	if (mfccp->mfcc_parent > MAXVIFS) {
1254 		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
1255 		    (int)mfccp->mfcc_parent));
1256 		return (EINVAL);
1257 	}
1258 
1259 	if ((mfccp->mfcc_parent != NO_VIF) &&
1260 	    (ipst->ips_vifs[mfccp->mfcc_parent].v_ipif == NULL)) {
1261 		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
1262 		    (int)mfccp->mfcc_parent));
1263 		return (EINVAL);
1264 	}
1265 
1266 	if (is_mrouter_off(ipst)) {
1267 		return (EINVAL);
1268 	}
1269 
1270 	mfcbp = &ipst->ips_mfcs[MFCHASH(mfccp->mfcc_origin.s_addr,
1271 	    mfccp->mfcc_mcastgrp.s_addr)];
1272 	MFCB_REFHOLD(mfcbp);
1273 	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
1274 	    mfccp->mfcc_mcastgrp.s_addr, rt);
1275 
1276 	/* If an entry already exists, just update the fields */
1277 	if (rt) {
1278 		if (ipst->ips_ip_mrtdebug > 1) {
1279 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1280 			    "add_mfc: update o %x grp %x parent %x",
1281 			    ntohl(mfccp->mfcc_origin.s_addr),
1282 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1283 			    mfccp->mfcc_parent);
1284 		}
1285 		mutex_enter(&rt->mfc_mutex);
1286 		rt->mfc_parent = mfccp->mfcc_parent;
1287 
1288 		mutex_enter(&ipst->ips_numvifs_mutex);
1289 		for (i = 0; i < (int)ipst->ips_numvifs; i++)
1290 			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1291 		mutex_exit(&ipst->ips_numvifs_mutex);
1292 		mutex_exit(&rt->mfc_mutex);
1293 
1294 		MFCB_REFRELE(mfcbp);
1295 		return (0);
1296 	}
1297 
1298 	/*
1299 	 * Find the entry for which the upcall was made and update.
1300 	 */
1301 	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
1302 		mutex_enter(&rt->mfc_mutex);
1303 		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
1304 		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
1305 		    (rt->mfc_rte != NULL) &&
1306 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1307 			if (nstl++ != 0)
1308 				cmn_err(CE_WARN,
1309 				    "add_mfc: %s o %x g %x p %x",
1310 				    "multiple kernel entries",
1311 				    ntohl(mfccp->mfcc_origin.s_addr),
1312 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1313 				    mfccp->mfcc_parent);
1314 
1315 			if (ipst->ips_ip_mrtdebug > 1) {
1316 				(void) mi_strlog(mrouter->conn_rq, 1,
1317 				    SL_TRACE,
1318 				    "add_mfc: o %x g %x p %x",
1319 				    ntohl(mfccp->mfcc_origin.s_addr),
1320 				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1321 				    mfccp->mfcc_parent);
1322 			}
1323 			fill_route(rt, mfccp, ipst);
1324 
1325 			/*
1326 			 * Prevent cleanup of cache entry.
1327 			 * Timer starts in ip_mforward.
1328 			 */
1329 			if (rt->mfc_timeout_id != 0) {
1330 				timeout_id_t id;
1331 				id = rt->mfc_timeout_id;
1332 				/*
1333 				 * setting id to zero will avoid this
1334 				 * entry from being cleaned up in
1335 				 * expire_up_calls().
1336 				 */
1337 				rt->mfc_timeout_id = 0;
1338 				/*
1339 				 * dropping the lock is fine as we
1340 				 * have a refhold on the bucket.
1341 				 * so mfc cannot be freed.
1342 				 * The timeout can fire but it will see
1343 				 * that mfc_timeout_id == 0 and not cleanup.
1344 				 */
1345 				mutex_exit(&rt->mfc_mutex);
1346 				(void) untimeout(id);
1347 				mutex_enter(&rt->mfc_mutex);
1348 			}
1349 
1350 			/*
1351 			 * Send all pkts that are queued waiting for the upcall.
1352 			 * ip_mdq param tun set to 0 -
1353 			 * the return value of ip_mdq() isn't used here,
1354 			 * so value we send doesn't matter.
1355 			 */
1356 			while (rt->mfc_rte != NULL) {
1357 				rte = rt->mfc_rte;
1358 				rt->mfc_rte = rte->rte_next;
1359 				mutex_exit(&rt->mfc_mutex);
1360 				(void) ip_mdq(rte->mp, (ipha_t *)
1361 				    rte->mp->b_rptr, rte->ill, 0, rt);
1362 				freemsg(rte->mp);
1363 				mi_free((char *)rte);
1364 				mutex_enter(&rt->mfc_mutex);
1365 			}
1366 		}
1367 		mutex_exit(&rt->mfc_mutex);
1368 	}
1369 
1370 
1371 	/*
1372 	 * It is possible that an entry is being inserted without an upcall
1373 	 */
1374 	if (nstl == 0) {
1375 		mutex_enter(&(mfcbp->mfcb_lock));
1376 		if (ipst->ips_ip_mrtdebug > 1) {
1377 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1378 			    "add_mfc: no upcall o %x g %x p %x",
1379 			    ntohl(mfccp->mfcc_origin.s_addr),
1380 			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
1381 			    mfccp->mfcc_parent);
1382 		}
1383 		if (is_mrouter_off(ipst)) {
1384 			mutex_exit(&mfcbp->mfcb_lock);
1385 			MFCB_REFRELE(mfcbp);
1386 			return (EINVAL);
1387 		}
1388 
1389 		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {
1390 
1391 			mutex_enter(&rt->mfc_mutex);
1392 			if ((rt->mfc_origin.s_addr ==
1393 			    mfccp->mfcc_origin.s_addr) &&
1394 			    (rt->mfc_mcastgrp.s_addr ==
1395 			    mfccp->mfcc_mcastgrp.s_addr) &&
1396 			    (!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
1397 				fill_route(rt, mfccp, ipst);
1398 				mutex_exit(&rt->mfc_mutex);
1399 				break;
1400 			}
1401 			mutex_exit(&rt->mfc_mutex);
1402 		}
1403 
1404 		/* No upcall, so make a new entry into mfctable */
1405 		if (rt == NULL) {
1406 			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1407 			if (rt == NULL) {
1408 				ip1dbg(("add_mfc: out of memory\n"));
1409 				mutex_exit(&mfcbp->mfcb_lock);
1410 				MFCB_REFRELE(mfcbp);
1411 				return (ENOBUFS);
1412 			}
1413 
1414 			/* Insert new entry at head of hash chain */
1415 			mutex_enter(&rt->mfc_mutex);
1416 			fill_route(rt, mfccp, ipst);
1417 
1418 			/* Link into table */
1419 			rt->mfc_next   = mfcbp->mfcb_mfc;
1420 			mfcbp->mfcb_mfc = rt;
1421 			mutex_exit(&rt->mfc_mutex);
1422 		}
1423 		mutex_exit(&mfcbp->mfcb_lock);
1424 	}
1425 
1426 	MFCB_REFRELE(mfcbp);
1427 	return (0);
1428 }
1429 
1430 /*
1431  * Fills in mfc structure from mrouted mfcctl.
1432  */
1433 static void
1434 fill_route(struct mfc *rt, struct mfcctl *mfccp, ip_stack_t *ipst)
1435 {
1436 	int i;
1437 
1438 	rt->mfc_origin		= mfccp->mfcc_origin;
1439 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1440 	rt->mfc_parent		= mfccp->mfcc_parent;
1441 	mutex_enter(&ipst->ips_numvifs_mutex);
1442 	for (i = 0; i < (int)ipst->ips_numvifs; i++) {
1443 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1444 	}
1445 	mutex_exit(&ipst->ips_numvifs_mutex);
1446 	/* Initialize pkt counters per src-grp */
1447 	rt->mfc_pkt_cnt	= 0;
1448 	rt->mfc_byte_cnt	= 0;
1449 	rt->mfc_wrong_if	= 0;
1450 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1451 
1452 }
1453 
1454 static void
1455 free_queue(struct mfc *mfcp)
1456 {
1457 	struct rtdetq *rte0;
1458 
1459 	/*
1460 	 * Drop all queued upcall packets.
1461 	 * Free the mbuf with the pkt.
1462 	 */
1463 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1464 		mfcp->mfc_rte = rte0->rte_next;
1465 		freemsg(rte0->mp);
1466 		mi_free((char *)rte0);
1467 	}
1468 }
1469 /*
1470  * go thorugh the hash bucket and free all the entries marked condemned.
1471  */
1472 void
1473 release_mfc(struct mfcb *mfcbp)
1474 {
1475 	struct mfc *current_mfcp;
1476 	struct mfc *prev_mfcp;
1477 
1478 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1479 
1480 	while (current_mfcp != NULL) {
1481 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1482 			if (current_mfcp == mfcbp->mfcb_mfc) {
1483 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1484 				free_queue(current_mfcp);
1485 				mi_free(current_mfcp);
1486 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1487 				continue;
1488 			}
1489 			ASSERT(prev_mfcp != NULL);
1490 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1491 			free_queue(current_mfcp);
1492 			mi_free(current_mfcp);
1493 			current_mfcp = NULL;
1494 		} else {
1495 			prev_mfcp = current_mfcp;
1496 		}
1497 
1498 		current_mfcp = prev_mfcp->mfc_next;
1499 
1500 	}
1501 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1502 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1503 }
1504 
1505 /*
1506  * Delete an mfc entry.
1507  */
1508 static int
1509 del_mfc(struct mfcctl *mfccp, ip_stack_t *ipst)
1510 {
1511 	struct in_addr	origin;
1512 	struct in_addr	mcastgrp;
1513 	struct mfc 	*rt;
1514 	uint_t		hash;
1515 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1516 
1517 	origin = mfccp->mfcc_origin;
1518 	mcastgrp = mfccp->mfcc_mcastgrp;
1519 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1520 
1521 	if (ipst->ips_ip_mrtdebug > 1) {
1522 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1523 		    "del_mfc: o %x g %x",
1524 		    ntohl(origin.s_addr),
1525 		    ntohl(mcastgrp.s_addr));
1526 	}
1527 
1528 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
1529 
1530 	/* Find mfc in mfctable, finds only entries without upcalls */
1531 	for (rt = ipst->ips_mfcs[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1532 		mutex_enter(&rt->mfc_mutex);
1533 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1534 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1535 		    rt->mfc_rte == NULL &&
1536 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1537 			break;
1538 		mutex_exit(&rt->mfc_mutex);
1539 	}
1540 
1541 	/*
1542 	 * Return if there was an upcall (mfc_rte != NULL,
1543 	 * or rt not in mfctable.
1544 	 */
1545 	if (rt == NULL) {
1546 		MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1547 		return (EADDRNOTAVAIL);
1548 	}
1549 
1550 
1551 	/*
1552 	 * no need to hold lock as we have a reference.
1553 	 */
1554 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1555 	/* error checking */
1556 	if (rt->mfc_timeout_id != 0) {
1557 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1558 		/*
1559 		 * Its ok to drop the lock,  the struct cannot be freed
1560 		 * since we have a ref on the hash bucket.
1561 		 */
1562 		rt->mfc_timeout_id = 0;
1563 		mutex_exit(&rt->mfc_mutex);
1564 		(void) untimeout(rt->mfc_timeout_id);
1565 		mutex_enter(&rt->mfc_mutex);
1566 	}
1567 
1568 	ASSERT(rt->mfc_rte == NULL);
1569 
1570 
1571 	/*
1572 	 * Delete the entry from the cache
1573 	 */
1574 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1575 	mutex_exit(&rt->mfc_mutex);
1576 
1577 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
1578 
1579 	return (0);
1580 }
1581 
1582 #define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */
1583 
1584 /*
1585  * IP multicast forwarding function. This function assumes that the packet
1586  * pointed to by ipha has arrived on (or is about to be sent to) the interface
1587  * pointed to by "ill", and the packet is to be relayed to other networks
1588  * that have members of the packet's destination IP multicast group.
1589  *
1590  * The packet is returned unscathed to the caller, unless it is
1591  * erroneous, in which case a -1 value tells the caller (IP)
1592  * to discard it.
1593  *
1594  * Unlike BSD, SunOS 5.x needs to return to IP info about
1595  * whether pkt came in thru a tunnel, so it can be discarded, unless
1596  * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
1597  * to be delivered.
1598  * Return values are 0 - pkt is okay and phyint
1599  *		    -1 - pkt is malformed and to be tossed
1600  *                   1 - pkt came in on tunnel
1601  */
1602 int
1603 ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
1604 {
1605 	struct mfc 	*rt;
1606 	ipaddr_t	src, dst, tunnel_src = 0;
1607 	static int	srctun = 0;
1608 	vifi_t		vifi;
1609 	boolean_t	pim_reg_packet = B_FALSE;
1610 	struct mfcb *mfcbp;
1611 	ip_stack_t	*ipst = ill->ill_ipst;
1612 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
1613 
1614 	if (ipst->ips_ip_mrtdebug > 1) {
1615 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1616 		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
1617 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
1618 		    ill->ill_name);
1619 	}
1620 
1621 	dst = ipha->ipha_dst;
1622 	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
1623 		pim_reg_packet = B_TRUE;
1624 	else
1625 		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;
1626 
1627 	/*
1628 	 * Don't forward a packet with time-to-live of zero or one,
1629 	 * or a packet destined to a local-only group.
1630 	 */
1631 	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
1632 	    (ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
1633 		if (ipst->ips_ip_mrtdebug > 1) {
1634 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1635 			    "ip_mforward: not forwarded ttl %d,"
1636 			    " dst 0x%x ill %s",
1637 			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
1638 		}
1639 		mp->b_prev = NULL;
1640 		if (tunnel_src != 0)
1641 			return (1);
1642 		else
1643 			return (0);
1644 	}
1645 
1646 	if ((tunnel_src != 0) || pim_reg_packet) {
1647 		/*
1648 		 * Packet arrived over an encapsulated tunnel or via a PIM
1649 		 * register message. Both ip_mroute_decap() and pim_input()
1650 		 * encode information in mp->b_prev.
1651 		 */
1652 		mp->b_prev = NULL;
1653 		if (ipst->ips_ip_mrtdebug > 1) {
1654 			if (tunnel_src != 0) {
1655 				(void) mi_strlog(mrouter->conn_rq, 1,
1656 				    SL_TRACE,
1657 				    "ip_mforward: ill %s arrived via ENCAP TUN",
1658 				    ill->ill_name);
1659 			} else if (pim_reg_packet) {
1660 				(void) mi_strlog(mrouter->conn_rq, 1,
1661 				    SL_TRACE,
1662 				    "ip_mforward: ill %s arrived via"
1663 				    "  REGISTER VIF",
1664 				    ill->ill_name);
1665 			}
1666 		}
1667 	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
1668 	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
1669 	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
1670 		/* Packet arrived via a physical interface. */
1671 		if (ipst->ips_ip_mrtdebug > 1) {
1672 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1673 			    "ip_mforward: ill %s arrived via PHYINT",
1674 			    ill->ill_name);
1675 		}
1676 
1677 	} else {
1678 		/*
1679 		 * Packet arrived through a SRCRT tunnel.
1680 		 * Source-route tunnels are no longer supported.
1681 		 * Error message printed every 1000 times.
1682 		 */
1683 		if ((srctun++ % 1000) == 0) {
1684 			cmn_err(CE_WARN,
1685 			    "ip_mforward: received source-routed pkt from %x",
1686 			    ntohl(ipha->ipha_src));
1687 		}
1688 		return (-1);
1689 	}
1690 
1691 	ipst->ips_mrtstat->mrts_fwd_in++;
1692 	src = ipha->ipha_src;
1693 
1694 	/* Find route in cache, return NULL if not there or upcalls q'ed. */
1695 
1696 	/*
1697 	 * Lock the mfctable against changes made by ip_mforward.
1698 	 * Note that only add_mfc and del_mfc can remove entries and
1699 	 * they run with exclusive access to IP. So we do not need to
1700 	 * guard against the rt being deleted, so release lock after reading.
1701 	 */
1702 
1703 	if (is_mrouter_off(ipst))
1704 		return (-1);
1705 
1706 	mfcbp = &ipst->ips_mfcs[MFCHASH(src, dst)];
1707 	MFCB_REFHOLD(mfcbp);
1708 	MFCFIND(mfcbp, src, dst, rt);
1709 
1710 	/* Entry exists, so forward if necessary */
1711 	if (rt != NULL) {
1712 		int ret = 0;
1713 		ipst->ips_mrtstat->mrts_mfc_hits++;
1714 		if (pim_reg_packet) {
1715 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1716 			ret = ip_mdq(mp, ipha,
1717 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1718 			    v_ipif->ipif_ill,
1719 			    0, rt);
1720 		} else {
1721 			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
1722 		}
1723 
1724 		MFCB_REFRELE(mfcbp);
1725 		return (ret);
1726 
1727 		/*
1728 		 * Don't forward if we don't have a cache entry.  Mrouted will
1729 		 * always provide a cache entry in response to an upcall.
1730 		 */
1731 	} else {
1732 		/*
1733 		 * If we don't have a route for packet's origin, make a copy
1734 		 * of the packet and send message to routing daemon.
1735 		 */
1736 		struct mfc	*mfc_rt	 = NULL;
1737 		mblk_t		*mp0	 = NULL;
1738 		mblk_t		*mp_copy = NULL;
1739 		struct rtdetq	*rte	 = NULL;
1740 		struct rtdetq	*rte_m, *rte1, *prev_rte;
1741 		uint_t		hash;
1742 		int		npkts;
1743 		boolean_t	new_mfc = B_FALSE;
1744 		ipst->ips_mrtstat->mrts_mfc_misses++;
1745 		/* BSD uses mrts_no_route++ */
1746 		if (ipst->ips_ip_mrtdebug > 1) {
1747 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1748 			    "ip_mforward: no rte ill %s src %x g %x misses %d",
1749 			    ill->ill_name, ntohl(src), ntohl(dst),
1750 			    (int)ipst->ips_mrtstat->mrts_mfc_misses);
1751 		}
1752 		/*
1753 		 * The order of the following code differs from the BSD code.
1754 		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
1755 		 * code works, so SunOS 5.x wasn't changed to conform to the
1756 		 * BSD version.
1757 		 */
1758 
1759 		/* Lock mfctable. */
1760 		hash = MFCHASH(src, dst);
1761 		mutex_enter(&(ipst->ips_mfcs[hash].mfcb_lock));
1762 
1763 		/*
1764 		 * If we are turning off mrouted return an error
1765 		 */
1766 		if (is_mrouter_off(ipst)) {
1767 			mutex_exit(&mfcbp->mfcb_lock);
1768 			MFCB_REFRELE(mfcbp);
1769 			return (-1);
1770 		}
1771 
1772 		/* Is there an upcall waiting for this packet? */
1773 		for (mfc_rt = ipst->ips_mfcs[hash].mfcb_mfc; mfc_rt;
1774 		    mfc_rt = mfc_rt->mfc_next) {
1775 			mutex_enter(&mfc_rt->mfc_mutex);
1776 			if (ipst->ips_ip_mrtdebug > 1) {
1777 				(void) mi_strlog(mrouter->conn_rq, 1,
1778 				    SL_TRACE,
1779 				    "ip_mforward: MFCTAB hash %d o 0x%x"
1780 				    " g 0x%x\n",
1781 				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1782 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1783 			}
1784 			/* There is an upcall */
1785 			if ((src == mfc_rt->mfc_origin.s_addr) &&
1786 			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
1787 			    (mfc_rt->mfc_rte != NULL) &&
1788 			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
1789 				break;
1790 			}
1791 			mutex_exit(&mfc_rt->mfc_mutex);
1792 		}
1793 		/* No upcall, so make a new entry into mfctable */
1794 		if (mfc_rt == NULL) {
1795 			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
1796 			if (mfc_rt == NULL) {
1797 				ipst->ips_mrtstat->mrts_fwd_drop++;
1798 				ip1dbg(("ip_mforward: out of memory "
1799 				    "for mfc, mfc_rt\n"));
1800 				goto error_return;
1801 			} else
1802 				new_mfc = B_TRUE;
1803 			/* Get resources */
1804 			/* TODO could copy header and dup rest */
1805 			mp_copy = copymsg(mp);
1806 			if (mp_copy == NULL) {
1807 				ipst->ips_mrtstat->mrts_fwd_drop++;
1808 				ip1dbg(("ip_mforward: out of memory for "
1809 				    "mblk, mp_copy\n"));
1810 				goto error_return;
1811 			}
1812 			mutex_enter(&mfc_rt->mfc_mutex);
1813 		}
1814 		/* Get resources for rte, whether first rte or not first. */
1815 		/* Add this packet into rtdetq */
1816 		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
1817 		if (rte == NULL) {
1818 			ipst->ips_mrtstat->mrts_fwd_drop++;
1819 			mutex_exit(&mfc_rt->mfc_mutex);
1820 			ip1dbg(("ip_mforward: out of memory for"
1821 			    " rtdetq, rte\n"));
1822 			goto error_return;
1823 		}
1824 
1825 		mp0 = copymsg(mp);
1826 		if (mp0 == NULL) {
1827 			ipst->ips_mrtstat->mrts_fwd_drop++;
1828 			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
1829 			mutex_exit(&mfc_rt->mfc_mutex);
1830 			goto error_return;
1831 		}
1832 		rte->mp		= mp0;
1833 		if (pim_reg_packet) {
1834 			ASSERT(ipst->ips_reg_vif_num != ALL_VIFS);
1835 			rte->ill =
1836 			    ipst->ips_vifs[ipst->ips_reg_vif_num].
1837 			    v_ipif->ipif_ill;
1838 		} else {
1839 			rte->ill = ill;
1840 		}
1841 		rte->rte_next	= NULL;
1842 
1843 		/*
1844 		 * Determine if upcall q (rtdetq) has overflowed.
1845 		 * mfc_rt->mfc_rte is null by mi_zalloc
1846 		 * if it is the first message.
1847 		 */
1848 		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
1849 		    rte_m = rte_m->rte_next)
1850 			npkts++;
1851 		if (ipst->ips_ip_mrtdebug > 1) {
1852 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1853 			    "ip_mforward: upcalls %d\n", npkts);
1854 		}
1855 		if (npkts > MAX_UPQ) {
1856 			ipst->ips_mrtstat->mrts_upq_ovflw++;
1857 			mutex_exit(&mfc_rt->mfc_mutex);
1858 			goto error_return;
1859 		}
1860 
1861 		if (npkts == 0) {	/* first upcall */
1862 			int i = 0;
1863 			/*
1864 			 * Now finish installing the new mfc! Now that we have
1865 			 * resources!  Insert new entry at head of hash chain.
1866 			 * Use src and dst which are ipaddr_t's.
1867 			 */
1868 			mfc_rt->mfc_origin.s_addr = src;
1869 			mfc_rt->mfc_mcastgrp.s_addr = dst;
1870 
1871 			mutex_enter(&ipst->ips_numvifs_mutex);
1872 			for (i = 0; i < (int)ipst->ips_numvifs; i++)
1873 				mfc_rt->mfc_ttls[i] = 0;
1874 			mutex_exit(&ipst->ips_numvifs_mutex);
1875 			mfc_rt->mfc_parent = ALL_VIFS;
1876 
1877 			/* Link into table */
1878 			if (ipst->ips_ip_mrtdebug > 1) {
1879 				(void) mi_strlog(mrouter->conn_rq, 1,
1880 				    SL_TRACE,
1881 				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
1882 				    "g 0x%x\n", hash,
1883 				    ntohl(mfc_rt->mfc_origin.s_addr),
1884 				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1885 			}
1886 			mfc_rt->mfc_next = ipst->ips_mfcs[hash].mfcb_mfc;
1887 			ipst->ips_mfcs[hash].mfcb_mfc = mfc_rt;
1888 			mfc_rt->mfc_rte = NULL;
1889 		}
1890 
1891 		/* Link in the upcall */
1892 		/* First upcall */
1893 		if (mfc_rt->mfc_rte == NULL)
1894 			mfc_rt->mfc_rte = rte;
1895 		else {
1896 			/* not the first upcall */
1897 			prev_rte = mfc_rt->mfc_rte;
1898 			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
1899 			    prev_rte = rte1, rte1 = rte1->rte_next)
1900 				;
1901 			prev_rte->rte_next = rte;
1902 		}
1903 
1904 		/*
1905 		 * No upcalls waiting, this is first one, so send a message to
1906 		 * routing daemon to install a route into kernel table.
1907 		 */
1908 		if (npkts == 0) {
1909 			struct igmpmsg	*im;
1910 			/* ipha_protocol is 0, for upcall */
1911 			ASSERT(mp_copy != NULL);
1912 			im = (struct igmpmsg *)mp_copy->b_rptr;
1913 			im->im_msgtype	= IGMPMSG_NOCACHE;
1914 			im->im_mbz = 0;
1915 			mutex_enter(&ipst->ips_numvifs_mutex);
1916 			if (pim_reg_packet) {
1917 				im->im_vif = (uchar_t)ipst->ips_reg_vif_num;
1918 				mutex_exit(&ipst->ips_numvifs_mutex);
1919 			} else {
1920 				/*
1921 				 * XXX do we need to hold locks here ?
1922 				 */
1923 				for (vifi = 0;
1924 				    vifi < ipst->ips_numvifs;
1925 				    vifi++) {
1926 					if (ipst->ips_vifs[vifi].v_ipif == NULL)
1927 						continue;
1928 					if (ipst->ips_vifs[vifi].
1929 					    v_ipif->ipif_ill == ill) {
1930 						im->im_vif = (uchar_t)vifi;
1931 						break;
1932 					}
1933 				}
1934 				mutex_exit(&ipst->ips_numvifs_mutex);
1935 				ASSERT(vifi < ipst->ips_numvifs);
1936 			}
1937 
1938 			ipst->ips_mrtstat->mrts_upcalls++;
1939 			/* Timer to discard upcalls if mrouted is too slow */
1940 			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
1941 			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
1942 			mutex_exit(&mfc_rt->mfc_mutex);
1943 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1944 			/* Pass to RAWIP */
1945 			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
1946 		} else {
1947 			mutex_exit(&mfc_rt->mfc_mutex);
1948 			mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1949 			freemsg(mp_copy);
1950 		}
1951 
1952 		MFCB_REFRELE(mfcbp);
1953 		if (tunnel_src != 0)
1954 			return (1);
1955 		else
1956 			return (0);
1957 	error_return:
1958 		mutex_exit(&(ipst->ips_mfcs[hash].mfcb_lock));
1959 		MFCB_REFRELE(mfcbp);
1960 		if (mfc_rt != NULL && (new_mfc == B_TRUE))
1961 			mi_free((char *)mfc_rt);
1962 		if (rte != NULL)
1963 			mi_free((char *)rte);
1964 		if (mp_copy != NULL)
1965 			freemsg(mp_copy);
1966 		if (mp0 != NULL)
1967 			freemsg(mp0);
1968 		return (-1);
1969 	}
1970 }
1971 
1972 /*
1973  * Clean up the mfctable cache entry if upcall is not serviced.
1974  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1975  */
1976 static void
1977 expire_upcalls(void *arg)
1978 {
1979 	struct mfc *mfc_rt = arg;
1980 	uint_t hash;
1981 	struct mfc *prev_mfc, *mfc0;
1982 	ip_stack_t	*ipst;
1983 	conn_t		*mrouter;
1984 
1985 	if (mfc_rt->mfc_rte == NULL || mfc_rt->mfc_rte->ill != NULL) {
1986 		cmn_err(CE_WARN, "expire_upcalls: no ILL\n");
1987 		return;
1988 	}
1989 	ipst = mfc_rt->mfc_rte->ill->ill_ipst;
1990 	mrouter = ipst->ips_ip_g_mrouter;
1991 
1992 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1993 	if (ipst->ips_ip_mrtdebug > 1) {
1994 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
1995 		    "expire_upcalls: hash %d s %x g %x",
1996 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1997 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1998 	}
1999 	MFCB_REFHOLD(&ipst->ips_mfcs[hash]);
2000 	mutex_enter(&mfc_rt->mfc_mutex);
2001 	/*
2002 	 * if timeout has been set to zero, than the
2003 	 * entry has been filled, no need to delete it.
2004 	 */
2005 	if (mfc_rt->mfc_timeout_id == 0)
2006 		goto done;
2007 	ipst->ips_mrtstat->mrts_cache_cleanups++;
2008 	mfc_rt->mfc_timeout_id = 0;
2009 
2010 	/* Determine entry to be cleaned up in cache table. */
2011 	for (prev_mfc = mfc0 = ipst->ips_mfcs[hash].mfcb_mfc; mfc0;
2012 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
2013 		if (mfc0 == mfc_rt)
2014 			break;
2015 
2016 	/* del_mfc takes care of gone mfcs */
2017 	ASSERT(prev_mfc != NULL);
2018 	ASSERT(mfc0 != NULL);
2019 
2020 	/*
2021 	 * Delete the entry from the cache
2022 	 */
2023 	ipst->ips_mfcs[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
2024 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
2025 
2026 	/*
2027 	 * release_mfc will drop all queued upcall packets.
2028 	 * and will free the mbuf with the pkt, if, timing info.
2029 	 */
2030 done:
2031 	mutex_exit(&mfc_rt->mfc_mutex);
2032 	MFCB_REFRELE(&ipst->ips_mfcs[hash]);
2033 }
2034 
2035 /*
2036  * Packet forwarding routine once entry in the cache is made.
2037  */
2038 static int
2039 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
2040     struct mfc *rt)
2041 {
2042 	vifi_t vifi;
2043 	struct vif *vifp;
2044 	ipaddr_t dst = ipha->ipha_dst;
2045 	size_t  plen = msgdsize(mp);
2046 	vifi_t num_of_vifs;
2047 	ip_stack_t	*ipst = ill->ill_ipst;
2048 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2049 
2050 	if (ipst->ips_ip_mrtdebug > 1) {
2051 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2052 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
2053 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
2054 		    ill->ill_name);
2055 	}
2056 
2057 	/* Macro to send packet on vif */
2058 #define	MC_SEND(ipha, mp, vifp, dst) { \
2059 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2060 		encap_send((ipha), (mp), (vifp), (dst)); \
2061 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2062 		register_send((ipha), (mp), (vifp), (dst)); \
2063 	else \
2064 		phyint_send((ipha), (mp), (vifp), (dst)); \
2065 }
2066 
2067 	vifi = rt->mfc_parent;
2068 
2069 	/*
2070 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2071 	 * Mrouted had no route.
2072 	 * We wanted the route installed in the mfctable to prevent multiple
2073 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2074 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2075 	 * 3.6.
2076 	 */
2077 	if (vifi == NO_VIF) {
2078 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2079 		    ill->ill_name));
2080 		if (ipst->ips_ip_mrtdebug > 1) {
2081 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2082 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2083 		}
2084 		return (-1);	/* drop pkt */
2085 	}
2086 
2087 	if (!lock_good_vif(&ipst->ips_vifs[vifi]))
2088 		return (-1);
2089 	/*
2090 	 * The MFC entries are not cleaned up when an ipif goes
2091 	 * away thus this code has to guard against an MFC referencing
2092 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2093 	 * sets the v_ipif to NULL when the ipif disappears.
2094 	 */
2095 	ASSERT(ipst->ips_vifs[vifi].v_ipif != NULL);
2096 
2097 	if (vifi >= ipst->ips_numvifs) {
2098 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2099 		    "%d ill %s viftable ill %s\n",
2100 		    (int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2101 		    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2102 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2103 		return (-1);
2104 	}
2105 	/*
2106 	 * Don't forward if it didn't arrive from the parent vif for its
2107 	 * origin. But do match on the groups as we nominate only one
2108 	 * ill in the group for receiving allmulti packets.
2109 	 */
2110 	if ((ipst->ips_vifs[vifi].v_ipif->ipif_ill != ill &&
2111 	    (ill->ill_group == NULL ||
2112 	    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_group !=
2113 		ill->ill_group)) ||
2114 	    (ipst->ips_vifs[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2115 		/* Came in the wrong interface */
2116 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2117 			"numvifs %d ill %s viftable ill %s\n",
2118 			(int)vifi, (int)ipst->ips_numvifs, ill->ill_name,
2119 			ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name));
2120 		if (ipst->ips_ip_mrtdebug > 1) {
2121 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2122 			    "ip_mdq: arrived wrong if, vifi %d ill "
2123 			    "%s viftable ill %s\n",
2124 			    (int)vifi, ill->ill_name,
2125 			    ipst->ips_vifs[vifi].v_ipif->ipif_ill->ill_name);
2126 		}
2127 		ipst->ips_mrtstat->mrts_wrong_if++;
2128 		rt->mfc_wrong_if++;
2129 
2130 		/*
2131 		 * If we are doing PIM assert processing and we are forwarding
2132 		 * packets on this interface, and it is a broadcast medium
2133 		 * interface (and not a tunnel), send a message to the routing.
2134 		 *
2135 		 * We use the first ipif on the list, since it's all we have.
2136 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2137 		 */
2138 		if (ipst->ips_pim_assert && rt->mfc_ttls[vifi] > 0 &&
2139 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2140 		    !(ipst->ips_vifs[vifi].v_flags & VIFF_TUNNEL)) {
2141 			mblk_t		*mp_copy;
2142 			struct igmpmsg	*im;
2143 
2144 			/* TODO could copy header and dup rest */
2145 			mp_copy = copymsg(mp);
2146 			if (mp_copy == NULL) {
2147 				ipst->ips_mrtstat->mrts_fwd_drop++;
2148 				ip1dbg(("ip_mdq: out of memory "
2149 				    "for mblk, mp_copy\n"));
2150 				unlock_good_vif(&ipst->ips_vifs[vifi]);
2151 				return (-1);
2152 			}
2153 
2154 			im = (struct igmpmsg *)mp_copy->b_rptr;
2155 			im->im_msgtype = IGMPMSG_WRONGVIF;
2156 			im->im_mbz = 0;
2157 			im->im_vif = (ushort_t)vifi;
2158 			/* Pass to RAWIP */
2159 			(mrouter->conn_recv)(mrouter, mp_copy, NULL);
2160 		}
2161 		unlock_good_vif(&ipst->ips_vifs[vifi]);
2162 		if (tunnel_src != 0)
2163 			return (1);
2164 		else
2165 			return (0);
2166 	}
2167 	/*
2168 	 * If I sourced this packet, it counts as output, else it was input.
2169 	 */
2170 	if (ipha->ipha_src == ipst->ips_vifs[vifi].v_lcl_addr.s_addr) {
2171 		ipst->ips_vifs[vifi].v_pkt_out++;
2172 		ipst->ips_vifs[vifi].v_bytes_out += plen;
2173 	} else {
2174 		ipst->ips_vifs[vifi].v_pkt_in++;
2175 		ipst->ips_vifs[vifi].v_bytes_in += plen;
2176 	}
2177 	mutex_enter(&rt->mfc_mutex);
2178 	rt->mfc_pkt_cnt++;
2179 	rt->mfc_byte_cnt += plen;
2180 	mutex_exit(&rt->mfc_mutex);
2181 	unlock_good_vif(&ipst->ips_vifs[vifi]);
2182 	/*
2183 	 * For each vif, decide if a copy of the packet should be forwarded.
2184 	 * Forward if:
2185 	 *		- the vif threshold ttl is non-zero AND
2186 	 *		- the pkt ttl exceeds the vif's threshold
2187 	 * A non-zero mfc_ttl indicates that the vif is part of
2188 	 * the output set for the mfc entry.
2189 	 */
2190 	mutex_enter(&ipst->ips_numvifs_mutex);
2191 	num_of_vifs = ipst->ips_numvifs;
2192 	mutex_exit(&ipst->ips_numvifs_mutex);
2193 	for (vifp = ipst->ips_vifs, vifi = 0;
2194 	    vifi < num_of_vifs;
2195 	    vifp++, vifi++) {
2196 		if (!lock_good_vif(vifp))
2197 			continue;
2198 		if ((rt->mfc_ttls[vifi] > 0) &&
2199 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2200 			/*
2201 			 * lock_good_vif should not have succedded if
2202 			 * v_ipif is null.
2203 			 */
2204 			ASSERT(vifp->v_ipif != NULL);
2205 			vifp->v_pkt_out++;
2206 			vifp->v_bytes_out += plen;
2207 			MC_SEND(ipha, mp, vifp, dst);
2208 			ipst->ips_mrtstat->mrts_fwd_out++;
2209 		}
2210 		unlock_good_vif(vifp);
2211 	}
2212 	if (tunnel_src != 0)
2213 		return (1);
2214 	else
2215 		return (0);
2216 }
2217 
2218 /*
2219  * Send the packet on physical interface.
2220  * Caller assumes can continue to use mp on return.
2221  */
2222 /* ARGSUSED */
2223 static void
2224 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2225 {
2226 	mblk_t 	*mp_copy;
2227 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2228 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2229 
2230 	/* Make a new reference to the packet */
2231 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2232 	if (mp_copy == NULL) {
2233 		ipst->ips_mrtstat->mrts_fwd_drop++;
2234 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2235 		return;
2236 	}
2237 	if (vifp->v_rate_limit <= 0)
2238 		tbf_send_packet(vifp, mp_copy);
2239 	else  {
2240 		if (ipst->ips_ip_mrtdebug > 1) {
2241 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2242 			    "phyint_send: tbf_contr rate %d "
2243 			    "vifp 0x%p mp 0x%p dst 0x%x",
2244 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2245 		}
2246 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2247 	}
2248 }
2249 
2250 /*
2251  * Send the whole packet for REGISTER encapsulation to PIM daemon
2252  * Caller assumes it can continue to use mp on return.
2253  */
2254 /* ARGSUSED */
2255 static void
2256 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2257 {
2258 	struct igmpmsg	*im;
2259 	mblk_t		*mp_copy;
2260 	ipha_t		*ipha_copy;
2261 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2262 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2263 
2264 	if (ipst->ips_ip_mrtdebug > 1) {
2265 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2266 		    "register_send: src %x, dst %x\n",
2267 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2268 	}
2269 
2270 	/*
2271 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2272 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2273 	 * ethernet driver will.
2274 	 */
2275 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2276 	if (mp_copy == NULL) {
2277 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2278 		if (ipst->ips_ip_mrtdebug > 3) {
2279 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2280 			    "register_send: allocb failure.");
2281 		}
2282 		return;
2283 	}
2284 
2285 	/*
2286 	 * Bump write pointer to account for igmpmsg being added.
2287 	 */
2288 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2289 
2290 	/*
2291 	 * Chain packet to new mblk_t.
2292 	 */
2293 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2294 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2295 		if (ipst->ips_ip_mrtdebug > 3) {
2296 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2297 			    "register_send: copymsg failure.");
2298 		}
2299 		freeb(mp_copy);
2300 		return;
2301 	}
2302 
2303 	/*
2304 	 * icmp_input() asserts that IP version field is set to an
2305 	 * appropriate version. Hence, the struct igmpmsg that this really
2306 	 * becomes, needs to have the correct IP version field.
2307 	 */
2308 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2309 	*ipha_copy = multicast_encap_iphdr;
2310 
2311 	/*
2312 	 * The kernel uses the struct igmpmsg header to encode the messages to
2313 	 * the multicast routing daemon. Fill in the fields in the header
2314 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2315 	 */
2316 	im = (struct igmpmsg *)mp_copy->b_rptr;
2317 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2318 	im->im_src.s_addr = ipha->ipha_src;
2319 	im->im_dst.s_addr = ipha->ipha_dst;
2320 
2321 	/*
2322 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2323 	 * header with renamed fields and the multicast routing daemon uses
2324 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2325 	 */
2326 	im->im_mbz = 0;
2327 
2328 	++ipst->ips_mrtstat->mrts_upcalls;
2329 	if (!canputnext(mrouter->conn_rq)) {
2330 		++ipst->ips_mrtstat->mrts_pim_regsend_drops;
2331 		if (ipst->ips_ip_mrtdebug > 3) {
2332 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2333 			    "register_send: register upcall failure.");
2334 		}
2335 		freemsg(mp_copy);
2336 	} else {
2337 		/* Pass to RAWIP */
2338 		(mrouter->conn_recv)(mrouter, mp_copy, NULL);
2339 	}
2340 }
2341 
2342 /*
2343  * pim_validate_cksum handles verification of the checksum in the
2344  * pim header.  For PIM Register packets, the checksum is calculated
2345  * across the PIM header only.  For all other packets, the checksum
2346  * is for the PIM header and remainder of the packet.
2347  *
2348  * returns: B_TRUE, if checksum is okay.
2349  *          B_FALSE, if checksum is not valid.
2350  */
2351 static boolean_t
2352 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2353 {
2354 	mblk_t *mp_dup;
2355 
2356 	if ((mp_dup = dupmsg(mp)) == NULL)
2357 		return (B_FALSE);
2358 
2359 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2360 	if (pimp->pim_type == PIM_REGISTER)
2361 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2362 	if (IP_CSUM(mp_dup, 0, 0)) {
2363 		freemsg(mp_dup);
2364 		return (B_FALSE);
2365 	}
2366 	freemsg(mp_dup);
2367 	return (B_TRUE);
2368 }
2369 
2370 /*
2371  * int
2372  * pim_input(queue_t *, mblk_t *, ill_t *ill) - Process PIM protocol packets.
2373  *	IP Protocol 103. Register messages are decapsulated and sent
2374  *	onto multicast forwarding.
2375  */
2376 int
2377 pim_input(queue_t *q, mblk_t *mp, ill_t *ill)
2378 {
2379 	ipha_t		*eip, *ip;
2380 	int		iplen, pimlen, iphlen;
2381 	struct pim	*pimp;	/* pointer to a pim struct */
2382 	uint32_t	*reghdr;
2383 	ip_stack_t	*ipst = ill->ill_ipst;
2384 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2385 
2386 	/*
2387 	 * Pullup the msg for PIM protocol processing.
2388 	 */
2389 	if (pullupmsg(mp, -1) == 0) {
2390 		++ipst->ips_mrtstat->mrts_pim_nomemory;
2391 		freemsg(mp);
2392 		return (-1);
2393 	}
2394 
2395 	ip = (ipha_t *)mp->b_rptr;
2396 	iplen = ip->ipha_length;
2397 	iphlen = IPH_HDR_LENGTH(ip);
2398 	pimlen = ntohs(iplen) - iphlen;
2399 
2400 	/*
2401 	 * Validate lengths
2402 	 */
2403 	if (pimlen < PIM_MINLEN) {
2404 		++ipst->ips_mrtstat->mrts_pim_malformed;
2405 		if (ipst->ips_ip_mrtdebug > 1) {
2406 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2407 			    "pim_input: length not at least minlen");
2408 		}
2409 		freemsg(mp);
2410 		return (-1);
2411 	}
2412 
2413 	/*
2414 	 * Point to the PIM header.
2415 	 */
2416 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2417 
2418 	/*
2419 	 * Check the version number.
2420 	 */
2421 	if (pimp->pim_vers != PIM_VERSION) {
2422 		++ipst->ips_mrtstat->mrts_pim_badversion;
2423 		if (ipst->ips_ip_mrtdebug > 1) {
2424 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2425 			    "pim_input: unknown version of PIM");
2426 		}
2427 		freemsg(mp);
2428 		return (-1);
2429 	}
2430 
2431 	/*
2432 	 * Validate the checksum
2433 	 */
2434 	if (!pim_validate_cksum(mp, ip, pimp)) {
2435 		++ipst->ips_mrtstat->mrts_pim_rcv_badcsum;
2436 		if (ipst->ips_ip_mrtdebug > 1) {
2437 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2438 			    "pim_input: invalid checksum");
2439 		}
2440 		freemsg(mp);
2441 		return (-1);
2442 	}
2443 
2444 	if (pimp->pim_type != PIM_REGISTER)
2445 		return (0);
2446 
2447 	reghdr = (uint32_t *)(pimp + 1);
2448 	eip = (ipha_t *)(reghdr + 1);
2449 
2450 	/*
2451 	 * check if the inner packet is destined to mcast group
2452 	 */
2453 	if (!CLASSD(eip->ipha_dst)) {
2454 		++ipst->ips_mrtstat->mrts_pim_badregisters;
2455 		if (ipst->ips_ip_mrtdebug > 1) {
2456 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2457 			    "pim_input: Inner pkt not mcast .. !");
2458 		}
2459 		freemsg(mp);
2460 		return (-1);
2461 	}
2462 	if (ipst->ips_ip_mrtdebug > 1) {
2463 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2464 		    "register from %x, to %x, len %d",
2465 		    ntohl(eip->ipha_src),
2466 		    ntohl(eip->ipha_dst),
2467 		    ntohs(eip->ipha_length));
2468 	}
2469 	/*
2470 	 * If the null register bit is not set, decapsulate
2471 	 * the packet before forwarding it.
2472 	 */
2473 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
2474 		mblk_t *mp_copy;
2475 
2476 		/* Copy the message */
2477 		if ((mp_copy = copymsg(mp)) == NULL) {
2478 			++ipst->ips_mrtstat->mrts_pim_nomemory;
2479 			freemsg(mp);
2480 			return (-1);
2481 		}
2482 
2483 		/*
2484 		 * Decapsulate the packet and give it to
2485 		 * register_mforward.
2486 		 */
2487 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
2488 		    sizeof (*reghdr);
2489 		if (register_mforward(q, mp_copy, ill) != 0) {
2490 			freemsg(mp);
2491 			return (-1);
2492 		}
2493 	}
2494 
2495 	/*
2496 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2497 	 * PIM socket. For Solaris it is done right after pim_input() is
2498 	 * called.
2499 	 */
2500 	return (0);
2501 }
2502 
2503 /*
2504  * PIM sparse mode hook.  Called by pim_input after decapsulating
2505  * the packet. Loop back the packet, as if we have received it.
2506  * In pim_input() we have to check if the destination is a multicast address.
2507  */
2508 /* ARGSUSED */
2509 static int
2510 register_mforward(queue_t *q, mblk_t *mp, ill_t *ill)
2511 {
2512 	ip_stack_t	*ipst = ill->ill_ipst;
2513 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2514 
2515 	ASSERT(ipst->ips_reg_vif_num <= ipst->ips_numvifs);
2516 
2517 	if (ipst->ips_ip_mrtdebug > 3) {
2518 		ipha_t *ipha;
2519 
2520 		ipha = (ipha_t *)mp->b_rptr;
2521 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2522 		    "register_mforward: src %x, dst %x\n",
2523 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2524 	}
2525 	/*
2526 	 * Need to pass in to ip_mforward() the information that the
2527 	 * packet has arrived on the register_vif. We use the solution that
2528 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2529 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2530 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2531 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2532 	 * or passes it to ip_mforward(). We use
2533 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2534 	 * register vif. If in the future we have more than one register vifs,
2535 	 * then this will need re-examination.
2536 	 */
2537 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2538 	++ipst->ips_mrtstat->mrts_pim_regforwards;
2539 	ip_rput(q, mp);
2540 	return (0);
2541 }
2542 
2543 /*
2544  * Send an encapsulated packet.
2545  * Caller assumes can continue to use mp when routine returns.
2546  */
2547 /* ARGSUSED */
2548 static void
2549 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2550 {
2551 	mblk_t 	*mp_copy;
2552 	ipha_t 	*ipha_copy;
2553 	size_t	len;
2554 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2555 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2556 
2557 	if (ipst->ips_ip_mrtdebug > 1) {
2558 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2559 		    "encap_send: vif %ld enter",
2560 		    (ptrdiff_t)(vifp - ipst->ips_vifs));
2561 	}
2562 	len = ntohs(ipha->ipha_length);
2563 
2564 	/*
2565 	 * Copy the old packet & pullup it's IP header into the
2566 	 * new mbuf so we can modify it.  Try to fill the new
2567 	 * mbuf since if we don't the ethernet driver will.
2568 	 */
2569 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2570 	if (mp_copy == NULL)
2571 		return;
2572 	mp_copy->b_rptr += 32;
2573 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2574 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2575 		freeb(mp_copy);
2576 		return;
2577 	}
2578 
2579 	/*
2580 	 * Fill in the encapsulating IP header.
2581 	 * Remote tunnel dst in rmt_addr, from add_vif().
2582 	 */
2583 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2584 	*ipha_copy = multicast_encap_iphdr;
2585 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2586 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2587 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2588 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2589 	ASSERT(ipha_copy->ipha_ident == 0);
2590 
2591 	/* Turn the encapsulated IP header back into a valid one. */
2592 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2593 	ipha->ipha_ttl--;
2594 	ipha->ipha_hdr_checksum = 0;
2595 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2596 
2597 	if (ipst->ips_ip_mrtdebug > 1) {
2598 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2599 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2600 	}
2601 	if (vifp->v_rate_limit <= 0)
2602 		tbf_send_packet(vifp, mp_copy);
2603 	else
2604 		/* ipha is from the original header */
2605 		tbf_control(vifp, mp_copy, ipha);
2606 }
2607 
2608 /*
2609  * De-encapsulate a packet and feed it back through IP input.
2610  * This routine is called whenever IP gets a packet with prototype
2611  * IPPROTO_ENCAP and a local destination address.
2612  */
2613 void
2614 ip_mroute_decap(queue_t *q, mblk_t *mp, ill_t *ill)
2615 {
2616 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2617 	ipha_t		*ipha_encap;
2618 	int		hlen = IPH_HDR_LENGTH(ipha);
2619 	ipaddr_t	src;
2620 	struct vif	*vifp;
2621 	ip_stack_t	*ipst = ill->ill_ipst;
2622 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2623 
2624 	/*
2625 	 * Dump the packet if it's not to a multicast destination or if
2626 	 * we don't have an encapsulating tunnel with the source.
2627 	 * Note:  This code assumes that the remote site IP address
2628 	 * uniquely identifies the tunnel (i.e., that this site has
2629 	 * at most one tunnel with the remote site).
2630 	 */
2631 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2632 	if (!CLASSD(ipha_encap->ipha_dst)) {
2633 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2634 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2635 		freemsg(mp);
2636 		return;
2637 	}
2638 	src = (ipaddr_t)ipha->ipha_src;
2639 	mutex_enter(&ipst->ips_last_encap_lock);
2640 	if (src != ipst->ips_last_encap_src) {
2641 		struct vif *vife;
2642 
2643 		vifp = ipst->ips_vifs;
2644 		vife = vifp + ipst->ips_numvifs;
2645 		ipst->ips_last_encap_src = src;
2646 		ipst->ips_last_encap_vif = 0;
2647 		for (; vifp < vife; ++vifp) {
2648 			if (!lock_good_vif(vifp))
2649 				continue;
2650 			if (vifp->v_rmt_addr.s_addr == src) {
2651 				if (vifp->v_flags & VIFF_TUNNEL)
2652 					ipst->ips_last_encap_vif = vifp;
2653 				if (ipst->ips_ip_mrtdebug > 1) {
2654 					(void) mi_strlog(mrouter->conn_rq,
2655 					    1, SL_TRACE,
2656 					    "ip_mroute_decap: good tun "
2657 					    "vif %ld with %x",
2658 					    (ptrdiff_t)(vifp - ipst->ips_vifs),
2659 					    ntohl(src));
2660 				}
2661 				unlock_good_vif(vifp);
2662 				break;
2663 			}
2664 			unlock_good_vif(vifp);
2665 		}
2666 	}
2667 	if ((vifp = ipst->ips_last_encap_vif) == 0) {
2668 		mutex_exit(&ipst->ips_last_encap_lock);
2669 		ipst->ips_mrtstat->mrts_bad_tunnel++;
2670 		freemsg(mp);
2671 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2672 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(src)));
2673 		return;
2674 	}
2675 	mutex_exit(&ipst->ips_last_encap_lock);
2676 
2677 	/*
2678 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2679 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2680 	 * to pass this information. This is safe since the ip_rput either
2681 	 * frees the packet or passes it to ip_mforward.
2682 	 */
2683 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2684 	mp->b_rptr += hlen;
2685 	/* Feed back into ip_rput as an M_DATA. */
2686 	ip_rput(q, mp);
2687 }
2688 
2689 /*
2690  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2691  * (stream closed).  Called as writer.
2692  */
2693 void
2694 reset_mrt_vif_ipif(ipif_t *ipif)
2695 {
2696 	vifi_t vifi, tmp_vifi;
2697 	vifi_t num_of_vifs;
2698 	ip_stack_t	*ipst = ipif->ipif_ill->ill_ipst;
2699 
2700 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2701 
2702 	mutex_enter(&ipst->ips_numvifs_mutex);
2703 	num_of_vifs = ipst->ips_numvifs;
2704 	mutex_exit(&ipst->ips_numvifs_mutex);
2705 
2706 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2707 		tmp_vifi = vifi - 1;
2708 		if (ipst->ips_vifs[tmp_vifi].v_ipif == ipif) {
2709 			(void) del_vif(&tmp_vifi, NULL, NULL, ipst);
2710 		}
2711 	}
2712 }
2713 
2714 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2715 void
2716 reset_mrt_ill(ill_t *ill)
2717 {
2718 	struct mfc		*rt;
2719 	struct rtdetq	*rte;
2720 	int			i;
2721 	ip_stack_t	*ipst = ill->ill_ipst;
2722 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2723 
2724 	for (i = 0; i < MFCTBLSIZ; i++) {
2725 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
2726 		if ((rt = ipst->ips_mfcs[i].mfcb_mfc) != NULL) {
2727 			if (ipst->ips_ip_mrtdebug > 1) {
2728 				(void) mi_strlog(mrouter->conn_rq, 1,
2729 				    SL_TRACE,
2730 				    "reset_mrt_ill: mfctable [%d]", i);
2731 			}
2732 			while (rt != NULL) {
2733 				mutex_enter(&rt->mfc_mutex);
2734 				while ((rte = rt->mfc_rte) != NULL) {
2735 					if (rte->ill == ill) {
2736 						if (ipst->ips_ip_mrtdebug > 1) {
2737 						(void) mi_strlog(
2738 						    mrouter->conn_rq,
2739 						    1, SL_TRACE,
2740 						    "reset_mrt_ill: "
2741 						    "ill 0x%p", ill);
2742 						}
2743 						rt->mfc_rte = rte->rte_next;
2744 						freemsg(rte->mp);
2745 						mi_free((char *)rte);
2746 					}
2747 				}
2748 				mutex_exit(&rt->mfc_mutex);
2749 				rt = rt->mfc_next;
2750 			}
2751 		}
2752 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
2753 	}
2754 }
2755 
2756 /*
2757  * Token bucket filter module.
2758  * The ipha is for mcastgrp destination for phyint and encap.
2759  */
2760 static void
2761 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2762 {
2763 	size_t 	p_len =  msgdsize(mp);
2764 	struct tbf	*t    = vifp->v_tbf;
2765 	timeout_id_t id = 0;
2766 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2767 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2768 
2769 	/* Drop if packet is too large */
2770 	if (p_len > MAX_BKT_SIZE) {
2771 		ipst->ips_mrtstat->mrts_pkt2large++;
2772 		freemsg(mp);
2773 		return;
2774 	}
2775 	if (ipst->ips_ip_mrtdebug > 1) {
2776 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2777 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2778 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len,
2779 		    ntohl(ipha->ipha_dst));
2780 	}
2781 
2782 	mutex_enter(&t->tbf_lock);
2783 
2784 	tbf_update_tokens(vifp);
2785 
2786 	/*
2787 	 * If there are enough tokens,
2788 	 * and the queue is empty, send this packet out.
2789 	 */
2790 	if (ipst->ips_ip_mrtdebug > 1) {
2791 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2792 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2793 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_n_tok, p_len,
2794 		    t->tbf_q_len);
2795 	}
2796 	/* No packets are queued */
2797 	if (t->tbf_q_len == 0) {
2798 		/* queue empty, send packet if enough tokens */
2799 		if (p_len <= t->tbf_n_tok) {
2800 			t->tbf_n_tok -= p_len;
2801 			mutex_exit(&t->tbf_lock);
2802 			tbf_send_packet(vifp, mp);
2803 			return;
2804 		} else {
2805 			/* Queue packet and timeout till later */
2806 			tbf_queue(vifp, mp);
2807 			ASSERT(vifp->v_timeout_id == 0);
2808 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2809 			    TBF_REPROCESS);
2810 		}
2811 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2812 		/* Finite queue length, so queue pkts and process queue */
2813 		tbf_queue(vifp, mp);
2814 		tbf_process_q(vifp);
2815 	} else {
2816 		/* Check that we have UDP header with IP header */
2817 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2818 		    sizeof (struct udphdr);
2819 
2820 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2821 			if (!pullupmsg(mp, hdr_length)) {
2822 				freemsg(mp);
2823 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2824 				    "vif %ld src 0x%x dst 0x%x\n",
2825 				    (ptrdiff_t)(vifp - ipst->ips_vifs),
2826 				    ntohl(ipha->ipha_src),
2827 				    ntohl(ipha->ipha_dst)));
2828 				mutex_exit(&vifp->v_tbf->tbf_lock);
2829 				return;
2830 			} else
2831 				/* Have to reassign ipha after pullupmsg */
2832 				ipha = (ipha_t *)mp->b_rptr;
2833 		}
2834 		/*
2835 		 * Queue length too much,
2836 		 * try to selectively dq, or queue and process
2837 		 */
2838 		if (!tbf_dq_sel(vifp, ipha)) {
2839 			ipst->ips_mrtstat->mrts_q_overflow++;
2840 			freemsg(mp);
2841 		} else {
2842 			tbf_queue(vifp, mp);
2843 			tbf_process_q(vifp);
2844 		}
2845 	}
2846 	if (t->tbf_q_len == 0) {
2847 		id = vifp->v_timeout_id;
2848 		vifp->v_timeout_id = 0;
2849 	}
2850 	mutex_exit(&vifp->v_tbf->tbf_lock);
2851 	if (id != 0)
2852 		(void) untimeout(id);
2853 }
2854 
2855 /*
2856  * Adds a packet to the tbf queue at the interface.
2857  * The ipha is for mcastgrp destination for phyint and encap.
2858  */
2859 static void
2860 tbf_queue(struct vif *vifp, mblk_t *mp)
2861 {
2862 	struct tbf	*t = vifp->v_tbf;
2863 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2864 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2865 
2866 	if (ipst->ips_ip_mrtdebug > 1) {
2867 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2868 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - ipst->ips_vifs));
2869 	}
2870 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2871 
2872 	if (t->tbf_t == NULL) {
2873 		/* Queue was empty */
2874 		t->tbf_q = mp;
2875 	} else {
2876 		/* Insert at tail */
2877 		t->tbf_t->b_next = mp;
2878 	}
2879 	/* set new tail pointer */
2880 	t->tbf_t = mp;
2881 
2882 	mp->b_next = mp->b_prev = NULL;
2883 
2884 	t->tbf_q_len++;
2885 }
2886 
2887 /*
2888  * Process the queue at the vif interface.
2889  * Drops the tbf_lock when sending packets.
2890  *
2891  * NOTE : The caller should quntimeout if the queue length is 0.
2892  */
2893 static void
2894 tbf_process_q(struct vif *vifp)
2895 {
2896 	mblk_t	*mp;
2897 	struct tbf	*t = vifp->v_tbf;
2898 	size_t	len;
2899 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2900 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2901 
2902 	if (ipst->ips_ip_mrtdebug > 1) {
2903 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2904 		    "tbf_process_q 1: vif %ld qlen = %d",
2905 		    (ptrdiff_t)(vifp - ipst->ips_vifs), t->tbf_q_len);
2906 	}
2907 
2908 	/*
2909 	 * Loop through the queue at the interface and send
2910 	 * as many packets as possible.
2911 	 */
2912 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2913 
2914 	while (t->tbf_q_len > 0) {
2915 		mp = t->tbf_q;
2916 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2917 
2918 		/* Determine if the packet can be sent */
2919 		if (len <= t->tbf_n_tok) {
2920 			/*
2921 			 * If so, reduce no. of tokens, dequeue the packet,
2922 			 * send the packet.
2923 			 */
2924 			t->tbf_n_tok -= len;
2925 
2926 			t->tbf_q = mp->b_next;
2927 			if (--t->tbf_q_len == 0) {
2928 				t->tbf_t = NULL;
2929 			}
2930 			mp->b_next = NULL;
2931 			/* Exit mutex before sending packet, then re-enter */
2932 			mutex_exit(&t->tbf_lock);
2933 			tbf_send_packet(vifp, mp);
2934 			mutex_enter(&t->tbf_lock);
2935 		} else
2936 			break;
2937 	}
2938 }
2939 
2940 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2941 static void
2942 tbf_reprocess_q(void *arg)
2943 {
2944 	struct vif *vifp = arg;
2945 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2946 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2947 
2948 	mutex_enter(&vifp->v_tbf->tbf_lock);
2949 	vifp->v_timeout_id = 0;
2950 	tbf_update_tokens(vifp);
2951 
2952 	tbf_process_q(vifp);
2953 
2954 	if (vifp->v_tbf->tbf_q_len > 0) {
2955 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2956 		    TBF_REPROCESS);
2957 	}
2958 	mutex_exit(&vifp->v_tbf->tbf_lock);
2959 
2960 	if (ipst->ips_ip_mrtdebug > 1) {
2961 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2962 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2963 		    (ptrdiff_t)(vifp - ipst->ips_vifs), vifp->v_timeout_id);
2964 	}
2965 }
2966 
2967 /*
2968  * Function that will selectively discard a member of the tbf queue,
2969  * based on the precedence value and the priority.
2970  *
2971  * NOTE : The caller should quntimeout if the queue length is 0.
2972  */
2973 static int
2974 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
2975 {
2976 	uint_t		p;
2977 	struct tbf		*t = vifp->v_tbf;
2978 	mblk_t		**np;
2979 	mblk_t		*last, *mp;
2980 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
2981 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
2982 
2983 	if (ipst->ips_ip_mrtdebug > 1) {
2984 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
2985 		    "dq_sel: vif %ld dst 0x%x",
2986 		    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(ipha->ipha_dst));
2987 	}
2988 
2989 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2990 	p = priority(vifp, ipha);
2991 
2992 	np = &t->tbf_q;
2993 	last = NULL;
2994 	while ((mp = *np) != NULL) {
2995 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
2996 			*np = mp->b_next;
2997 			/* If removing the last packet, fix the tail pointer */
2998 			if (mp == t->tbf_t)
2999 				t->tbf_t = last;
3000 			mp->b_prev = mp->b_next = NULL;
3001 			freemsg(mp);
3002 			/*
3003 			 * It's impossible for the queue to be empty, but
3004 			 * we check anyway.
3005 			 */
3006 			if (--t->tbf_q_len == 0) {
3007 				t->tbf_t = NULL;
3008 			}
3009 			ipst->ips_mrtstat->mrts_drop_sel++;
3010 			return (1);
3011 		}
3012 		np = &mp->b_next;
3013 		last = mp;
3014 	}
3015 	return (0);
3016 }
3017 
3018 /* Sends packet, 2 cases - encap tunnel, phyint.  */
3019 static void
3020 tbf_send_packet(struct vif *vifp, mblk_t *mp)
3021 {
3022 	ipif_t  *ipif;
3023 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3024 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3025 
3026 	/* If encap tunnel options */
3027 	if (vifp->v_flags & VIFF_TUNNEL)  {
3028 		if (ipst->ips_ip_mrtdebug > 1) {
3029 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3030 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
3031 			    (ptrdiff_t)(vifp - ipst->ips_vifs));
3032 		}
3033 
3034 		/*
3035 		 * Feed into ip_wput which will set the ident field and
3036 		 * checksum the encapsulating header.
3037 		 * BSD gets the cached route vifp->v_route from ip_output()
3038 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
3039 		 */
3040 		put(vifp->v_ipif->ipif_wq, mp);
3041 		return;
3042 
3043 		/* phyint */
3044 	} else {
3045 		/* Need to loop back to members on the outgoing interface. */
3046 		ipha_t  *ipha;
3047 		ipaddr_t    dst;
3048 		ipha  = (ipha_t *)mp->b_rptr;
3049 		dst  = ipha->ipha_dst;
3050 		ipif = vifp->v_ipif;
3051 
3052 		mutex_enter(&ipif->ipif_ill->ill_lock);
3053 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
3054 			/*
3055 			 * The packet is not yet reassembled, thus we need to
3056 			 * pass it to ip_rput_local for checksum verification
3057 			 * and reassembly (and fanout the user stream).
3058 			 */
3059 			mblk_t 	*mp_loop;
3060 			ire_t	*ire;
3061 
3062 			mutex_exit(&ipif->ipif_ill->ill_lock);
3063 			if (ipst->ips_ip_mrtdebug > 1) {
3064 				(void) mi_strlog(mrouter->conn_rq, 1,
3065 				    SL_TRACE,
3066 				    "tbf_send_pkt: loopback vif %ld",
3067 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3068 			}
3069 			mp_loop = copymsg(mp);
3070 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
3071 			    ALL_ZONES, NULL, MATCH_IRE_TYPE, ipst);
3072 
3073 			if (mp_loop != NULL && ire != NULL) {
3074 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
3075 				    ((ipha_t *)mp_loop->b_rptr),
3076 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
3077 			} else {
3078 				/* Either copymsg failed or no ire */
3079 				(void) mi_strlog(mrouter->conn_rq, 1,
3080 				    SL_TRACE,
3081 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
3082 				    "vif %ld\n", mp_loop, ire,
3083 				    (ptrdiff_t)(vifp - ipst->ips_vifs));
3084 			}
3085 			if (ire != NULL)
3086 				ire_refrele(ire);
3087 		} else {
3088 			mutex_exit(&ipif->ipif_ill->ill_lock);
3089 		}
3090 		if (ipst->ips_ip_mrtdebug > 1) {
3091 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3092 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3093 			    (ptrdiff_t)(vifp - ipst->ips_vifs), ntohl(dst));
3094 		}
3095 		ip_rput_forward_multicast(dst, mp, ipif);
3096 	}
3097 }
3098 
3099 /*
3100  * Determine the current time and then the elapsed time (between the last time
3101  * and time now).  Update the no. of tokens in the bucket.
3102  */
3103 static void
3104 tbf_update_tokens(struct vif *vifp)
3105 {
3106 	timespec_t	tp;
3107 	hrtime_t	tm;
3108 	struct tbf	*t = vifp->v_tbf;
3109 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3110 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3111 
3112 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3113 
3114 	/* Time in secs and nsecs, rate limit in kbits/sec */
3115 	gethrestime(&tp);
3116 
3117 	/*LINTED*/
3118 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3119 
3120 	/*
3121 	 * This formula is actually
3122 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3123 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3124 	 *
3125 	 * The (1000/1024) was introduced in add_vif to optimize
3126 	 * this divide into a shift.
3127 	 */
3128 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3129 	t->tbf_last_pkt_t = tp;
3130 
3131 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3132 		t->tbf_n_tok = MAX_BKT_SIZE;
3133 	if (ipst->ips_ip_mrtdebug > 1) {
3134 		(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3135 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3136 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - ipst->ips_vifs));
3137 	}
3138 }
3139 
3140 /*
3141  * Priority currently is based on port nos.
3142  * Different forwarding mechanisms have different ways
3143  * of obtaining the port no. Hence, the vif must be
3144  * given along with the packet itself.
3145  *
3146  */
3147 static int
3148 priority(struct vif *vifp, ipha_t *ipha)
3149 {
3150 	int prio;
3151 	ip_stack_t	*ipst = vifp->v_ipif->ipif_ill->ill_ipst;
3152 	conn_t		*mrouter = ipst->ips_ip_g_mrouter;
3153 
3154 	/* Temporary hack; may add general packet classifier some day */
3155 
3156 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3157 
3158 	/*
3159 	 * The UDP port space is divided up into four priority ranges:
3160 	 * [0, 16384)	: unclassified - lowest priority
3161 	 * [16384, 32768)	: audio - highest priority
3162 	 * [32768, 49152)	: whiteboard - medium priority
3163 	 * [49152, 65536)	: video - low priority
3164 	 */
3165 
3166 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3167 		struct udphdr *udp =
3168 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3169 		switch (ntohs(udp->uh_dport) & 0xc000) {
3170 		case 0x4000:
3171 			prio = 70;
3172 			break;
3173 		case 0x8000:
3174 			prio = 60;
3175 			break;
3176 		case 0xc000:
3177 			prio = 55;
3178 			break;
3179 		default:
3180 			prio = 50;
3181 			break;
3182 		}
3183 		if (ipst->ips_ip_mrtdebug > 1) {
3184 			(void) mi_strlog(mrouter->conn_rq, 1, SL_TRACE,
3185 			    "priority: port %x prio %d\n",
3186 			    ntohs(udp->uh_dport), prio);
3187 		}
3188 	} else
3189 		prio = 50;  /* default priority */
3190 	return (prio);
3191 }
3192 
3193 /*
3194  * End of token bucket filter modifications
3195  */
3196 
3197 
3198 
3199 /*
3200  * Produces data for netstat -M.
3201  */
3202 int
3203 ip_mroute_stats(mblk_t *mp, ip_stack_t *ipst)
3204 {
3205 	ipst->ips_mrtstat->mrts_vifctlSize = sizeof (struct vifctl);
3206 	ipst->ips_mrtstat->mrts_mfcctlSize = sizeof (struct mfcctl);
3207 	if (!snmp_append_data(mp, (char *)ipst->ips_mrtstat,
3208 		sizeof (struct mrtstat))) {
3209 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3210 		    (size_t)sizeof (struct mrtstat)));
3211 		return (0);
3212 	}
3213 	return (1);
3214 }
3215 
3216 /*
3217  * Sends info for SNMP's MIB.
3218  */
3219 int
3220 ip_mroute_vif(mblk_t *mp, ip_stack_t *ipst)
3221 {
3222 	struct vifctl 	vi;
3223 	vifi_t		vifi;
3224 
3225 	mutex_enter(&ipst->ips_numvifs_mutex);
3226 	for (vifi = 0; vifi < ipst->ips_numvifs; vifi++) {
3227 		if (ipst->ips_vifs[vifi].v_lcl_addr.s_addr == 0)
3228 			continue;
3229 		/*
3230 		 * No locks here, an approximation is fine.
3231 		 */
3232 		vi.vifc_vifi = vifi;
3233 		vi.vifc_flags = ipst->ips_vifs[vifi].v_flags;
3234 		vi.vifc_threshold = ipst->ips_vifs[vifi].v_threshold;
3235 		vi.vifc_rate_limit	= ipst->ips_vifs[vifi].v_rate_limit;
3236 		vi.vifc_lcl_addr	= ipst->ips_vifs[vifi].v_lcl_addr;
3237 		vi.vifc_rmt_addr	= ipst->ips_vifs[vifi].v_rmt_addr;
3238 		vi.vifc_pkt_in		= ipst->ips_vifs[vifi].v_pkt_in;
3239 		vi.vifc_pkt_out		= ipst->ips_vifs[vifi].v_pkt_out;
3240 
3241 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3242 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3243 			    (size_t)sizeof (vi)));
3244 			return (0);
3245 		}
3246 	}
3247 	mutex_exit(&ipst->ips_numvifs_mutex);
3248 	return (1);
3249 }
3250 
3251 /*
3252  * Called by ip_snmp_get to send up multicast routing table.
3253  */
3254 int
3255 ip_mroute_mrt(mblk_t *mp, ip_stack_t *ipst)
3256 {
3257 	int			i, j;
3258 	struct mfc		*rt;
3259 	struct mfcctl	mfcc;
3260 
3261 	/*
3262 	 * Make sure multicast has not been turned off.
3263 	 */
3264 	if (is_mrouter_off(ipst))
3265 		return (1);
3266 
3267 	/* Loop over all hash buckets and their chains */
3268 	for (i = 0; i < MFCTBLSIZ; i++) {
3269 		MFCB_REFHOLD(&ipst->ips_mfcs[i]);
3270 		for (rt = ipst->ips_mfcs[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3271 			mutex_enter(&rt->mfc_mutex);
3272 			if (rt->mfc_rte != NULL ||
3273 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3274 				mutex_exit(&rt->mfc_mutex);
3275 				continue;
3276 			}
3277 			mfcc.mfcc_origin = rt->mfc_origin;
3278 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3279 			mfcc.mfcc_parent = rt->mfc_parent;
3280 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3281 			mutex_enter(&ipst->ips_numvifs_mutex);
3282 			for (j = 0; j < (int)ipst->ips_numvifs; j++)
3283 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3284 			for (j = (int)ipst->ips_numvifs; j < MAXVIFS; j++)
3285 				mfcc.mfcc_ttls[j] = 0;
3286 			mutex_exit(&ipst->ips_numvifs_mutex);
3287 
3288 			mutex_exit(&rt->mfc_mutex);
3289 			if (!snmp_append_data(mp, (char *)&mfcc,
3290 			    sizeof (mfcc))) {
3291 				MFCB_REFRELE(&ipst->ips_mfcs[i]);
3292 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3293 				    (size_t)sizeof (mfcc)));
3294 				return (0);
3295 			}
3296 		}
3297 		MFCB_REFRELE(&ipst->ips_mfcs[i]);
3298 	}
3299 	return (1);
3300 }
3301