xref: /titanic_44/usr/src/uts/common/inet/ip/ip_mroute.c (revision bdfc6d18da790deeec2e0eb09c625902defe2498)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.
24  * All rights reserved.  Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * Procedures for the kernel part of DVMRP,
32  * a Distance-Vector Multicast Routing Protocol.
33  * (See RFC-1075)
34  * Written by David Waitzman, BBN Labs, August 1988.
35  * Modified by Steve Deering, Stanford, February 1989.
36  * Modified by Mark J. Steiglitz, Stanford, May, 1991
37  * Modified by Van Jacobson, LBL, January 1993
38  * Modified by Ajit Thyagarajan, PARC, August 1993
39  * Modified by Bill Fenner, PARC, April 1995
40  *
41  * MROUTING 3.5
42  */
43 
44 /*
45  * TODO
46  * - function pointer field in vif, void *vif_sendit()
47  */
48 
49 #include <sys/types.h>
50 #include <sys/stream.h>
51 #include <sys/dlpi.h>
52 #include <sys/stropts.h>
53 #include <sys/strlog.h>
54 #include <sys/systm.h>
55 #include <sys/ddi.h>
56 #include <sys/cmn_err.h>
57 #include <sys/zone.h>
58 
59 #include <sys/param.h>
60 #include <sys/socket.h>
61 #define	_SUN_TPI_VERSION	2
62 #include <sys/tihdr.h>
63 #include <sys/vtrace.h>
64 #include <sys/debug.h>
65 #include <net/if.h>
66 #include <net/if_arp.h>
67 #include <sys/sockio.h>
68 #include <net/route.h>
69 #include <netinet/in.h>
70 #include <net/if_dl.h>
71 
72 #include <inet/common.h>
73 #include <inet/mi.h>
74 #include <inet/nd.h>
75 #include <inet/arp.h>
76 #include <inet/mib2.h>
77 #include <netinet/ip6.h>
78 #include <inet/ip.h>
79 #include <inet/snmpcom.h>
80 
81 #include <netinet/igmp.h>
82 #include <netinet/igmp_var.h>
83 #include <netinet/udp.h>
84 #include <netinet/ip_mroute.h>
85 #include <inet/ip_multi.h>
86 #include <inet/ip_ire.h>
87 #include <inet/ip_if.h>
88 #include <inet/ipclassifier.h>
89 
90 #include <netinet/pim.h>
91 
92 
93 /*
94  * MT Design:
95  *
96  * There are three main data structures viftable, mfctable and tbftable that
97  * need to be protected against MT races.
98  *
99  * viftable is a fixed length array of vif structs. There is no lock to protect
100  * the whole array, instead each struct is protected by its own individual lock.
101  * The value of v_marks in conjunction with the value of v_refcnt determines the
102  * current state of a vif structure. One special state that needs mention
103  * is when the vif is marked VIF_MARK_NOTINUSE but refcnt != 0. This indicates
104  * that the vif is being initialized.
105  * Each structure is freed when the refcnt goes down to zero. If a delete comes
106  * in when the refcnt is > 1, the vif structure is marked VIF_MARK_CONDEMNED
107  * which prevents the struct from further use.  When the refcnt goes to zero
108  * the struct is freed and is marked VIF_MARK_NOTINUSE.
109  * vif struct stores a pointer to the ipif in v_ipif, to prevent ipif/ill
110  * from  going away a refhold is put on the ipif before using it. see
111  * lock_good_vif() and unlock_good_vif().
112  *
113  * VIF_REFHOLD and VIF_REFRELE macros have been provided to manipulate refcnts
114  * of the vif struct.
115  *
116  * tbftable is also a fixed length array of tbf structs and is only accessed
117  * via v_tbf.  It is protected by its own lock tbf_lock.
118  *
119  * Lock Ordering is
120  * v_lock --> tbf_lock
121  * v_lock --> ill_lock
122  *
123  * mfctable is a fixed size hash table of mfc bucket structs (struct mfcb).
124  * Each mfc bucket struct (struct mfcb) maintains a refcnt for each walker,
125  * it also maintains a state. These fields are protected by a lock (mfcb_lock).
126  * mfc structs only maintain a state and have no refcnt. mfc_mutex is used to
127  * protect the struct elements.
128  *
129  * mfc structs are dynamically allocated and are singly linked
130  * at the head of the chain. When an mfc structure is to be deleted
131  * it is marked condemned and so is the state in the bucket struct.
132  * When the last walker of the hash bucket exits all the mfc structs
133  * marked condemned are freed.
134  *
135  * Locking Hierarchy:
136  * The bucket lock should be acquired before the mfc struct lock.
137  * MFCB_REFHOLD and MFCB_REFRELE macros are provided for locking
138  * operations on the bucket struct.
139  *
140  * last_encap_lock and numvifs_mutex should be acquired after
141  * acquiring vif or mfc locks. These locks protect some global variables.
142  *
143  * The statistics are not currently protected by a lock,
144  * causing the stats to be approximate, not exact.
145  */
146 
147 /*
148  * Globals
149  * All but ip_g_mrouter and ip_mrtproto could be static,
150  * except for netstat or debugging purposes.
151  */
/*
 * ip_g_mrouter is the queue that issued MRT_INIT (set in ip_mrouter_init(),
 * cleared at the end of ip_mrouter_done()); NULL means multicast routing
 * is not enabled.  ip_g_mrouter_mutex is held around updates to it.
 */
queue_t		*ip_g_mrouter	= NULL;
static kmutex_t	ip_g_mrouter_mutex;

int		ip_mrtproto	= IGMP_DVMRP;	/* for netstat only */
struct mrtstat	mrtstat;	/* Stats for netstat */

#define	NO_VIF	MAXVIFS 	/* from mrouted, no route for src */

/*
 * Timeouts:
 * 	Upcall timeouts - BSD uses boolean_t mfc->expire and
 *	nexpire[MFCTBLSIZE], the number of times expire has been called.
 *	SunOS 5.x uses mfc->timeout for each mfc.
 *	Some Unixes are limited in the number of simultaneous timeouts
 * 	that can be run, SunOS 5.x does not have this restriction.
 */

/*
 * In BSD, EXPIRE_TIMEOUT is how often expire_upcalls() is called and
 * UPCALL_EXPIRE is the number of timeouts before a particular upcall
 * expires. Thus the time till expiration is EXPIRE_TIMEOUT * UPCALL_EXPIRE
 */
#define		EXPIRE_TIMEOUT	(hz/4)	/* 4x / second	*/
#define		UPCALL_EXPIRE	6	/* number of timeouts	*/

/*
 * Hash function for a source, group entry.
 * Folds shifted copies of both addresses together and reduces the result
 * with MFCHASHMOD to obtain an index into mfctable.
 */
#define	MFCHASH(a, g) MFCHASHMOD(((a) >> 20) ^ ((a) >> 10) ^ (a) ^ \
	((g) >> 20) ^ ((g) >> 10) ^ (g))

/*
 * mfctable:
 * Includes all mfcs, including waiting upcalls.
 * Multiple mfcs per bucket.
 */
static struct mfcb	mfctable[MFCTBLSIZ];	/* kernel routing table	*/

/*
 * Define the token bucket filter structures.
 * tbftable -> each vif has one of these for storing info.
 * add_vif() points vif N's v_tbf at tbftable[N].
 */
struct tbf 		tbftable[MAXVIFS];
#define			TBF_REPROCESS	(hz / 100)	/* 100x /second	*/

/* Identify PIM packet that came on a Register interface */
#define	PIM_REGISTER_MARKER	0xffffffff
199 
/*
 * Function declarations.
 * Forward declarations for the file-local routines: vif and mfc table
 * maintenance, the getsockopt/ioctl helpers, and the per-vif-type send
 * routines.
 */
static int	add_mfc(struct mfcctl *);
static int	add_vif(struct vifctl *, queue_t *, mblk_t *);
static int	del_mfc(struct mfcctl *);
static int	del_vif(vifi_t *, queue_t *, mblk_t *);
static void	del_vifp(struct vif *);
static void	encap_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static void	expire_upcalls(void *);
static void	fill_route(struct mfc *, struct mfcctl *);
static int	get_assert(uchar_t *);
static int	get_lsg_cnt(struct sioc_lsg_req *);
static int	get_sg_cnt(struct sioc_sg_req *);
static int	get_version(uchar_t *);
static int	get_vif_cnt(struct sioc_vif_req *);
static int	ip_mdq(mblk_t *, ipha_t *, ill_t *,
		    ipaddr_t, struct mfc *);
static int	ip_mrouter_init(queue_t *, uchar_t *, int);
static void	phyint_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	register_mforward(queue_t *, mblk_t *);
static void	register_send(ipha_t *, mblk_t *, struct vif *, ipaddr_t);
static int	set_assert(int *);

/*
 * Token Bucket Filter functions
 */
static int  priority(struct vif *, ipha_t *);
static void tbf_control(struct vif *, mblk_t *, ipha_t *);
static int  tbf_dq_sel(struct vif *, ipha_t *);
static void tbf_process_q(struct vif *);
static void tbf_queue(struct vif *, mblk_t *);
static void tbf_reprocess_q(void *);
static void tbf_send_packet(struct vif *, mblk_t *);
static void tbf_update_tokens(struct vif *);
/* Invoked by MFCB_REFRELE when the last walker leaves a condemned bucket. */
static void release_mfc(struct mfcb *);

/* B_TRUE when no mrouter is registered or it has begun shutting down. */
static boolean_t is_mrouter_off(void);
/*
 * Encapsulation packets
 */

#define	ENCAP_TTL	64

/* prototype IP hdr for encapsulated packets */
static ipha_t multicast_encap_iphdr = {
	IP_SIMPLE_HDR_VERSION,
	0,				/* tos */
	sizeof (ipha_t),		/* total length */
	0,				/* id */
	0,				/* frag offset */
	ENCAP_TTL, IPPROTO_ENCAP,
	0,				/* checksum */
};

/*
 * Private variables.
 */

/*
 * Value of ip_g_forward saved by ip_mrouter_init() when it has to turn
 * forwarding on; restored by ip_mrouter_done().  -1 means nothing is saved.
 */
static int		saved_ip_g_forward = -1;

/*
 * numvifs is only a hint about the max interface being used.
 */
static vifi_t		numvifs = 0;
static kmutex_t		numvifs_mutex;

/*
 * The index needs to accommodate the value of NO_VIF, which is MAXVIFS,
 * hence the extra element.
 */
static struct vif	viftable[MAXVIFS+1];

/*
 * One-back cache used to locate a tunnel's vif,
 * given a datagram's src ip address.
 */
static ipaddr_t		last_encap_src;
static struct vif	*last_encap_vif;
static kmutex_t		last_encap_lock;	/* Protects the above */

/*
 * reg_vif_num is protected by numvifs_mutex
 */
static vifi_t reg_vif_num = ALL_VIFS; 	/* Index to Register vif */
/*
 * Whether or not special PIM assert processing is enabled.
 */
static int pim_assert;

/*
 * Rate limit for assert notification messages, in nsec.
 */
#define	ASSERT_MSG_TIME		3000000000
289 
290 
/*
 * Take a reference on a vif: v_refcnt is incremented under v_lock.
 * The caller must not already hold v_lock.
 */
#define	VIF_REFHOLD(vifp) {			\
	mutex_enter(&(vifp)->v_lock);		\
	(vifp)->v_refcnt++;			\
	mutex_exit(&(vifp)->v_lock);		\
}
296 
/*
 * Drop a reference on a vif whose v_lock is already held by the caller.
 * If this was the last reference and the vif is condemned, del_vifp()
 * tears the vif down (it finishes by zeroing the whole struct, lock
 * included); otherwise v_lock is simply released.  Either way the caller
 * no longer holds v_lock afterwards.
 */
#define	VIF_REFRELE_LOCKED(vifp) {				\
	(vifp)->v_refcnt--;					\
	if ((vifp)->v_refcnt == 0 &&				\
		((vifp)->v_marks & VIF_MARK_CONDEMNED)) {	\
			del_vifp(vifp);				\
	} else {						\
		mutex_exit(&(vifp)->v_lock);			\
	}							\
}
306 
/*
 * Drop a reference on a vif when the caller does not hold v_lock:
 * acquire the lock, then let VIF_REFRELE_LOCKED do the decrement, the
 * condemned-and-last-reference teardown, and the unlock.
 */
#define	VIF_REFRELE(vifp) {					\
	mutex_enter(&(vifp)->v_lock);				\
	VIF_REFRELE_LOCKED(vifp);				\
}
317 
/*
 * Register a walker on an mfc hash bucket: mfcb_refcnt is incremented
 * under mfcb_lock.  The ASSERT catches the counter wrapping to zero.
 */
#define	MFCB_REFHOLD(mfcb) {				\
	mutex_enter(&(mfcb)->mfcb_lock);		\
	(mfcb)->mfcb_refcnt++;				\
	ASSERT((mfcb)->mfcb_refcnt != 0);		\
	mutex_exit(&(mfcb)->mfcb_lock);			\
}
324 
/*
 * Unregister a walker from an mfc hash bucket.  When the last walker
 * leaves a bucket marked MFCB_MARK_CONDEMNED, release_mfc() is called
 * with mfcb_lock still held.
 */
#define	MFCB_REFRELE(mfcb) {					\
	mutex_enter(&(mfcb)->mfcb_lock);			\
	ASSERT((mfcb)->mfcb_refcnt != 0);			\
	if (--(mfcb)->mfcb_refcnt == 0 &&			\
		((mfcb)->mfcb_marks & MFCB_MARK_CONDEMNED)) {	\
			release_mfc(mfcb);			\
	}							\
	mutex_exit(&(mfcb)->mfcb_lock);				\
}
334 
/*
 * MFCFIND:
 * Find a route for a given origin IP address and multicast group address.
 * Skip entries with pending upcalls (mfc_rte != NULL) and entries that
 * are marked condemned.
 * Type of service parameter to be added in the future!
 *
 *	mfcbp	pointer to the hash bucket to search
 *	o, g	origin and group addresses to match
 *	rt	lvalue that receives the matching mfc, or NULL if none
 *
 * Every macro argument is parenthesized so that callers may pass
 * expressions such as &mfctable[i] or arithmetic on the addresses.
 */
#define	MFCFIND(mfcbp, o, g, rt) { \
	struct mfc *_mb_rt; \
	(rt) = NULL; \
	_mb_rt = (mfcbp)->mfcb_mfc; \
	while (_mb_rt != NULL) { \
		if ((_mb_rt->mfc_origin.s_addr == (o)) && \
		    (_mb_rt->mfc_mcastgrp.s_addr == (g)) && \
		    (_mb_rt->mfc_rte == NULL) && \
		    (!(_mb_rt->mfc_marks & MFCB_MARK_CONDEMNED))) { \
			(rt) = _mb_rt; \
			break; \
		} \
		_mb_rt = _mb_rt->mfc_next; \
	} \
}
356 
357 /*
358  * BSD uses timeval with sec and usec. In SunOS 5.x uniqtime() and gethrtime()
359  * are inefficient. We use gethrestime() which returns a timespec_t with
360  * sec and nsec, the resolution is machine dependent.
361  * The following 2 macros have been changed to use nsec instead of usec.
362  */
/*
 * Macros to compute elapsed time efficiently.
 * Borrowed from Van Jacobson's scheduling code.
 *
 * TV_DELTA(a, b, delta) stores (a - b), in nanoseconds, into delta.
 * Delta should be a hrtime_t (a 64-bit quantity).  Differences of one or
 * two seconds are handled by additions; larger (or negative) second
 * differences are multiplied out using 64-bit arithmetic, since
 * 1000000000 * xxs does not fit in an int once xxs reaches 3.
 */
#define	TV_DELTA(a, b, delta) { \
	int xxs; \
 \
	(delta) = (a).tv_nsec - (b).tv_nsec; \
	if ((xxs = (a).tv_sec - (b).tv_sec) != 0) { \
		switch (xxs) { \
		case 2: \
		    (delta) += 1000000000; \
		    /*FALLTHROUGH*/ \
		case 1: \
		    (delta) += 1000000000; \
		    break; \
		default: \
		    (delta) += (1000000000LL * xxs); \
		} \
	} \
}
385 
/*
 * TV_LT(a, b) is true when timespec a is strictly earlier than b:
 * compare seconds first and use nanoseconds only to break a tie.
 */
#define	TV_LT(a, b) ((a).tv_sec < (b).tv_sec || \
	((a).tv_sec == (b).tv_sec && (a).tv_nsec < (b).tv_nsec))
388 
389 /*
390  * Handle MRT setsockopt commands to modify the multicast routing tables.
391  */
392 int
393 ip_mrouter_set(int cmd, queue_t *q, int checkonly, uchar_t *data,
394     int datalen, mblk_t *first_mp)
395 {
396 	mutex_enter(&ip_g_mrouter_mutex);
397 	if (cmd != MRT_INIT && q != ip_g_mrouter) {
398 		mutex_exit(&ip_g_mrouter_mutex);
399 		return (EACCES);
400 	}
401 	mutex_exit(&ip_g_mrouter_mutex);
402 
403 	if (checkonly) {
404 		/*
405 		 * do not do operation, just pretend to - new T_CHECK
406 		 * Note: Even routines further on can probably fail but
407 		 * this T_CHECK stuff is only to please XTI so it not
408 		 * necessary to be perfect.
409 		 */
410 		switch (cmd) {
411 		case MRT_INIT:
412 		case MRT_DONE:
413 		case MRT_ADD_VIF:
414 		case MRT_DEL_VIF:
415 		case MRT_ADD_MFC:
416 		case MRT_DEL_MFC:
417 		case MRT_ASSERT:
418 		    return (0);
419 		default:
420 		    return (EOPNOTSUPP);
421 		}
422 	}
423 
424 	/*
425 	 * make sure no command is issued after multicast routing has been
426 	 * turned off.
427 	 */
428 	if (cmd != MRT_INIT && cmd != MRT_DONE) {
429 		if (is_mrouter_off())
430 			return (EINVAL);
431 	}
432 
433 	switch (cmd) {
434 	case MRT_INIT:	return (ip_mrouter_init(q, data, datalen));
435 	case MRT_DONE:	return (ip_mrouter_done(first_mp));
436 	case MRT_ADD_VIF:  return (add_vif((struct vifctl *)data, q, first_mp));
437 	case MRT_DEL_VIF:  return (del_vif((vifi_t *)data, q, first_mp));
438 	case MRT_ADD_MFC:  return (add_mfc((struct mfcctl *)data));
439 	case MRT_DEL_MFC:  return (del_mfc((struct mfcctl *)data));
440 	case MRT_ASSERT:   return (set_assert((int *)data));
441 	default:	   return (EOPNOTSUPP);
442 	}
443 }
444 
445 /*
446  * Handle MRT getsockopt commands
447  */
448 int
449 ip_mrouter_get(int cmd, queue_t *q, uchar_t *data)
450 {
451 	if (q != ip_g_mrouter)
452 		return (EACCES);
453 
454 	switch (cmd) {
455 	case MRT_VERSION:	return (get_version((uchar_t *)data));
456 	case MRT_ASSERT:	return (get_assert((uchar_t *)data));
457 	default:		return (EOPNOTSUPP);
458 	}
459 }
460 
461 /*
462  * Handle ioctl commands to obtain information from the cache.
463  * Called with shared access to IP. These are read_only ioctls.
464  */
465 /* ARGSUSED */
466 int
467 mrt_ioctl(ipif_t *ipif, sin_t *sin, queue_t *q, mblk_t *mp,
468     ip_ioctl_cmd_t *ipip, void *if_req)
469 {
470 	mblk_t	*mp1;
471 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
472 
473 	/* Existence verified in ip_wput_nondata */
474 	mp1 = mp->b_cont->b_cont;
475 
476 	switch (iocp->ioc_cmd) {
477 	case (SIOCGETVIFCNT):
478 		return (get_vif_cnt((struct sioc_vif_req *)mp1->b_rptr));
479 	case (SIOCGETSGCNT):
480 		return (get_sg_cnt((struct sioc_sg_req *)mp1->b_rptr));
481 	case (SIOCGETLSGCNT):
482 		return (get_lsg_cnt((struct sioc_lsg_req *)mp1->b_rptr));
483 	default:
484 		return (EINVAL);
485 	}
486 }
487 
488 /*
489  * Returns the packet, byte, rpf-failure count for the source, group provided.
490  */
491 static int
492 get_sg_cnt(struct sioc_sg_req *req)
493 {
494 	struct mfc *rt;
495 	struct mfcb *mfcbp;
496 
497 	mfcbp = &mfctable[MFCHASH(req->src.s_addr, req->grp.s_addr)];
498 	MFCB_REFHOLD(mfcbp);
499 	MFCFIND(mfcbp, req->src.s_addr, req->grp.s_addr, rt);
500 
501 	if (rt != NULL) {
502 		mutex_enter(&rt->mfc_mutex);
503 		req->pktcnt   = rt->mfc_pkt_cnt;
504 		req->bytecnt  = rt->mfc_byte_cnt;
505 		req->wrong_if = rt->mfc_wrong_if;
506 		mutex_exit(&rt->mfc_mutex);
507 	} else
508 		req->pktcnt = req->bytecnt = req->wrong_if = 0xffffffffU;
509 
510 	MFCB_REFRELE(mfcbp);
511 	return (0);
512 }
513 
/*
 * Returns the packet, byte, rpf-failure count for the source, group provided.
 * Uses larger counters and IPv6 addresses.
 *
 * Not implemented yet: the request is ignored and ENXIO is always returned.
 */
/* ARGSUSED XXX until implemented */
static int
get_lsg_cnt(struct sioc_lsg_req *req)
{
	/* XXX TODO SIOCGETLSGCNT */
	return (ENXIO);
}
525 
526 /*
527  * Returns the input and output packet and byte counts on the vif provided.
528  */
529 static int
530 get_vif_cnt(struct sioc_vif_req *req)
531 {
532 	vifi_t vifi = req->vifi;
533 
534 	if (vifi >= numvifs)
535 		return (EINVAL);
536 
537 	/*
538 	 * No locks here, an approximation is fine.
539 	 */
540 	req->icount = viftable[vifi].v_pkt_in;
541 	req->ocount = viftable[vifi].v_pkt_out;
542 	req->ibytes = viftable[vifi].v_bytes_in;
543 	req->obytes = viftable[vifi].v_bytes_out;
544 
545 	return (0);
546 }
547 
548 static int
549 get_version(uchar_t *data)
550 {
551 	int *v = (int *)data;
552 
553 	*v = 0x0305;	/* XXX !!!! */
554 
555 	return (0);
556 }
557 
558 /*
559  * Set PIM assert processing global.
560  */
561 static int
562 set_assert(int *i)
563 {
564 	if ((*i != 1) && (*i != 0))
565 		return (EINVAL);
566 
567 	pim_assert = *i;
568 
569 	return (0);
570 }
571 
572 /*
573  * Get PIM assert processing global.
574  */
575 static int
576 get_assert(uchar_t *data)
577 {
578 	int *i = (int *)data;
579 
580 	*i = pim_assert;
581 
582 	return (0);
583 }
584 
585 /*
586  * Enable multicast routing.
587  */
588 static int
589 ip_mrouter_init(queue_t *q, uchar_t *data, int datalen)
590 {
591 	conn_t	*connp = Q_TO_CONN(q);
592 	int	*v;
593 
594 	if (data == NULL || (datalen != sizeof (int)))
595 		return (ENOPROTOOPT);
596 
597 	v = (int *)data;
598 	if (*v != 1)
599 		return (ENOPROTOOPT);
600 
601 	mutex_enter(&ip_g_mrouter_mutex);
602 	if (ip_g_mrouter != NULL) {
603 		mutex_exit(&ip_g_mrouter_mutex);
604 		return (EADDRINUSE);
605 	}
606 
607 	ip_g_mrouter = q;
608 	connp->conn_multi_router = 1;
609 
610 	mutex_init(&last_encap_lock, NULL, MUTEX_DEFAULT, NULL);
611 
612 	mrtstat.mrts_vifctlSize = sizeof (struct vifctl);
613 	mrtstat.mrts_mfcctlSize = sizeof (struct mfcctl);
614 
615 	pim_assert = 0;
616 
617 	/* In order for tunnels to work we have to turn ip_g_forward on */
618 	if (!WE_ARE_FORWARDING) {
619 		if (ip_mrtdebug > 1) {
620 			(void) mi_strlog(q, 1, SL_TRACE,
621 			    "ip_mrouter_init: turning on forwarding");
622 		}
623 		saved_ip_g_forward = ip_g_forward;
624 		ip_g_forward = IP_FORWARD_ALWAYS;
625 	}
626 
627 	mutex_exit(&ip_g_mrouter_mutex);
628 	return (0);
629 }
630 
631 /*
632  * Disable multicast routing.
633  * Didn't use global timeout_val (BSD version), instead check the mfctable.
634  */
635 int
636 ip_mrouter_done(mblk_t *mp)
637 {
638 	conn_t		*connp;
639 	vifi_t 		vifi;
640 	struct mfc	*mfc_rt;
641 	int		i;
642 
643 	mutex_enter(&ip_g_mrouter_mutex);
644 	if (ip_g_mrouter == NULL) {
645 		mutex_exit(&ip_g_mrouter_mutex);
646 		return (EINVAL);
647 	}
648 
649 	connp = Q_TO_CONN(ip_g_mrouter);
650 
651 	if (saved_ip_g_forward != -1) {
652 		if (ip_mrtdebug > 1) {
653 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
654 			    "ip_mrouter_done: turning off forwarding");
655 		}
656 		ip_g_forward = saved_ip_g_forward;
657 		saved_ip_g_forward = -1;
658 	}
659 
660 	/*
661 	 * Always clear cache when vifs change.
662 	 * No need to get last_encap_lock since we are running as a writer.
663 	 */
664 	mutex_enter(&last_encap_lock);
665 	last_encap_src = 0;
666 	last_encap_vif = NULL;
667 	mutex_exit(&last_encap_lock);
668 	connp->conn_multi_router = 0;
669 
670 	mutex_exit(&ip_g_mrouter_mutex);
671 
672 	/*
673 	 * For each phyint in use,
674 	 * disable promiscuous reception of all IP multicasts.
675 	 */
676 	for (vifi = 0; vifi < MAXVIFS; vifi++) {
677 		struct vif *vifp = viftable + vifi;
678 
679 		mutex_enter(&vifp->v_lock);
680 		/*
681 		 * if the vif is active mark it condemned.
682 		 */
683 		if (vifp->v_marks & VIF_MARK_GOOD) {
684 			ASSERT(vifp->v_ipif != NULL);
685 			ipif_refhold(vifp->v_ipif);
686 			/* Phyint only */
687 			if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
688 				ipif_t *ipif = vifp->v_ipif;
689 				ipsq_t  *ipsq;
690 				boolean_t suc;
691 				ill_t *ill;
692 
693 				ill = ipif->ipif_ill;
694 				suc = B_FALSE;
695 				if (mp == NULL) {
696 					/*
697 					 * being called from ip_close,
698 					 * lets do it synchronously.
699 					 * Clear VIF_MARK_GOOD and
700 					 * set VIF_MARK_CONDEMNED.
701 					 */
702 					vifp->v_marks &= ~VIF_MARK_GOOD;
703 					vifp->v_marks |= VIF_MARK_CONDEMNED;
704 					mutex_exit(&(vifp)->v_lock);
705 					suc = ipsq_enter(ill, B_FALSE);
706 					ipsq = ill->ill_phyint->phyint_ipsq;
707 				} else {
708 					ipsq = ipsq_try_enter(ipif, NULL,
709 					    ip_g_mrouter, mp,
710 					    ip_restart_optmgmt, NEW_OP, B_TRUE);
711 					if (ipsq == NULL) {
712 						mutex_exit(&(vifp)->v_lock);
713 						return (EINPROGRESS);
714 					}
715 					/*
716 					 * Clear VIF_MARK_GOOD and
717 					 * set VIF_MARK_CONDEMNED.
718 					 */
719 					vifp->v_marks &= ~VIF_MARK_GOOD;
720 					vifp->v_marks |= VIF_MARK_CONDEMNED;
721 						mutex_exit(&(vifp)->v_lock);
722 					suc = B_TRUE;
723 				}
724 
725 				if (suc) {
726 					(void) ip_delmulti(INADDR_ANY, ipif,
727 					    B_TRUE, B_TRUE);
728 					ipsq_exit(ipsq, B_TRUE, B_TRUE);
729 				}
730 				mutex_enter(&vifp->v_lock);
731 			}
732 			/*
733 			 * decreases the refcnt added in add_vif.
734 			 * and release v_lock.
735 			 */
736 			VIF_REFRELE_LOCKED(vifp);
737 		} else {
738 			mutex_exit(&vifp->v_lock);
739 			continue;
740 		}
741 	}
742 
743 	mutex_enter(&numvifs_mutex);
744 	numvifs = 0;
745 	pim_assert = 0;
746 	reg_vif_num = ALL_VIFS;
747 	mutex_exit(&numvifs_mutex);
748 
749 	/*
750 	 * Free upcall msgs.
751 	 * Go through mfctable and stop any outstanding upcall
752 	 * timeouts remaining on mfcs.
753 	 */
754 	for (i = 0; i < MFCTBLSIZ; i++) {
755 		mutex_enter(&mfctable[i].mfcb_lock);
756 		mfctable[i].mfcb_refcnt++;
757 		mfctable[i].mfcb_marks |= MFCB_MARK_CONDEMNED;
758 		mutex_exit(&mfctable[i].mfcb_lock);
759 		mfc_rt = mfctable[i].mfcb_mfc;
760 		while (mfc_rt) {
761 			/* Free upcalls */
762 			mutex_enter(&mfc_rt->mfc_mutex);
763 			if (mfc_rt->mfc_rte != NULL) {
764 				if (mfc_rt->mfc_timeout_id != 0) {
765 					/*
766 					 * OK to drop the lock as we have
767 					 * a refcnt on the bucket. timeout
768 					 * can fire but it will see that
769 					 * mfc_timeout_id == 0 and not do
770 					 * anything. see expire_upcalls().
771 					 */
772 					mfc_rt->mfc_timeout_id = 0;
773 					mutex_exit(&mfc_rt->mfc_mutex);
774 					(void) untimeout(
775 					    mfc_rt->mfc_timeout_id);
776 						mfc_rt->mfc_timeout_id = 0;
777 					mutex_enter(&mfc_rt->mfc_mutex);
778 
779 					/*
780 					 * all queued upcall packets
781 					 * and mblk will be freed in
782 					 * release_mfc().
783 					 */
784 				}
785 			}
786 
787 			mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
788 
789 			mutex_exit(&mfc_rt->mfc_mutex);
790 			mfc_rt = mfc_rt->mfc_next;
791 		}
792 		MFCB_REFRELE(&mfctable[i]);
793 	}
794 
795 	mutex_enter(&ip_g_mrouter_mutex);
796 	ip_g_mrouter = NULL;
797 	mutex_exit(&ip_g_mrouter_mutex);
798 	return (0);
799 }
800 
801 static boolean_t
802 is_mrouter_off(void)
803 {
804 	conn_t	*connp;
805 
806 	mutex_enter(&ip_g_mrouter_mutex);
807 	if (ip_g_mrouter == NULL) {
808 		mutex_exit(&ip_g_mrouter_mutex);
809 		return (B_TRUE);
810 	}
811 
812 	connp = Q_TO_CONN(ip_g_mrouter);
813 	if (connp->conn_multi_router == 0) {
814 		mutex_exit(&ip_g_mrouter_mutex);
815 		return (B_TRUE);
816 	}
817 	mutex_exit(&ip_g_mrouter_mutex);
818 	return (B_FALSE);
819 }
820 
/*
 * Undo a successful lock_good_vif(): release the hold on the vif's ipif
 * and then the reference on the vif itself.  The ipif is released first
 * because VIF_REFRELE may end up in del_vifp(), which zeroes the vif.
 */
static void
unlock_good_vif(struct vif *vifp)
{
	ASSERT(vifp->v_ipif != NULL);
	ipif_refrele(vifp->v_ipif);
	VIF_REFRELE(vifp);
}
828 
829 static boolean_t
830 lock_good_vif(struct vif *vifp)
831 {
832 	mutex_enter(&vifp->v_lock);
833 	if (!(vifp->v_marks & VIF_MARK_GOOD)) {
834 		mutex_exit(&vifp->v_lock);
835 		return (B_FALSE);
836 	}
837 
838 	ASSERT(vifp->v_ipif != NULL);
839 	mutex_enter(&vifp->v_ipif->ipif_ill->ill_lock);
840 	if (!IPIF_CAN_LOOKUP(vifp->v_ipif)) {
841 		mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
842 		mutex_exit(&vifp->v_lock);
843 		return (B_FALSE);
844 	}
845 	ipif_refhold_locked(vifp->v_ipif);
846 	mutex_exit(&vifp->v_ipif->ipif_ill->ill_lock);
847 	vifp->v_refcnt++;
848 	mutex_exit(&vifp->v_lock);
849 	return (B_TRUE);
850 }
851 
852 /*
853  * Add a vif to the vif table.
854  */
855 static int
856 add_vif(struct vifctl *vifcp, queue_t *q, mblk_t *first_mp)
857 {
858 	struct vif	*vifp = viftable + vifcp->vifc_vifi;
859 	ipif_t		*ipif;
860 	int		error;
861 	struct tbf	*v_tbf = tbftable + vifcp->vifc_vifi;
862 	conn_t   	*connp = Q_TO_CONN(q);
863 	ipsq_t  	*ipsq;
864 
865 	ASSERT(connp != NULL);
866 
867 	if (vifcp->vifc_vifi >= MAXVIFS)
868 		return (EINVAL);
869 
870 	if (is_mrouter_off())
871 		return (EINVAL);
872 
873 	mutex_enter(&vifp->v_lock);
874 	/*
875 	 * Viftable entry should be 0.
876 	 * if v_marks == 0 but v_refcnt != 0 means struct is being
877 	 * initialized.
878 	 *
879 	 * Also note that it is very unlikely that we will get a MRT_ADD_VIF
880 	 * request while the delete is in progress, mrouted only sends add
881 	 * requests when a new interface is added and the new interface cannot
882 	 * have the same vifi as an existing interface. We make sure that
883 	 * ill_delete will block till the vif is deleted by adding a refcnt
884 	 * to ipif in del_vif().
885 	 */
886 	if (vifp->v_lcl_addr.s_addr != 0 ||
887 	    vifp->v_marks != 0 ||
888 	    vifp->v_refcnt != 0) {
889 		mutex_exit(&vifp->v_lock);
890 		return (EADDRINUSE);
891 	}
892 
893 	/* Incoming vif should not be 0 */
894 	if (vifcp->vifc_lcl_addr.s_addr == 0) {
895 		mutex_exit(&vifp->v_lock);
896 		return (EINVAL);
897 	}
898 
899 	vifp->v_refcnt++;
900 	mutex_exit(&vifp->v_lock);
901 	/* Find the interface with the local address */
902 	ipif = ipif_lookup_addr((ipaddr_t)vifcp->vifc_lcl_addr.s_addr, NULL,
903 	    connp->conn_zoneid, CONNP_TO_WQ(connp), first_mp,
904 	    ip_restart_optmgmt, &error);
905 	if (ipif == NULL) {
906 		VIF_REFRELE(vifp);
907 		if (error == EINPROGRESS)
908 			return (error);
909 		return (EADDRNOTAVAIL);
910 	}
911 
912 	/*
913 	 * We have to be exclusive as we have to call ip_addmulti()
914 	 * This is the best position to try to be exclusive in case
915 	 * we have to wait.
916 	 */
917 	ipsq = ipsq_try_enter(ipif, NULL, CONNP_TO_WQ(connp), first_mp,
918 	    ip_restart_optmgmt, NEW_OP, B_TRUE);
919 	if ((ipsq) == NULL) {
920 		VIF_REFRELE(vifp);
921 		ipif_refrele(ipif);
922 		return (EINPROGRESS);
923 	}
924 
925 	if (ip_mrtdebug > 1) {
926 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
927 		    "add_vif: src 0x%x enter",
928 		    vifcp->vifc_lcl_addr.s_addr);
929 	}
930 
931 	mutex_enter(&vifp->v_lock);
932 	/*
933 	 * Always clear cache when vifs change.
934 	 * Needed to ensure that src isn't left over from before vif was added.
935 	 * No need to get last_encap_lock, since we are running as a writer.
936 	 */
937 
938 	mutex_enter(&last_encap_lock);
939 	last_encap_src = 0;
940 	last_encap_vif = NULL;
941 	mutex_exit(&last_encap_lock);
942 
943 	if (vifcp->vifc_flags & VIFF_TUNNEL) {
944 		if ((vifcp->vifc_flags & VIFF_SRCRT) != 0) {
945 			cmn_err(CE_WARN,
946 			    "add_vif: source route tunnels not supported\n");
947 			VIF_REFRELE_LOCKED(vifp);
948 			ipif_refrele(ipif);
949 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
950 			return (EOPNOTSUPP);
951 		}
952 		vifp->v_rmt_addr  = vifcp->vifc_rmt_addr;
953 
954 	} else {
955 		/* Phyint or Register vif */
956 		if (vifcp->vifc_flags & VIFF_REGISTER) {
957 			/*
958 			 * Note: Since all IPPROTO_IP level options (including
959 			 * MRT_ADD_VIF) are done exclusively via
960 			 * ip_optmgmt_writer(), a lock is not necessary to
961 			 * protect reg_vif_num.
962 			 */
963 			mutex_enter(&numvifs_mutex);
964 			if (reg_vif_num == ALL_VIFS) {
965 				reg_vif_num = vifcp->vifc_vifi;
966 				mutex_exit(&numvifs_mutex);
967 			} else {
968 				mutex_exit(&numvifs_mutex);
969 				VIF_REFRELE_LOCKED(vifp);
970 				ipif_refrele(ipif);
971 				ipsq_exit(ipsq, B_TRUE, B_TRUE);
972 				return (EADDRINUSE);
973 			}
974 		}
975 
976 		/* Make sure the interface supports multicast */
977 		if ((ipif->ipif_ill->ill_flags & ILLF_MULTICAST) == 0) {
978 			VIF_REFRELE_LOCKED(vifp);
979 			ipif_refrele(ipif);
980 			if (vifcp->vifc_flags & VIFF_REGISTER) {
981 				mutex_enter(&numvifs_mutex);
982 				reg_vif_num = ALL_VIFS;
983 				mutex_exit(&numvifs_mutex);
984 			}
985 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
986 			return (EOPNOTSUPP);
987 		}
988 		/* Enable promiscuous reception of all IP mcasts from the if */
989 		mutex_exit(&vifp->v_lock);
990 		error = ip_addmulti(INADDR_ANY, ipif, ILGSTAT_NONE,
991 		    MODE_IS_EXCLUDE, NULL);
992 		mutex_enter(&vifp->v_lock);
993 		/*
994 		 * since we released the lock lets make sure that
995 		 * ip_mrouter_done() has not been called.
996 		 */
997 		if (error != 0 || is_mrouter_off()) {
998 			if (error == 0)
999 				(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE,
1000 				    B_TRUE);
1001 			if (vifcp->vifc_flags & VIFF_REGISTER) {
1002 				mutex_enter(&numvifs_mutex);
1003 				reg_vif_num = ALL_VIFS;
1004 				mutex_exit(&numvifs_mutex);
1005 			}
1006 			VIF_REFRELE_LOCKED(vifp);
1007 			ipif_refrele(ipif);
1008 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1009 			return (error?error:EINVAL);
1010 		}
1011 	}
1012 	/* Define parameters for the tbf structure */
1013 	vifp->v_tbf = v_tbf;
1014 	gethrestime(&vifp->v_tbf->tbf_last_pkt_t);
1015 	vifp->v_tbf->tbf_n_tok = 0;
1016 	vifp->v_tbf->tbf_q_len = 0;
1017 	vifp->v_tbf->tbf_max_q_len = MAXQSIZE;
1018 	vifp->v_tbf->tbf_q = vifp->v_tbf->tbf_t = NULL;
1019 
1020 	vifp->v_flags = vifcp->vifc_flags;
1021 	vifp->v_threshold = vifcp->vifc_threshold;
1022 	vifp->v_lcl_addr = vifcp->vifc_lcl_addr;
1023 	vifp->v_ipif = ipif;
1024 	ipif_refrele(ipif);
1025 	/* Scaling up here, allows division by 1024 in critical code.	*/
1026 	vifp->v_rate_limit = vifcp->vifc_rate_limit * (1024/1000);
1027 	vifp->v_timeout_id = 0;
1028 	/* initialize per vif pkt counters */
1029 	vifp->v_pkt_in = 0;
1030 	vifp->v_pkt_out = 0;
1031 	vifp->v_bytes_in = 0;
1032 	vifp->v_bytes_out = 0;
1033 	mutex_init(&vifp->v_tbf->tbf_lock, NULL, MUTEX_DEFAULT, NULL);
1034 
1035 	/* Adjust numvifs up, if the vifi is higher than numvifs */
1036 	mutex_enter(&numvifs_mutex);
1037 	if (numvifs <= vifcp->vifc_vifi)
1038 		numvifs = vifcp->vifc_vifi + 1;
1039 	mutex_exit(&numvifs_mutex);
1040 
1041 	if (ip_mrtdebug > 1) {
1042 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1043 		    "add_vif: #%d, lcladdr %x, %s %x, thresh %x, rate %d",
1044 		    vifcp->vifc_vifi,
1045 		    ntohl(vifcp->vifc_lcl_addr.s_addr),
1046 		    (vifcp->vifc_flags & VIFF_TUNNEL) ? "rmtaddr" : "mask",
1047 		    ntohl(vifcp->vifc_rmt_addr.s_addr),
1048 		    vifcp->vifc_threshold, vifcp->vifc_rate_limit);
1049 	}
1050 
1051 	vifp->v_marks = VIF_MARK_GOOD;
1052 	mutex_exit(&vifp->v_lock);
1053 	ipsq_exit(ipsq, B_TRUE, B_TRUE);
1054 	return (0);
1055 }
1056 
1057 
1058 /* Delete a vif from the vif table. */
1059 static void
1060 del_vifp(struct vif *vifp)
1061 {
1062 	struct tbf	*t = vifp->v_tbf;
1063 	mblk_t  *mp0;
1064 	vifi_t  vifi;
1065 
1066 
1067 	ASSERT(vifp->v_marks & VIF_MARK_CONDEMNED);
1068 	ASSERT(t != NULL);
1069 
1070 	/*
1071 	 * release the ref we put in vif_del.
1072 	 */
1073 	ASSERT(vifp->v_ipif != NULL);
1074 	ipif_refrele(vifp->v_ipif);
1075 
1076 	if (ip_mrtdebug > 1) {
1077 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1078 		    "del_vif: src 0x%x\n", vifp->v_lcl_addr.s_addr);
1079 	}
1080 
1081 	if (vifp->v_timeout_id != 0) {
1082 		(void) untimeout(vifp->v_timeout_id);
1083 		vifp->v_timeout_id = 0;
1084 	}
1085 
1086 	/*
1087 	 * Free packets queued at the interface.
1088 	 * Mrouted takes care of cleaning up mfcs - makes calls to del_mfc.
1089 	 */
1090 	mutex_enter(&t->tbf_lock);
1091 	while (t->tbf_q != NULL) {
1092 		mp0 = t->tbf_q;
1093 		t->tbf_q = t->tbf_q->b_next;
1094 		mp0->b_prev = mp0->b_next = NULL;
1095 		freemsg(mp0);
1096 	}
1097 	mutex_exit(&t->tbf_lock);
1098 
1099 	/*
1100 	 * Always clear cache when vifs change.
1101 	 * No need to get last_encap_lock since we are running as a writer.
1102 	 */
1103 	mutex_enter(&last_encap_lock);
1104 	if (vifp == last_encap_vif) {
1105 		last_encap_vif = NULL;
1106 		last_encap_src = 0;
1107 	}
1108 	mutex_exit(&last_encap_lock);
1109 
1110 	mutex_destroy(&t->tbf_lock);
1111 
1112 	bzero(vifp->v_tbf, sizeof (*(vifp->v_tbf)));
1113 
1114 	/* Adjust numvifs down */
1115 	mutex_enter(&numvifs_mutex);
1116 	for (vifi = numvifs; vifi != 0; vifi--) /* vifi is unsigned */
1117 		if (viftable[vifi - 1].v_lcl_addr.s_addr != 0)
1118 			break;
1119 	numvifs = vifi;
1120 	mutex_exit(&numvifs_mutex);
1121 
1122 	bzero(vifp, sizeof (*vifp));
1123 }
1124 
1125 static int
1126 del_vif(vifi_t *vifip, queue_t *q, mblk_t *first_mp)
1127 {
1128 	struct vif	*vifp = viftable + *vifip;
1129 	conn_t		*connp;
1130 	ipsq_t  	*ipsq;
1131 
1132 	if (*vifip >= numvifs)
1133 		return (EINVAL);
1134 
1135 
1136 	mutex_enter(&vifp->v_lock);
1137 	/*
1138 	 * Not initialized
1139 	 * Here we are not looking at the vif that is being initialized
1140 	 * i.e vifp->v_marks == 0 and refcnt > 0.
1141 	 */
1142 	if (vifp->v_lcl_addr.s_addr == 0 ||
1143 	    !(vifp->v_marks & VIF_MARK_GOOD)) {
1144 		mutex_exit(&vifp->v_lock);
1145 		return (EADDRNOTAVAIL);
1146 	}
1147 
1148 	/*
1149 	 * This is an optimization, if first_mp == NULL
1150 	 * than we are being called from reset_mrt_vif_ipif()
1151 	 * so we already have exclusive access to the ipsq.
1152 	 * the ASSERT below is a check for this condition.
1153 	 */
1154 	if (first_mp != NULL &&
1155 	    !(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1156 		connp = Q_TO_CONN(q);
1157 		ASSERT(connp != NULL);
1158 		/*
1159 		 * We have to be exclusive as we have to call ip_delmulti()
1160 		 * This is the best position to try to be exclusive in case
1161 		 * we have to wait.
1162 		 */
1163 		ipsq = ipsq_try_enter(vifp->v_ipif, NULL, CONNP_TO_WQ(connp),
1164 		    first_mp, ip_restart_optmgmt, NEW_OP, B_TRUE);
1165 		if ((ipsq) == NULL) {
1166 			mutex_exit(&vifp->v_lock);
1167 			return (EINPROGRESS);
1168 		}
1169 		/* recheck after being exclusive */
1170 		if (vifp->v_lcl_addr.s_addr == 0 ||
1171 		    !vifp->v_marks & VIF_MARK_GOOD) {
1172 			/*
1173 			 * someone beat us.
1174 			 */
1175 			mutex_exit(&vifp->v_lock);
1176 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1177 			return (EADDRNOTAVAIL);
1178 		}
1179 	}
1180 
1181 
1182 	ASSERT(IAM_WRITER_IPIF(vifp->v_ipif));
1183 
1184 
1185 	/*
1186 	 * add a refhold so that ipif does not go away while
1187 	 * there are still users, this will be released in del_vifp
1188 	 * when we free the vif.
1189 	 */
1190 	ipif_refhold(vifp->v_ipif);
1191 
1192 	/* Clear VIF_MARK_GOOD and set VIF_MARK_CONDEMNED. */
1193 	vifp->v_marks &= ~VIF_MARK_GOOD;
1194 	vifp->v_marks |= VIF_MARK_CONDEMNED;
1195 
1196 	/* Phyint only */
1197 	if (!(vifp->v_flags & (VIFF_TUNNEL | VIFF_REGISTER))) {
1198 		ipif_t *ipif = vifp->v_ipif;
1199 		ASSERT(ipif != NULL);
1200 		/*
1201 		 * should be OK to drop the lock as we
1202 		 * have marked this as CONDEMNED.
1203 		 */
1204 		mutex_exit(&(vifp)->v_lock);
1205 		(void) ip_delmulti(INADDR_ANY, ipif, B_TRUE, B_TRUE);
1206 		if (first_mp != NULL)
1207 			ipsq_exit(ipsq, B_TRUE, B_TRUE);
1208 		mutex_enter(&(vifp)->v_lock);
1209 	}
1210 
1211 	/*
1212 	 * decreases the refcnt added in add_vif.
1213 	 */
1214 	VIF_REFRELE_LOCKED(vifp);
1215 	return (0);
1216 }
1217 
/*
 * Add an mfc entry.
 *
 * Installs or updates the multicast forwarding cache entry described by
 * mfccp (origin, group, parent vif and per-vif ttls).  Three cases are
 * handled, in this order:
 *   1. MFCFIND locates an entry: only its parent and ttls are overwritten.
 *   2. One or more non-condemned entries with queued upcall packets
 *	(mfc_rte != NULL) match: each is filled in via fill_route(), its
 *	pending timeout is cancelled and the queued packets are passed to
 *	ip_mdq() and freed.
 *   3. Neither of the above: under the bucket lock an existing matching
 *	entry is refilled, or a new one is allocated and linked at the head
 *	of the hash chain.
 *
 * Returns 0 on success, EINVAL if the parent vif is out of range, has a
 * NULL v_ipif, or is_mrouter_off() reports true, and ENOBUFS if a new
 * entry cannot be allocated.
 */
static int
add_mfc(struct mfcctl *mfccp)
{
	struct mfc *rt;
	struct rtdetq *rte;
	ushort_t nstl;
	int i;
	struct mfcb *mfcbp;

	/*
	 * The value of vifi is NO_VIF (==MAXVIFS) if Mrouted
	 * did not have a real route for pkt.
	 * We want this pkt without rt installed in the mfctable to prevent
	 * multiple tries, so go ahead and put it in mfctable, it will
	 * be discarded later in ip_mdq() because the child is NULL.
	 */

	/*
	 * Error checking, out of bounds?
	 * NOTE(review): MAXVIFS itself is let through so that NO_VIF is
	 * accepted; this relies on NO_VIF == MAXVIFS as stated above.
	 */
	if (mfccp->mfcc_parent > MAXVIFS) {
		ip0dbg(("ADD_MFC: mfcc_parent out of range %d",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	/* A real parent vif must currently be bound to an ipif. */
	if ((mfccp->mfcc_parent != NO_VIF) &&
	    (viftable[mfccp->mfcc_parent].v_ipif == NULL)) {
		ip0dbg(("ADD_MFC: NULL ipif for parent vif %d\n",
		    (int)mfccp->mfcc_parent));
		return (EINVAL);
	}

	if (is_mrouter_off()) {
		return (EINVAL);
	}

	/*
	 * Hold a reference on the hash bucket for the rest of the function;
	 * every return path below drops it with MFCB_REFRELE.
	 */
	mfcbp = &mfctable[MFCHASH(mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, mfccp->mfcc_origin.s_addr,
	    mfccp->mfcc_mcastgrp.s_addr, rt);

	/* If an entry already exists, just update the fields */
	if (rt) {
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "add_mfc: update o %x grp %x parent %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		mutex_enter(&rt->mfc_mutex);
		rt->mfc_parent = mfccp->mfcc_parent;

		/* Only the first numvifs ttl slots are copied. */
		mutex_enter(&numvifs_mutex);
		for (i = 0; i < (int)numvifs; i++)
			rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
		mutex_exit(&numvifs_mutex);
		mutex_exit(&rt->mfc_mutex);

		MFCB_REFRELE(mfcbp);
		return (0);
	}

	/*
	 * Find the entry for which the upcall was made and update.
	 * nstl counts how many matching entries with queued packets exist.
	 */
	for (rt = mfcbp->mfcb_mfc, nstl = 0; rt; rt = rt->mfc_next) {
		mutex_enter(&rt->mfc_mutex);
		if ((rt->mfc_origin.s_addr == mfccp->mfcc_origin.s_addr) &&
		    (rt->mfc_mcastgrp.s_addr == mfccp->mfcc_mcastgrp.s_addr) &&
		    (rt->mfc_rte != NULL) &&
		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
			/* More than one match is unexpected; warn. */
			if (nstl++ != 0)
				cmn_err(CE_WARN,
				    "add_mfc: %s o %x g %x p %x",
				    "multiple kernel entries",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);

			if (ip_mrtdebug > 1) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "add_mfc: o %x g %x p %x",
				    ntohl(mfccp->mfcc_origin.s_addr),
				    ntohl(mfccp->mfcc_mcastgrp.s_addr),
				    mfccp->mfcc_parent);
			}
			fill_route(rt, mfccp);

			/*
			 * Prevent cleanup of cache entry.
			 * Timer starts in ip_mforward.
			 */
			if (rt->mfc_timeout_id != 0) {
				timeout_id_t id;
				id = rt->mfc_timeout_id;
				/*
				 * setting id to zero will avoid this
				 * entry from being cleaned up in
				 * expire_up_calls().
				 */
				rt->mfc_timeout_id = 0;
				/*
				 * dropping the lock is fine as we
				 * have a refhold on the bucket.
				 * so mfc cannot be freed.
				 * The timeout can fire but it will see
				 * that mfc_timeout_id == 0 and not cleanup.
				 */
				mutex_exit(&rt->mfc_mutex);
				(void) untimeout(id);
				mutex_enter(&rt->mfc_mutex);
			}

			/*
			 * Send all pkts that are queued waiting for the upcall.
			 * ip_mdq param tun set to 0 -
			 * the return value of ip_mdq() isn't used here,
			 * so value we send doesn't matter.
			 * Each queue element is unlinked under mfc_mutex, and
			 * the mutex is dropped around the ip_mdq() call.
			 */
			while (rt->mfc_rte != NULL) {
				rte = rt->mfc_rte;
				rt->mfc_rte = rte->rte_next;
				mutex_exit(&rt->mfc_mutex);
				(void) ip_mdq(rte->mp, (ipha_t *)
				    rte->mp->b_rptr, rte->ill, 0, rt);
				freemsg(rte->mp);
				mi_free((char *)rte);
				mutex_enter(&rt->mfc_mutex);
			}
		}
		mutex_exit(&rt->mfc_mutex);
	}


	/*
	 * It is possible that an entry is being inserted without an upcall
	 */
	if (nstl == 0) {
		mutex_enter(&(mfcbp->mfcb_lock));
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "add_mfc: no upcall o %x g %x p %x",
			    ntohl(mfccp->mfcc_origin.s_addr),
			    ntohl(mfccp->mfcc_mcastgrp.s_addr),
			    mfccp->mfcc_parent);
		}
		/* Re-check now that the bucket lock is held. */
		if (is_mrouter_off()) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (EINVAL);
		}

		/* Reuse a live entry for this (origin, group) if one exists. */
		for (rt = mfcbp->mfcb_mfc; rt; rt = rt->mfc_next) {

			mutex_enter(&rt->mfc_mutex);
			if ((rt->mfc_origin.s_addr ==
			    mfccp->mfcc_origin.s_addr) &&
			    (rt->mfc_mcastgrp.s_addr ==
				mfccp->mfcc_mcastgrp.s_addr) &&
				(!(rt->mfc_marks & MFCB_MARK_CONDEMNED))) {
				fill_route(rt, mfccp);
				mutex_exit(&rt->mfc_mutex);
				break;
			}
			mutex_exit(&rt->mfc_mutex);
		}

		/* No upcall, so make a new entry into mfctable */
		if (rt == NULL) {
			rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (rt == NULL) {
				ip1dbg(("add_mfc: out of memory\n"));
				mutex_exit(&mfcbp->mfcb_lock);
				MFCB_REFRELE(mfcbp);
				return (ENOBUFS);
			}

			/*
			 * Insert new entry at head of hash chain
			 * NOTE(review): mfc_mutex is entered on freshly
			 * zeroed memory with no mutex_init() visible here.
			 */
			mutex_enter(&rt->mfc_mutex);
			fill_route(rt, mfccp);

			/* Link into table */
			rt->mfc_next   = mfcbp->mfcb_mfc;
			mfcbp->mfcb_mfc = rt;
			mutex_exit(&rt->mfc_mutex);
		}
		mutex_exit(&mfcbp->mfcb_lock);
	}

	MFCB_REFRELE(mfcbp);
	return (0);
}
1414 
1415 /*
1416  * Fills in mfc structure from mrouted mfcctl.
1417  */
1418 static void
1419 fill_route(struct mfc *rt, struct mfcctl *mfccp)
1420 {
1421 	int i;
1422 
1423 	rt->mfc_origin		= mfccp->mfcc_origin;
1424 	rt->mfc_mcastgrp	= mfccp->mfcc_mcastgrp;
1425 	rt->mfc_parent		= mfccp->mfcc_parent;
1426 	mutex_enter(&numvifs_mutex);
1427 	for (i = 0; i < (int)numvifs; i++) {
1428 		rt->mfc_ttls[i] = mfccp->mfcc_ttls[i];
1429 	}
1430 	mutex_exit(&numvifs_mutex);
1431 	/* Initialize pkt counters per src-grp */
1432 	rt->mfc_pkt_cnt	= 0;
1433 	rt->mfc_byte_cnt	= 0;
1434 	rt->mfc_wrong_if	= 0;
1435 	rt->mfc_last_assert.tv_sec = rt->mfc_last_assert.tv_nsec = 0;
1436 
1437 }
1438 
1439 static void
1440 free_queue(struct mfc *mfcp)
1441 {
1442 	struct rtdetq *rte0;
1443 
1444 	/*
1445 	 * Drop all queued upcall packets.
1446 	 * Free the mbuf with the pkt.
1447 	 */
1448 	while ((rte0 = mfcp->mfc_rte) != NULL) {
1449 		mfcp->mfc_rte = rte0->rte_next;
1450 		freemsg(rte0->mp);
1451 		mi_free((char *)rte0);
1452 	}
1453 }
1454 /*
1455  * go thorugh the hash bucket and free all the entries marked condemned.
1456  */
1457 void
1458 release_mfc(struct mfcb *mfcbp)
1459 {
1460 	struct mfc *current_mfcp;
1461 	struct mfc *prev_mfcp;
1462 
1463 	prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1464 
1465 	while (current_mfcp != NULL) {
1466 		if (current_mfcp->mfc_marks & MFCB_MARK_CONDEMNED) {
1467 			if (current_mfcp == mfcbp->mfcb_mfc) {
1468 				mfcbp->mfcb_mfc = current_mfcp->mfc_next;
1469 				free_queue(current_mfcp);
1470 				mi_free(current_mfcp);
1471 				prev_mfcp = current_mfcp = mfcbp->mfcb_mfc;
1472 				continue;
1473 			}
1474 			ASSERT(prev_mfcp != NULL);
1475 			prev_mfcp->mfc_next = current_mfcp->mfc_next;
1476 			free_queue(current_mfcp);
1477 			mi_free(current_mfcp);
1478 			current_mfcp = NULL;
1479 		} else {
1480 			prev_mfcp = current_mfcp;
1481 		}
1482 
1483 		current_mfcp = prev_mfcp->mfc_next;
1484 
1485 	}
1486 	mfcbp->mfcb_marks &= ~MFCB_MARK_CONDEMNED;
1487 	ASSERT(mfcbp->mfcb_mfc != NULL || mfcbp->mfcb_marks == 0);
1488 }
1489 
1490 /*
1491  * Delete an mfc entry.
1492  */
1493 static int
1494 del_mfc(struct mfcctl *mfccp)
1495 {
1496 	struct in_addr	origin;
1497 	struct in_addr	mcastgrp;
1498 	struct mfc 		*rt;
1499 	uint_t			hash;
1500 
1501 	origin = mfccp->mfcc_origin;
1502 	mcastgrp = mfccp->mfcc_mcastgrp;
1503 	hash = MFCHASH(origin.s_addr, mcastgrp.s_addr);
1504 
1505 	if (ip_mrtdebug > 1) {
1506 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1507 		    "del_mfc: o %x g %x",
1508 		    ntohl(origin.s_addr),
1509 		    ntohl(mcastgrp.s_addr));
1510 	}
1511 
1512 	MFCB_REFHOLD(&mfctable[hash]);
1513 
1514 	/* Find mfc in mfctable, finds only entries without upcalls */
1515 	for (rt = mfctable[hash].mfcb_mfc; rt; rt = rt->mfc_next) {
1516 		mutex_enter(&rt->mfc_mutex);
1517 		if (origin.s_addr == rt->mfc_origin.s_addr &&
1518 		    mcastgrp.s_addr == rt->mfc_mcastgrp.s_addr &&
1519 		    rt->mfc_rte == NULL &&
1520 		    !(rt->mfc_marks & MFCB_MARK_CONDEMNED))
1521 			break;
1522 		mutex_exit(&rt->mfc_mutex);
1523 	}
1524 
1525 	/*
1526 	 * Return if there was an upcall (mfc_rte != NULL,
1527 	 * or rt not in mfctable.
1528 	 */
1529 	if (rt == NULL) {
1530 		MFCB_REFRELE(&mfctable[hash]);
1531 		return (EADDRNOTAVAIL);
1532 	}
1533 
1534 
1535 	/*
1536 	 * no need to hold lock as we have a reference.
1537 	 */
1538 	mfctable[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1539 	/* error checking */
1540 	if (rt->mfc_timeout_id != 0) {
1541 		ip0dbg(("del_mfc: TIMEOUT NOT 0, rte not null"));
1542 		/*
1543 		 * Its ok to drop the lock,  the struct cannot be freed
1544 		 * since we have a ref on the hash bucket.
1545 		 */
1546 		rt->mfc_timeout_id = 0;
1547 		mutex_exit(&rt->mfc_mutex);
1548 		(void) untimeout(rt->mfc_timeout_id);
1549 		mutex_enter(&rt->mfc_mutex);
1550 	}
1551 
1552 	ASSERT(rt->mfc_rte == NULL);
1553 
1554 
1555 	/*
1556 	 * Delete the entry from the cache
1557 	 */
1558 	rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1559 	mutex_exit(&rt->mfc_mutex);
1560 
1561 	MFCB_REFRELE(&mfctable[hash]);
1562 
1563 	return (0);
1564 }
1565 
#define	TUNNEL_LEN  12  /* # bytes of IP option for tunnel encapsulation  */

/*
 * IP multicast forwarding function. This function assumes that the packet
 * pointed to by ipha has arrived on (or is about to be sent to) the interface
 * pointed to by "ill", and the packet is to be relayed to other networks
 * that have members of the packet's destination IP multicast group.
 *
 * The packet is returned unscathed to the caller, unless it is
 * erroneous, in which case a -1 value tells the caller (IP)
 * to discard it.
 *
 * Unlike BSD, SunOS 5.x needs to return to IP info about
 * whether pkt came in thru a tunnel, so it can be discarded, unless
 * it's IGMP. In BSD, the ifp is bogus for tunnels, so pkt won't try
 * to be delivered.
 *
 * On entry mp->b_prev carries either PIM_REGISTER_MARKER or the tunnel
 * source address (0 for a packet from a physical interface); it is reset
 * to NULL before the packet is handed on.
 *
 * If a cache entry is found the packet goes to ip_mdq() and its return
 * value is passed back.  Otherwise a copy of the packet is queued on a
 * pending mfc entry and, for the first packet queued on a new entry, an
 * IGMPMSG_NOCACHE upcall is sent to the routing daemon.
 *
 * Return values are 0 - pkt is okay and phyint
 *		    -1 - pkt is malformed and to be tossed (also returned
 *			 when multicast routing is off, memory cannot be
 *			 allocated, or the upcall queue has overflowed)
 *                   1 - pkt came in on tunnel
 */
int
ip_mforward(ill_t *ill, ipha_t *ipha, mblk_t *mp)
{
	struct mfc 	*rt;
	ipaddr_t	src, dst, tunnel_src = 0;
	static int	srctun = 0;
	vifi_t		vifi;
	boolean_t	pim_reg_packet = B_FALSE;
	struct mfcb *mfcbp;

	if (ip_mrtdebug > 1) {
		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
		    "ip_mforward: RECV ipha_src %x, ipha_dst %x, ill %s",
		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
		    ill->ill_name);
	}

	dst = ipha->ipha_dst;
	/* Decode the marker / tunnel source stashed in b_prev. */
	if ((uint32_t)(uintptr_t)mp->b_prev == PIM_REGISTER_MARKER)
		pim_reg_packet = B_TRUE;
	else
		tunnel_src = (ipaddr_t)(uintptr_t)mp->b_prev;

	/*
	 * Don't forward a packet with time-to-live of zero or one,
	 * or a packet destined to a local-only group.
	 */
	if (CLASSD(dst) && (ipha->ipha_ttl <= 1 ||
			(ipaddr_t)ntohl(dst) <= INADDR_MAX_LOCAL_GROUP)) {
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: not forwarded ttl %d,"
			    " dst 0x%x ill %s",
			    ipha->ipha_ttl, ntohl(dst), ill->ill_name);
		}
		mp->b_prev = NULL;
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
	}

	if ((tunnel_src != 0) || pim_reg_packet) {
		/*
		 * Packet arrived over an encapsulated tunnel or via a PIM
		 * register message. Both ip_mroute_decap() and pim_input()
		 * encode information in mp->b_prev.
		 */
		mp->b_prev = NULL;
		if (ip_mrtdebug > 1) {
			if (tunnel_src != 0) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: ill %s arrived via ENCAP TUN",
				    ill->ill_name);
			} else if (pim_reg_packet) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: ill %s arrived via"
				    "  REGISTER VIF",
				    ill->ill_name);
			}
		}
	/*
	 * Header too short to hold TUNNEL_LEN bytes of options, or the
	 * second byte after the simple header is not IPOPT_LSRR.
	 */
	} else if ((ipha->ipha_version_and_hdr_length & 0xf) <
	    (uint_t)(IP_SIMPLE_HDR_LENGTH + TUNNEL_LEN) >> 2 ||
	    ((uchar_t *)(ipha + 1))[1] != IPOPT_LSRR) {
		/* Packet arrived via a physical interface. */
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: ill %s arrived via PHYINT",
			    ill->ill_name);
		}

	} else {
		/*
		 * Packet arrived through a SRCRT tunnel.
		 * Source-route tunnels are no longer supported.
		 * Error message printed every 1000 times.
		 */
		if ((srctun++ % 1000) == 0) {
			cmn_err(CE_WARN,
			    "ip_mforward: received source-routed pkt from %x",
			    ntohl(ipha->ipha_src));
		}
		return (-1);
	}

	mrtstat.mrts_fwd_in++;
	src = ipha->ipha_src;

	/* Find route in cache, return NULL if not there or upcalls q'ed. */

	/*
	 * A reference is taken on the hash bucket (MFCB_REFHOLD) before the
	 * lookup and is dropped with MFCB_REFRELE on every return path below.
	 * NOTE(review): an earlier version of this comment stated that only
	 * add_mfc and del_mfc can remove entries and that they run with
	 * exclusive access to IP; that cannot be confirmed from this function.
	 */

	if (is_mrouter_off())
		return (-1);

	mfcbp = &mfctable[MFCHASH(src, dst)];
	MFCB_REFHOLD(mfcbp);
	MFCFIND(mfcbp, src, dst, rt);

	/* Entry exists, so forward if necessary */
	if (rt != NULL) {
		int ret = 0;
		mrtstat.mrts_mfc_hits++;
		if (pim_reg_packet) {
			/* Register packets are attributed to the register vif. */
			ASSERT(reg_vif_num != ALL_VIFS);
			ret = ip_mdq(mp, ipha,
			    viftable[reg_vif_num].v_ipif->ipif_ill, 0, rt);
		} else {
			ret = ip_mdq(mp, ipha, ill, tunnel_src, rt);
		}

		MFCB_REFRELE(mfcbp);
		return (ret);

		/*
		 * Don't forward if we don't have a cache entry.  Mrouted will
		 * always provide a cache entry in response to an upcall.
		 */
	} else {
		/*
		 * If we don't have a route for packet's origin, make a copy
		 * of the packet and send message to routing daemon.
		 */
		struct mfc	*mfc_rt	 = NULL;
		mblk_t		*mp0	 = NULL;
		mblk_t		*mp_copy = NULL;
		struct rtdetq	*rte	 = NULL;
		struct rtdetq	*rte_m, *rte1, *prev_rte;
		uint_t		hash;
		int		npkts;
		boolean_t	new_mfc = B_FALSE;
		mrtstat.mrts_mfc_misses++;
		/* BSD uses mrts_no_route++ */
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: no rte ill %s src %x g %x misses %d",
			    ill->ill_name, ntohl(src), ntohl(dst),
			    (int)mrtstat.mrts_mfc_misses);
		}
		/*
		 * The order of the following code differs from the BSD code.
		 * Pre-mc3.5, the BSD code was incorrect and SunOS 5.x
		 * code works, so SunOS 5.x wasn't changed to conform to the
		 * BSD version.
		 */

		/* Lock mfctable. (Same bucket as mfcbp.) */
		hash = MFCHASH(src, dst);
		mutex_enter(&(mfctable[hash].mfcb_lock));

		/*
		 * If we are turning off mrouted return an error
		 */
		if (is_mrouter_off()) {
			mutex_exit(&mfcbp->mfcb_lock);
			MFCB_REFRELE(mfcbp);
			return (-1);
		}

		/*
		 * Is there an upcall waiting for this packet?
		 * On a match the loop exits with mfc_rt->mfc_mutex held.
		 */
		for (mfc_rt = mfctable[hash].mfcb_mfc; mfc_rt;
		    mfc_rt = mfc_rt->mfc_next) {
			mutex_enter(&mfc_rt->mfc_mutex);
			if (ip_mrtdebug > 1) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: MFCTAB hash %d o 0x%x"
				    " g 0x%x\n",
				    hash, ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			/* There is an upcall */
			if ((src == mfc_rt->mfc_origin.s_addr) &&
			    (dst == mfc_rt->mfc_mcastgrp.s_addr) &&
			    (mfc_rt->mfc_rte != NULL) &&
			    !(mfc_rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
				break;
			}
			mutex_exit(&mfc_rt->mfc_mutex);
		}
		/* No upcall, so make a new entry into mfctable */
		if (mfc_rt == NULL) {
			mfc_rt = (struct mfc *)mi_zalloc(sizeof (struct mfc));
			if (mfc_rt == NULL) {
				mrtstat.mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory "
				    "for mfc, mfc_rt\n"));
				goto error_return;
			} else
				new_mfc = B_TRUE;
			/* Get resources */
			/* TODO could copy header and dup rest */
			/* mp_copy becomes the upcall message sent to mrouted. */
			mp_copy = copymsg(mp);
			if (mp_copy == NULL) {
				mrtstat.mrts_fwd_drop++;
				ip1dbg(("ip_mforward: out of memory for "
				    "mblk, mp_copy\n"));
				goto error_return;
			}
			mutex_enter(&mfc_rt->mfc_mutex);
		}
		/* Get resources for rte, whether first rte or not first. */
		/* Add this packet into rtdetq */
		rte = (struct rtdetq *)mi_zalloc(sizeof (struct rtdetq));
		if (rte == NULL) {
			mrtstat.mrts_fwd_drop++;
			mutex_exit(&mfc_rt->mfc_mutex);
			ip1dbg(("ip_mforward: out of memory for"
			    " rtdetq, rte\n"));
			goto error_return;
		}

		/* mp0 is the copy that waits on the queue for the route. */
		mp0 = copymsg(mp);
		if (mp0 == NULL) {
			mrtstat.mrts_fwd_drop++;
			ip1dbg(("ip_mforward: out of memory for mblk, mp0\n"));
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}
		rte->mp		= mp0;
		if (pim_reg_packet) {
			ASSERT(reg_vif_num != ALL_VIFS);
			rte->ill = viftable[reg_vif_num].v_ipif->ipif_ill;
		} else {
			rte->ill = ill;
		}
		rte->rte_next	= NULL;

		/*
		 * Determine if upcall q (rtdetq) has overflowed.
		 * mfc_rt->mfc_rte is null by mi_zalloc
		 * if it is the first message.
		 */
		for (rte_m = mfc_rt->mfc_rte, npkts = 0; rte_m;
		    rte_m = rte_m->rte_next)
			npkts++;
		if (ip_mrtdebug > 1) {
			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
			    "ip_mforward: upcalls %d\n", npkts);
		}
		if (npkts > MAX_UPQ) {
			mrtstat.mrts_upq_ovflw++;
			mutex_exit(&mfc_rt->mfc_mutex);
			goto error_return;
		}

		if (npkts == 0) {	/* first upcall */
			int i = 0;
			/*
			 * Now finish installing the new mfc! Now that we have
			 * resources!  Insert new entry at head of hash chain.
			 * Use src and dst which are ipaddr_t's.
			 */
			mfc_rt->mfc_origin.s_addr = src;
			mfc_rt->mfc_mcastgrp.s_addr = dst;

			mutex_enter(&numvifs_mutex);
			for (i = 0; i < (int)numvifs; i++)
				mfc_rt->mfc_ttls[i] = 0;
			mutex_exit(&numvifs_mutex);
			mfc_rt->mfc_parent = ALL_VIFS;

			/* Link into table */
			if (ip_mrtdebug > 1) {
				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
				    "ip_mforward: NEW MFCTAB hash %d o 0x%x "
				    "g 0x%x\n", hash,
				    ntohl(mfc_rt->mfc_origin.s_addr),
				    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
			}
			mfc_rt->mfc_next = mfctable[hash].mfcb_mfc;
			mfctable[hash].mfcb_mfc = mfc_rt;
			mfc_rt->mfc_rte = NULL;
		}

		/* Link in the upcall */
		/* First upcall */
		if (mfc_rt->mfc_rte == NULL)
			mfc_rt->mfc_rte = rte;
		else {
			/* not the first upcall: walk to the tail and append */
			prev_rte = mfc_rt->mfc_rte;
			for (rte1 = mfc_rt->mfc_rte->rte_next; rte1;
			    prev_rte = rte1, rte1 = rte1->rte_next);
			prev_rte->rte_next = rte;
		}

		/*
		 * No upcalls waiting, this is first one, so send a message to
		 * routing daemon to install a route into kernel table.
		 */
		if (npkts == 0) {
			struct igmpmsg	*im;
			/* ipha_protocol is 0, for upcall */
			ASSERT(mp_copy != NULL);
			/* The igmpmsg fields are written over the copied packet. */
			im = (struct igmpmsg *)mp_copy->b_rptr;
			im->im_msgtype	= IGMPMSG_NOCACHE;
			im->im_mbz = 0;
			mutex_enter(&numvifs_mutex);
			if (pim_reg_packet) {
				im->im_vif = (uchar_t)reg_vif_num;
				mutex_exit(&numvifs_mutex);
			} else {
				/*
				 * XXX do we need to hold locks here ?
				 * Report the first vif whose ipif sits on the
				 * arrival ill.  If none matches, im_vif is
				 * left unwritten (only the ASSERT below
				 * checks for that).
				 */
				for (vifi = 0; vifi < numvifs; vifi++) {
					if (viftable[vifi].v_ipif == NULL)
						continue;
					if (viftable[vifi].v_ipif->ipif_ill ==
					    ill) {
						im->im_vif = (uchar_t)vifi;
						break;
					}
				}
				mutex_exit(&numvifs_mutex);
				ASSERT(vifi < numvifs);
			}

			mrtstat.mrts_upcalls++;
			/* Timer to discard upcalls if mrouted is too slow */
			mfc_rt->mfc_timeout_id = timeout(expire_upcalls,
			    mfc_rt, EXPIRE_TIMEOUT * UPCALL_EXPIRE);
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(mfctable[hash].mfcb_lock));
			/* Both locks are dropped before the upcall goes up. */
			putnext(RD(ip_g_mrouter), mp_copy);

		} else {
			mutex_exit(&mfc_rt->mfc_mutex);
			mutex_exit(&(mfctable[hash].mfcb_lock));
			/* mp_copy is only allocated for a new entry. */
			freemsg(mp_copy);
		}

		MFCB_REFRELE(mfcbp);
		if (tunnel_src != 0)
			return (1);
		else
			return (0);
		/*
		 * Common failure exit.  Callers have already released
		 * mfc_rt->mfc_mutex if they held it; the bucket lock and
		 * reference are dropped here and whatever was allocated for
		 * this packet is freed.  An mfc is freed only if it was
		 * allocated by this call (new_mfc).
		 */
	error_return:
		mutex_exit(&(mfctable[hash].mfcb_lock));
		MFCB_REFRELE(mfcbp);
		if (mfc_rt != NULL && (new_mfc == B_TRUE))
			mi_free((char *)mfc_rt);
		if (rte != NULL)
			mi_free((char *)rte);
		if (mp_copy != NULL)
			freemsg(mp_copy);
		if (mp0 != NULL)
			freemsg(mp0);
		return (-1);
	}
}
1942 
1943 /*
1944  * Clean up the mfctable cache entry if upcall is not serviced.
1945  * SunOS 5.x has timeout per mfc, unlike BSD which has one timer.
1946  */
1947 static void
1948 expire_upcalls(void *arg)
1949 {
1950 	struct mfc *mfc_rt = arg;
1951 	uint_t hash;
1952 	struct mfc *prev_mfc, *mfc0;
1953 
1954 	hash = MFCHASH(mfc_rt->mfc_origin.s_addr, mfc_rt->mfc_mcastgrp.s_addr);
1955 	if (ip_mrtdebug > 1) {
1956 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
1957 		    "expire_upcalls: hash %d s %x g %x",
1958 		    hash, ntohl(mfc_rt->mfc_origin.s_addr),
1959 		    ntohl(mfc_rt->mfc_mcastgrp.s_addr));
1960 	}
1961 	MFCB_REFHOLD(&mfctable[hash]);
1962 	mutex_enter(&mfc_rt->mfc_mutex);
1963 	/*
1964 	 * if timeout has been set to zero, than the
1965 	 * entry has been filled, no need to delete it.
1966 	 */
1967 	if (mfc_rt->mfc_timeout_id == 0)
1968 		goto done;
1969 	mrtstat.mrts_cache_cleanups++;
1970 	mfc_rt->mfc_timeout_id = 0;
1971 
1972 	/* Determine entry to be cleaned up in cache table. */
1973 	for (prev_mfc = mfc0 = mfctable[hash].mfcb_mfc; mfc0;
1974 	    prev_mfc = mfc0, mfc0 = mfc0->mfc_next)
1975 		if (mfc0 == mfc_rt)
1976 			break;
1977 
1978 	/* del_mfc takes care of gone mfcs */
1979 	ASSERT(prev_mfc != NULL);
1980 	ASSERT(mfc0 != NULL);
1981 
1982 	/*
1983 	 * Delete the entry from the cache
1984 	 */
1985 	mfctable[hash].mfcb_marks |= MFCB_MARK_CONDEMNED;
1986 	mfc_rt->mfc_marks |= MFCB_MARK_CONDEMNED;
1987 
1988 	/*
1989 	 * release_mfc will drop all queued upcall packets.
1990 	 * and will free the mbuf with the pkt, if, timing info.
1991 	 */
1992 done:
1993 	mutex_exit(&mfc_rt->mfc_mutex);
1994 	MFCB_REFRELE(&mfctable[hash]);
1995 }
1996 
1997 /*
1998  * Packet forwarding routine once entry in the cache is made.
1999  */
2000 static int
2001 ip_mdq(mblk_t *mp, ipha_t *ipha, ill_t *ill, ipaddr_t tunnel_src,
2002     struct mfc *rt)
2003 {
2004 	vifi_t vifi;
2005 	struct vif *vifp;
2006 	ipaddr_t dst = ipha->ipha_dst;
2007 	size_t  plen = msgdsize(mp);
2008 	vifi_t num_of_vifs;
2009 
2010 	if (ip_mrtdebug > 1) {
2011 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2012 		    "ip_mdq: SEND src %x, ipha_dst %x, ill %s",
2013 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst),
2014 		    ill->ill_name);
2015 	}
2016 
2017 	/* Macro to send packet on vif */
2018 #define	MC_SEND(ipha, mp, vifp, dst) { \
2019 	if ((vifp)->v_flags & VIFF_TUNNEL) \
2020 		encap_send((ipha), (mp), (vifp), (dst)); \
2021 	else if ((vifp)->v_flags & VIFF_REGISTER) \
2022 		register_send((ipha), (mp), (vifp), (dst)); \
2023 	else \
2024 		phyint_send((ipha), (mp), (vifp), (dst)); \
2025 }
2026 
2027 	vifi = rt->mfc_parent;
2028 
2029 	/*
2030 	 * The value of vifi is MAXVIFS if the pkt had no parent, i.e.,
2031 	 * Mrouted had no route.
2032 	 * We wanted the route installed in the mfctable to prevent multiple
2033 	 * tries, so it passed add_mfc(), but is discarded here. The v_ipif is
2034 	 * NULL so we don't want to check the ill. Still needed as of Mrouted
2035 	 * 3.6.
2036 	 */
2037 	if (vifi == NO_VIF) {
2038 		ip1dbg(("ip_mdq: no route for origin ill %s, vifi is NO_VIF\n",
2039 		    ill->ill_name));
2040 		if (ip_mrtdebug > 1) {
2041 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2042 			    "ip_mdq: vifi is NO_VIF ill = %s", ill->ill_name);
2043 		}
2044 		return (-1);	/* drop pkt */
2045 	}
2046 
2047 	if (!lock_good_vif(&viftable[vifi]))
2048 		return (-1);
2049 	/*
2050 	 * The MFC entries are not cleaned up when an ipif goes
2051 	 * away thus this code has to guard against an MFC referencing
2052 	 * an ipif that has been closed. Note: reset_mrt_vif_ipif
2053 	 * sets the v_ipif to NULL when the ipif disappears.
2054 	 */
2055 	ASSERT(viftable[vifi].v_ipif != NULL);
2056 
2057 	if (vifi >= numvifs) {
2058 		cmn_err(CE_WARN, "ip_mdq: illegal vifi %d numvifs "
2059 		    "%d ill %s viftable ill %s\n",
2060 		    (int)vifi, (int)numvifs, ill->ill_name,
2061 		    viftable[vifi].v_ipif->ipif_ill->ill_name);
2062 		unlock_good_vif(&viftable[vifi]);
2063 		return (-1);
2064 	}
2065 	/*
2066 	 * Don't forward if it didn't arrive from the parent vif for its
2067 	 * origin. But do match on the groups as we nominate only one
2068 	 * ill in the group for receiving allmulti packets.
2069 	 */
2070 	if ((viftable[vifi].v_ipif->ipif_ill != ill &&
2071 	    (ill->ill_group == NULL ||
2072 	    viftable[vifi].v_ipif->ipif_ill->ill_group != ill->ill_group)) ||
2073 	    (viftable[vifi].v_rmt_addr.s_addr != tunnel_src)) {
2074 		/* Came in the wrong interface */
2075 		ip1dbg(("ip_mdq: arrived wrong if, vifi %d "
2076 			"numvifs %d ill %s viftable ill %s\n",
2077 			(int)vifi, (int)numvifs, ill->ill_name,
2078 			viftable[vifi].v_ipif->ipif_ill->ill_name));
2079 		if (ip_mrtdebug > 1) {
2080 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2081 			    "ip_mdq: arrived wrong if, vifi %d ill "
2082 			    "%s viftable ill %s\n",
2083 			    (int)vifi, ill->ill_name,
2084 			    viftable[vifi].v_ipif->ipif_ill->ill_name);
2085 		}
2086 		mrtstat.mrts_wrong_if++;
2087 		rt->mfc_wrong_if++;
2088 
2089 		/*
2090 		 * If we are doing PIM assert processing and we are forwarding
2091 		 * packets on this interface, and it is a broadcast medium
2092 		 * interface (and not a tunnel), send a message to the routing.
2093 		 *
2094 		 * We use the first ipif on the list, since it's all we have.
2095 		 * Chances are the ipif_flags are the same for ipifs on the ill.
2096 		 */
2097 		if (pim_assert && rt->mfc_ttls[vifi] > 0 &&
2098 		    (ill->ill_ipif->ipif_flags & IPIF_BROADCAST) &&
2099 		    !(viftable[vifi].v_flags & VIFF_TUNNEL)) {
2100 			mblk_t		*mp_copy;
2101 			struct igmpmsg	*im;
2102 
2103 			/* TODO could copy header and dup rest */
2104 			mp_copy = copymsg(mp);
2105 			if (mp_copy == NULL) {
2106 				mrtstat.mrts_fwd_drop++;
2107 				ip1dbg(("ip_mdq: out of memory "
2108 				    "for mblk, mp_copy\n"));
2109 				unlock_good_vif(&viftable[vifi]);
2110 				return (-1);
2111 			}
2112 
2113 			im = (struct igmpmsg *)mp_copy->b_rptr;
2114 			im->im_msgtype = IGMPMSG_WRONGVIF;
2115 			im->im_mbz = 0;
2116 			im->im_vif = (ushort_t)vifi;
2117 			putnext(RD(ip_g_mrouter), mp_copy);
2118 		}
2119 		unlock_good_vif(&viftable[vifi]);
2120 		if (tunnel_src != 0)
2121 			return (1);
2122 		else
2123 			return (0);
2124 	}
2125 	/*
2126 	 * If I sourced this packet, it counts as output, else it was input.
2127 	 */
2128 	if (ipha->ipha_src == viftable[vifi].v_lcl_addr.s_addr) {
2129 		viftable[vifi].v_pkt_out++;
2130 		viftable[vifi].v_bytes_out += plen;
2131 	} else {
2132 		viftable[vifi].v_pkt_in++;
2133 		viftable[vifi].v_bytes_in += plen;
2134 	}
2135 	mutex_enter(&rt->mfc_mutex);
2136 	rt->mfc_pkt_cnt++;
2137 	rt->mfc_byte_cnt += plen;
2138 	mutex_exit(&rt->mfc_mutex);
2139 	unlock_good_vif(&viftable[vifi]);
2140 	/*
2141 	 * For each vif, decide if a copy of the packet should be forwarded.
2142 	 * Forward if:
2143 	 *		- the vif threshold ttl is non-zero AND
2144 	 *		- the pkt ttl exceeds the vif's threshold
2145 	 * A non-zero mfc_ttl indicates that the vif is part of
2146 	 * the output set for the mfc entry.
2147 	 */
2148 	mutex_enter(&numvifs_mutex);
2149 	num_of_vifs = numvifs;
2150 	mutex_exit(&numvifs_mutex);
2151 	for (vifp = viftable, vifi = 0; vifi < num_of_vifs; vifp++, vifi++) {
2152 		if (!lock_good_vif(vifp))
2153 			continue;
2154 		if ((rt->mfc_ttls[vifi] > 0) &&
2155 		    (ipha->ipha_ttl > rt->mfc_ttls[vifi])) {
2156 			/*
2157 			 * lock_good_vif should not have succedded if
2158 			 * v_ipif is null.
2159 			 */
2160 			ASSERT(vifp->v_ipif != NULL);
2161 			vifp->v_pkt_out++;
2162 			vifp->v_bytes_out += plen;
2163 			MC_SEND(ipha, mp, vifp, dst);
2164 			mrtstat.mrts_fwd_out++;
2165 		}
2166 		unlock_good_vif(vifp);
2167 	}
2168 	if (tunnel_src != 0)
2169 		return (1);
2170 	else
2171 		return (0);
2172 }
2173 
2174 /*
2175  * Send the packet on physical interface.
2176  * Caller assumes can continue to use mp on return.
2177  */
2178 /* ARGSUSED */
2179 static void
2180 phyint_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2181 {
2182 	mblk_t 	*mp_copy;
2183 
2184 	/* Make a new reference to the packet */
2185 	mp_copy = copymsg(mp);	/* TODO could copy header and dup rest */
2186 	if (mp_copy == NULL) {
2187 		mrtstat.mrts_fwd_drop++;
2188 		ip1dbg(("phyint_send: out of memory for mblk, mp_copy\n"));
2189 		return;
2190 	}
2191 	if (vifp->v_rate_limit <= 0)
2192 		tbf_send_packet(vifp, mp_copy);
2193 	else  {
2194 		if (ip_mrtdebug > 1) {
2195 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2196 			    "phyint_send: tbf_contr rate %d "
2197 			    "vifp 0x%p mp 0x%p dst 0x%x",
2198 			    vifp->v_rate_limit, (void *)vifp, (void *)mp, dst);
2199 		}
2200 		tbf_control(vifp, mp_copy, (ipha_t *)mp_copy->b_rptr);
2201 	}
2202 }
2203 
2204 /*
2205  * Send the whole packet for REGISTER encapsulation to PIM daemon
2206  * Caller assumes it can continue to use mp on return.
2207  */
2208 /* ARGSUSED */
2209 static void
2210 register_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2211 {
2212 	struct igmpmsg	*im;
2213 	mblk_t		*mp_copy;
2214 	ipha_t		*ipha_copy;
2215 
2216 	if (ip_mrtdebug > 1) {
2217 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2218 		    "register_send: src %x, dst %x\n",
2219 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2220 	}
2221 
2222 	/*
2223 	 * Copy the old packet & pullup its IP header into the new mblk_t so we
2224 	 * can modify it.  Try to fill the new mblk_t since if we don't the
2225 	 * ethernet driver will.
2226 	 */
2227 	mp_copy = allocb(sizeof (struct igmpmsg) + sizeof (ipha_t), BPRI_MED);
2228 	if (mp_copy == NULL) {
2229 		++mrtstat.mrts_pim_nomemory;
2230 		if (ip_mrtdebug > 3) {
2231 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2232 			    "register_send: allocb failure.");
2233 		}
2234 		return;
2235 	}
2236 
2237 	/*
2238 	 * Bump write pointer to account for igmpmsg being added.
2239 	 */
2240 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (struct igmpmsg);
2241 
2242 	/*
2243 	 * Chain packet to new mblk_t.
2244 	 */
2245 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2246 		++mrtstat.mrts_pim_nomemory;
2247 		if (ip_mrtdebug > 3) {
2248 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2249 			    "register_send: copymsg failure.");
2250 		}
2251 		freeb(mp_copy);
2252 		return;
2253 	}
2254 
2255 	/*
2256 	 * icmp_rput() asserts that IP version field is set to an
2257 	 * appropriate version. Hence, the struct igmpmsg that this really
2258 	 * becomes, needs to have the correct IP version field.
2259 	 */
2260 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2261 	*ipha_copy = multicast_encap_iphdr;
2262 
2263 	/*
2264 	 * The kernel uses the struct igmpmsg header to encode the messages to
2265 	 * the multicast routing daemon. Fill in the fields in the header
2266 	 * starting with the message type which is IGMPMSG_WHOLEPKT
2267 	 */
2268 	im = (struct igmpmsg *)mp_copy->b_rptr;
2269 	im->im_msgtype = IGMPMSG_WHOLEPKT;
2270 	im->im_src.s_addr = ipha->ipha_src;
2271 	im->im_dst.s_addr = ipha->ipha_dst;
2272 
2273 	/*
2274 	 * Must Be Zero. This is because the struct igmpmsg is really an IP
2275 	 * header with renamed fields and the multicast routing daemon uses
2276 	 * an ipha_protocol (aka im_mbz) of 0 to distinguish these messages.
2277 	 */
2278 	im->im_mbz = 0;
2279 
2280 	++mrtstat.mrts_upcalls;
2281 	if (!canputnext(RD(ip_g_mrouter))) {
2282 		++mrtstat.mrts_pim_regsend_drops;
2283 		if (ip_mrtdebug > 3) {
2284 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2285 			    "register_send: register upcall failure.");
2286 		}
2287 		freemsg(mp_copy);
2288 	} else {
2289 		putnext(RD(ip_g_mrouter), mp_copy);
2290 	}
2291 }
2292 
2293 /*
2294  * pim_validate_cksum handles verification of the checksum in the
2295  * pim header.  For PIM Register packets, the checksum is calculated
2296  * across the PIM header only.  For all other packets, the checksum
2297  * is for the PIM header and remainder of the packet.
2298  *
2299  * returns: B_TRUE, if checksum is okay.
2300  *          B_FALSE, if checksum is not valid.
2301  */
2302 static boolean_t
2303 pim_validate_cksum(mblk_t *mp, ipha_t *ip, struct pim *pimp)
2304 {
2305 	mblk_t *mp_dup;
2306 
2307 	if ((mp_dup = dupmsg(mp)) == NULL)
2308 		return (B_FALSE);
2309 
2310 	mp_dup->b_rptr += IPH_HDR_LENGTH(ip);
2311 	if (pimp->pim_type == PIM_REGISTER)
2312 		mp_dup->b_wptr = mp_dup->b_rptr + PIM_MINLEN;
2313 	if (IP_CSUM(mp_dup, 0, 0)) {
2314 		freemsg(mp_dup);
2315 		return (B_FALSE);
2316 	}
2317 	freemsg(mp_dup);
2318 	return (B_TRUE);
2319 }
2320 
2321 /*
2322  * int
2323  * pim_input(queue_t *, mblk_t *) - Process PIM protocol packets.
2324  *	IP Protocol 103. Register messages are decapsulated and sent
2325  *	onto multicast forwarding.
2326  */
2327 int
2328 pim_input(queue_t *q, mblk_t *mp)
2329 {
2330 	ipha_t		*eip, *ip;
2331 	int		iplen, pimlen, iphlen;
2332 	struct pim	*pimp;	/* pointer to a pim struct */
2333 	uint32_t	*reghdr;
2334 
2335 	/*
2336 	 * Pullup the msg for PIM protocol processing.
2337 	 */
2338 	if (pullupmsg(mp, -1) == 0) {
2339 		++mrtstat.mrts_pim_nomemory;
2340 		freemsg(mp);
2341 		return (-1);
2342 	}
2343 
2344 	ip = (ipha_t *)mp->b_rptr;
2345 	iplen = ip->ipha_length;
2346 	iphlen = IPH_HDR_LENGTH(ip);
2347 	pimlen = ntohs(iplen) - iphlen;
2348 
2349 	/*
2350 	 * Validate lengths
2351 	 */
2352 	if (pimlen < PIM_MINLEN) {
2353 		++mrtstat.mrts_pim_malformed;
2354 		if (ip_mrtdebug > 1) {
2355 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2356 			    "pim_input: length not at least minlen");
2357 		}
2358 		freemsg(mp);
2359 		return (-1);
2360 	}
2361 
2362 	/*
2363 	 * Point to the PIM header.
2364 	 */
2365 	pimp = (struct pim *)((caddr_t)ip + iphlen);
2366 
2367 	/*
2368 	 * Check the version number.
2369 	 */
2370 	if (pimp->pim_vers != PIM_VERSION) {
2371 		++mrtstat.mrts_pim_badversion;
2372 		if (ip_mrtdebug > 1) {
2373 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2374 			    "pim_input: unknown version of PIM");
2375 		}
2376 		freemsg(mp);
2377 		return (-1);
2378 	}
2379 
2380 	/*
2381 	 * Validate the checksum
2382 	 */
2383 	if (!pim_validate_cksum(mp, ip, pimp)) {
2384 		++mrtstat.mrts_pim_rcv_badcsum;
2385 		if (ip_mrtdebug > 1) {
2386 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2387 			    "pim_input: invalid checksum");
2388 		}
2389 		freemsg(mp);
2390 		return (-1);
2391 	}
2392 
2393 	if (pimp->pim_type != PIM_REGISTER)
2394 		return (0);
2395 
2396 	reghdr = (uint32_t *)(pimp + 1);
2397 	eip = (ipha_t *)(reghdr + 1);
2398 
2399 	/*
2400 	 * check if the inner packet is destined to mcast group
2401 	 */
2402 	if (!CLASSD(eip->ipha_dst)) {
2403 		++mrtstat.mrts_pim_badregisters;
2404 		if (ip_mrtdebug > 1) {
2405 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2406 			    "pim_input: Inner pkt not mcast .. !");
2407 		}
2408 		freemsg(mp);
2409 		return (-1);
2410 	}
2411 	if (ip_mrtdebug > 1) {
2412 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2413 		    "register from %x, to %x, len %d",
2414 		    ntohl(eip->ipha_src),
2415 		    ntohl(eip->ipha_dst),
2416 		    ntohs(eip->ipha_length));
2417 	}
2418 	/*
2419 	 * If the null register bit is not set, decapsulate
2420 	 * the packet before forwarding it.
2421 	 */
2422 	if (!(ntohl(*reghdr) & PIM_NULL_REGISTER)) {
2423 		mblk_t *mp_copy;
2424 
2425 		/* Copy the message */
2426 		if ((mp_copy = copymsg(mp)) == NULL) {
2427 			++mrtstat.mrts_pim_nomemory;
2428 			freemsg(mp);
2429 			return (-1);
2430 		}
2431 
2432 		/*
2433 		 * Decapsulate the packet and give it to
2434 		 * register_mforward.
2435 		 */
2436 		mp_copy->b_rptr += iphlen + sizeof (pim_t) +
2437 		    sizeof (*reghdr);
2438 		if (register_mforward(q, mp_copy) != 0) {
2439 			freemsg(mp);
2440 			return (-1);
2441 		}
2442 	}
2443 
2444 	/*
2445 	 * Pass all valid PIM packets up to any process(es) listening on a raw
2446 	 * PIM socket. For Solaris it is done right after pim_input() is
2447 	 * called.
2448 	 */
2449 	return (0);
2450 }
2451 
2452 /*
2453  * PIM sparse mode hook.  Called by pim_input after decapsulating
2454  * the packet. Loop back the packet, as if we have received it.
2455  * In pim_input() we have to check if the destination is a multicast address.
2456  */
2457 /* ARGSUSED */
2458 static int
2459 register_mforward(queue_t *q, mblk_t *mp)
2460 {
2461 	ASSERT(reg_vif_num <= numvifs);
2462 
2463 	if (ip_mrtdebug > 3) {
2464 		ipha_t *ipha;
2465 
2466 		ipha = (ipha_t *)mp->b_rptr;
2467 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2468 		    "register_mforward: src %x, dst %x\n",
2469 		    ntohl(ipha->ipha_src), ntohl(ipha->ipha_dst));
2470 	}
2471 	/*
2472 	 * Need to pass in to ip_mforward() the information that the
2473 	 * packet has arrived on the register_vif. We use the solution that
2474 	 * ip_mroute_decap() employs: use mp->b_prev to pass some information
2475 	 * to ip_mforward(). Nonzero value means the packet has arrived on a
2476 	 * tunnel (ip_mroute_decap() puts the address of the other side of the
2477 	 * tunnel there.) This is safe since ip_rput() either frees the packet
2478 	 * or passes it to ip_mforward(). We use
2479 	 * PIM_REGISTER_MARKER = 0xffffffff to indicate the has arrived on the
2480 	 * register vif. If in the future we have more than one register vifs,
2481 	 * then this will need re-examination.
2482 	 */
2483 	mp->b_prev = (mblk_t *)PIM_REGISTER_MARKER;
2484 	++mrtstat.mrts_pim_regforwards;
2485 	ip_rput(q, mp);
2486 	return (0);
2487 }
2488 
2489 /*
2490  * Send an encapsulated packet.
2491  * Caller assumes can continue to use mp when routine returns.
2492  */
2493 /* ARGSUSED */
2494 static void
2495 encap_send(ipha_t *ipha, mblk_t *mp, struct vif *vifp, ipaddr_t dst)
2496 {
2497 	mblk_t 	*mp_copy;
2498 	ipha_t 	*ipha_copy;
2499 	size_t	len;
2500 
2501 	if (ip_mrtdebug > 1) {
2502 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2503 		    "encap_send: vif %ld enter", (ptrdiff_t)(vifp - viftable));
2504 	}
2505 	len = ntohs(ipha->ipha_length);
2506 
2507 	/*
2508 	 * Copy the old packet & pullup it's IP header into the
2509 	 * new mbuf so we can modify it.  Try to fill the new
2510 	 * mbuf since if we don't the ethernet driver will.
2511 	 */
2512 	mp_copy = allocb(32 + sizeof (multicast_encap_iphdr), BPRI_MED);
2513 	if (mp_copy == NULL)
2514 		return;
2515 	mp_copy->b_rptr += 32;
2516 	mp_copy->b_wptr = mp_copy->b_rptr + sizeof (multicast_encap_iphdr);
2517 	if ((mp_copy->b_cont = copymsg(mp)) == NULL) {
2518 		freeb(mp_copy);
2519 		return;
2520 	}
2521 
2522 	/*
2523 	 * Fill in the encapsulating IP header.
2524 	 * Remote tunnel dst in rmt_addr, from add_vif().
2525 	 */
2526 	ipha_copy = (ipha_t *)mp_copy->b_rptr;
2527 	*ipha_copy = multicast_encap_iphdr;
2528 	ASSERT((len + sizeof (ipha_t)) <= IP_MAXPACKET);
2529 	ipha_copy->ipha_length = htons(len + sizeof (ipha_t));
2530 	ipha_copy->ipha_src = vifp->v_lcl_addr.s_addr;
2531 	ipha_copy->ipha_dst = vifp->v_rmt_addr.s_addr;
2532 	ASSERT(ipha_copy->ipha_ident == 0);
2533 
2534 	/* Turn the encapsulated IP header back into a valid one. */
2535 	ipha = (ipha_t *)mp_copy->b_cont->b_rptr;
2536 	ipha->ipha_ttl--;
2537 	ipha->ipha_hdr_checksum = 0;
2538 	ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2539 
2540 	if (ip_mrtdebug > 1) {
2541 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2542 		    "encap_send: group 0x%x", ntohl(ipha->ipha_dst));
2543 	}
2544 	if (vifp->v_rate_limit <= 0)
2545 		tbf_send_packet(vifp, mp_copy);
2546 	else
2547 		/* ipha is from the original header */
2548 		tbf_control(vifp, mp_copy, ipha);
2549 }
2550 
2551 /*
2552  * De-encapsulate a packet and feed it back through IP input.
2553  * This routine is called whenever IP gets a packet with prototype
2554  * IPPROTO_ENCAP and a local destination address.
2555  */
2556 void
2557 ip_mroute_decap(queue_t *q, mblk_t *mp)
2558 {
2559 	ipha_t		*ipha = (ipha_t *)mp->b_rptr;
2560 	ipha_t		*ipha_encap;
2561 	int		hlen = IPH_HDR_LENGTH(ipha);
2562 	ipaddr_t	src;
2563 	struct vif	*vifp;
2564 
2565 	/*
2566 	 * Dump the packet if it's not to a multicast destination or if
2567 	 * we don't have an encapsulating tunnel with the source.
2568 	 * Note:  This code assumes that the remote site IP address
2569 	 * uniquely identifies the tunnel (i.e., that this site has
2570 	 * at most one tunnel with the remote site).
2571 	 */
2572 	ipha_encap = (ipha_t *)((char *)ipha + hlen);
2573 	if (!CLASSD(ipha_encap->ipha_dst)) {
2574 		mrtstat.mrts_bad_tunnel++;
2575 		ip1dbg(("ip_mroute_decap: bad tunnel\n"));
2576 		freemsg(mp);
2577 		return;
2578 	}
2579 	src = (ipaddr_t)ipha->ipha_src;
2580 	mutex_enter(&last_encap_lock);
2581 	if (src != last_encap_src) {
2582 		struct vif *vife;
2583 
2584 		vifp = viftable;
2585 		vife = vifp + numvifs;
2586 		last_encap_src = src;
2587 		last_encap_vif = 0;
2588 		for (; vifp < vife; ++vifp) {
2589 			if (!lock_good_vif(vifp))
2590 				continue;
2591 			if (vifp->v_rmt_addr.s_addr == src) {
2592 				if (vifp->v_flags & VIFF_TUNNEL)
2593 					last_encap_vif = vifp;
2594 				if (ip_mrtdebug > 1) {
2595 					(void) mi_strlog(ip_g_mrouter,
2596 					    1, SL_TRACE,
2597 					    "ip_mroute_decap: good tun "
2598 					    "vif %ld with %x",
2599 					    (ptrdiff_t)(vifp - viftable),
2600 					    ntohl(src));
2601 				}
2602 				unlock_good_vif(vifp);
2603 				break;
2604 			}
2605 			unlock_good_vif(vifp);
2606 		}
2607 	}
2608 	if ((vifp = last_encap_vif) == 0) {
2609 		mutex_exit(&last_encap_lock);
2610 		mrtstat.mrts_bad_tunnel++;
2611 		freemsg(mp);
2612 		ip1dbg(("ip_mroute_decap: vif %ld no tunnel with %x\n",
2613 		    (ptrdiff_t)(vifp - viftable), ntohl(src)));
2614 		return;
2615 	}
2616 	mutex_exit(&last_encap_lock);
2617 
2618 	/*
2619 	 * Need to pass in the tunnel source to ip_mforward (so that it can
2620 	 * verify that the packet arrived over the correct vif.)  We use b_prev
2621 	 * to pass this information. This is safe since the ip_rput either
2622 	 * frees the packet or passes it to ip_mforward.
2623 	 */
2624 	mp->b_prev = (mblk_t *)(uintptr_t)src;
2625 	mp->b_rptr += hlen;
2626 	/* Feed back into ip_rput as an M_DATA. */
2627 	ip_rput(q, mp);
2628 }
2629 
2630 /*
2631  * Remove all records with v_ipif == ipif.  Called when an interface goes away
2632  * (stream closed).  Called as writer.
2633  */
2634 void
2635 reset_mrt_vif_ipif(ipif_t *ipif)
2636 {
2637 	vifi_t vifi, tmp_vifi;
2638 	vifi_t num_of_vifs;
2639 
2640 	/* Can't check vifi >= 0 since vifi_t is unsigned! */
2641 
2642 	mutex_enter(&numvifs_mutex);
2643 	num_of_vifs = numvifs;
2644 	mutex_exit(&numvifs_mutex);
2645 
2646 	for (vifi = num_of_vifs; vifi != 0; vifi--) {
2647 		tmp_vifi = vifi - 1;
2648 		if (viftable[tmp_vifi].v_ipif == ipif) {
2649 			(void) del_vif(&tmp_vifi, NULL, NULL);
2650 		}
2651 	}
2652 }
2653 
2654 /* Remove pending upcall msgs when ill goes away.  Called by ill_delete.  */
2655 void
2656 reset_mrt_ill(ill_t *ill)
2657 {
2658 	struct mfc		*rt;
2659 	struct rtdetq	*rte;
2660 	int			i;
2661 
2662 	for (i = 0; i < MFCTBLSIZ; i++) {
2663 		MFCB_REFHOLD(&mfctable[i]);
2664 		if ((rt = mfctable[i].mfcb_mfc) != NULL) {
2665 			if (ip_mrtdebug > 1) {
2666 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2667 				    "reset_mrt_ill: mfctable [%d]", i);
2668 			}
2669 			while (rt != NULL) {
2670 				mutex_enter(&rt->mfc_mutex);
2671 				while ((rte = rt->mfc_rte) != NULL) {
2672 					if (rte->ill == ill) {
2673 						if (ip_mrtdebug > 1) {
2674 							(void) mi_strlog(
2675 							    ip_g_mrouter,
2676 							    1, SL_TRACE,
2677 							    "reset_mrt_ill: "
2678 							    "ill 0x%p", ill);
2679 						}
2680 						rt->mfc_rte = rte->rte_next;
2681 						freemsg(rte->mp);
2682 						mi_free((char *)rte);
2683 					}
2684 				}
2685 				mutex_exit(&rt->mfc_mutex);
2686 				rt = rt->mfc_next;
2687 			}
2688 		}
2689 		MFCB_REFRELE(&mfctable[i]);
2690 	}
2691 }
2692 
2693 /*
2694  * Token bucket filter module.
2695  * The ipha is for mcastgrp destination for phyint and encap.
2696  */
2697 static void
2698 tbf_control(struct vif *vifp, mblk_t *mp, ipha_t *ipha)
2699 {
2700 	size_t 	p_len =  msgdsize(mp);
2701 	struct tbf	*t    = vifp->v_tbf;
2702 	timeout_id_t id = 0;
2703 
2704 	/* Drop if packet is too large */
2705 	if (p_len > MAX_BKT_SIZE) {
2706 		mrtstat.mrts_pkt2large++;
2707 		freemsg(mp);
2708 		return;
2709 	}
2710 	if (ip_mrtdebug > 1) {
2711 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2712 		    "tbf_ctrl: SEND vif %ld, qlen %d, ipha_dst 0x%x",
2713 		    (ptrdiff_t)(vifp - viftable), t->tbf_q_len,
2714 		    ntohl(ipha->ipha_dst));
2715 	}
2716 
2717 	mutex_enter(&t->tbf_lock);
2718 
2719 	tbf_update_tokens(vifp);
2720 
2721 	/*
2722 	 * If there are enough tokens,
2723 	 * and the queue is empty, send this packet out.
2724 	 */
2725 	if (ip_mrtdebug > 1) {
2726 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2727 		    "tbf_control: vif %ld, TOKENS  %d, pkt len  %lu, qlen  %d",
2728 		    (ptrdiff_t)(vifp - viftable), t->tbf_n_tok, p_len,
2729 		    t->tbf_q_len);
2730 	}
2731 	/* No packets are queued */
2732 	if (t->tbf_q_len == 0) {
2733 		/* queue empty, send packet if enough tokens */
2734 		if (p_len <= t->tbf_n_tok) {
2735 			t->tbf_n_tok -= p_len;
2736 			mutex_exit(&t->tbf_lock);
2737 			tbf_send_packet(vifp, mp);
2738 			return;
2739 		} else {
2740 			/* Queue packet and timeout till later */
2741 			tbf_queue(vifp, mp);
2742 			ASSERT(vifp->v_timeout_id == 0);
2743 			vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2744 			    TBF_REPROCESS);
2745 		}
2746 	} else if (t->tbf_q_len < t->tbf_max_q_len) {
2747 		/* Finite queue length, so queue pkts and process queue */
2748 		tbf_queue(vifp, mp);
2749 		tbf_process_q(vifp);
2750 	} else {
2751 		/* Check that we have UDP header with IP header */
2752 		size_t hdr_length = IPH_HDR_LENGTH(ipha) +
2753 					sizeof (struct udphdr);
2754 
2755 		if ((mp->b_wptr - mp->b_rptr) < hdr_length) {
2756 			if (!pullupmsg(mp, hdr_length)) {
2757 				freemsg(mp);
2758 				ip1dbg(("tbf_ctl: couldn't pullup udp hdr, "
2759 				    "vif %ld src 0x%x dst 0x%x\n",
2760 				    (ptrdiff_t)(vifp - viftable),
2761 				    ntohl(ipha->ipha_src),
2762 				    ntohl(ipha->ipha_dst)));
2763 				mutex_exit(&vifp->v_tbf->tbf_lock);
2764 				return;
2765 			} else
2766 				/* Have to reassign ipha after pullupmsg */
2767 				ipha = (ipha_t *)mp->b_rptr;
2768 		}
2769 		/*
2770 		 * Queue length too much,
2771 		 * try to selectively dq, or queue and process
2772 		 */
2773 		if (!tbf_dq_sel(vifp, ipha)) {
2774 			mrtstat.mrts_q_overflow++;
2775 			freemsg(mp);
2776 		} else {
2777 			tbf_queue(vifp, mp);
2778 			tbf_process_q(vifp);
2779 		}
2780 	}
2781 	if (t->tbf_q_len == 0) {
2782 		id = vifp->v_timeout_id;
2783 		vifp->v_timeout_id = 0;
2784 	}
2785 	mutex_exit(&vifp->v_tbf->tbf_lock);
2786 	if (id != 0)
2787 		(void) untimeout(id);
2788 }
2789 
2790 /*
2791  * Adds a packet to the tbf queue at the interface.
2792  * The ipha is for mcastgrp destination for phyint and encap.
2793  */
2794 static void
2795 tbf_queue(struct vif *vifp, mblk_t *mp)
2796 {
2797 	struct tbf	*t = vifp->v_tbf;
2798 
2799 	if (ip_mrtdebug > 1) {
2800 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2801 		    "tbf_queue: vif %ld", (ptrdiff_t)(vifp - viftable));
2802 	}
2803 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2804 
2805 	if (t->tbf_t == NULL) {
2806 		/* Queue was empty */
2807 		t->tbf_q = mp;
2808 	} else {
2809 		/* Insert at tail */
2810 		t->tbf_t->b_next = mp;
2811 	}
2812 	/* set new tail pointer */
2813 	t->tbf_t = mp;
2814 
2815 	mp->b_next = mp->b_prev = NULL;
2816 
2817 	t->tbf_q_len++;
2818 }
2819 
2820 /*
2821  * Process the queue at the vif interface.
2822  * Drops the tbf_lock when sending packets.
2823  *
2824  * NOTE : The caller should quntimeout if the queue length is 0.
2825  */
2826 static void
2827 tbf_process_q(struct vif *vifp)
2828 {
2829 	mblk_t	*mp;
2830 	struct tbf	*t = vifp->v_tbf;
2831 	size_t	len;
2832 
2833 	if (ip_mrtdebug > 1) {
2834 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2835 		    "tbf_process_q 1: vif %ld qlen = %d",
2836 		    (ptrdiff_t)(vifp - viftable), t->tbf_q_len);
2837 	}
2838 
2839 	/*
2840 	 * Loop through the queue at the interface and send
2841 	 * as many packets as possible.
2842 	 */
2843 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2844 
2845 	while (t->tbf_q_len > 0) {
2846 		mp = t->tbf_q;
2847 		len = (size_t)msgdsize(mp); /* length of ip pkt */
2848 
2849 		/* Determine if the packet can be sent */
2850 		if (len <= t->tbf_n_tok) {
2851 			/*
2852 			 * If so, reduce no. of tokens, dequeue the packet,
2853 			 * send the packet.
2854 			 */
2855 			t->tbf_n_tok -= len;
2856 
2857 			t->tbf_q = mp->b_next;
2858 			if (--t->tbf_q_len == 0) {
2859 				t->tbf_t = NULL;
2860 			}
2861 			mp->b_next = NULL;
2862 			/* Exit mutex before sending packet, then re-enter */
2863 			mutex_exit(&t->tbf_lock);
2864 			tbf_send_packet(vifp, mp);
2865 			mutex_enter(&t->tbf_lock);
2866 		} else
2867 			break;
2868 	}
2869 }
2870 
2871 /* Called at tbf timeout to update tokens, process q and reset timer.  */
2872 static void
2873 tbf_reprocess_q(void *arg)
2874 {
2875 	struct vif *vifp = arg;
2876 
2877 	mutex_enter(&vifp->v_tbf->tbf_lock);
2878 	vifp->v_timeout_id = 0;
2879 	tbf_update_tokens(vifp);
2880 
2881 	tbf_process_q(vifp);
2882 
2883 	if (vifp->v_tbf->tbf_q_len > 0) {
2884 		vifp->v_timeout_id = timeout(tbf_reprocess_q, vifp,
2885 		    TBF_REPROCESS);
2886 	}
2887 	mutex_exit(&vifp->v_tbf->tbf_lock);
2888 
2889 	if (ip_mrtdebug > 1) {
2890 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2891 		    "tbf_reprcess_q: vif %ld timeout id = %p",
2892 		    (ptrdiff_t)(vifp - viftable), vifp->v_timeout_id);
2893 	}
2894 }
2895 
2896 /*
2897  * Function that will selectively discard a member of the tbf queue,
2898  * based on the precedence value and the priority.
2899  *
2900  * NOTE : The caller should quntimeout if the queue length is 0.
2901  */
2902 static int
2903 tbf_dq_sel(struct vif *vifp, ipha_t *ipha)
2904 {
2905 	uint_t		p;
2906 	struct tbf		*t = vifp->v_tbf;
2907 	mblk_t		**np;
2908 	mblk_t		*last, *mp;
2909 
2910 	if (ip_mrtdebug > 1) {
2911 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2912 		    "dq_sel: vif %ld dst 0x%x",
2913 		    (ptrdiff_t)(vifp - viftable), ntohl(ipha->ipha_dst));
2914 	}
2915 
2916 	ASSERT(MUTEX_HELD(&t->tbf_lock));
2917 	p = priority(vifp, ipha);
2918 
2919 	np = &t->tbf_q;
2920 	last = NULL;
2921 	while ((mp = *np) != NULL) {
2922 		if (p > (priority(vifp, (ipha_t *)mp->b_rptr))) {
2923 			*np = mp->b_next;
2924 			/* If removing the last packet, fix the tail pointer */
2925 			if (mp == t->tbf_t)
2926 				t->tbf_t = last;
2927 			mp->b_prev = mp->b_next = NULL;
2928 			freemsg(mp);
2929 			/*
2930 			 * It's impossible for the queue to be empty, but
2931 			 * we check anyway.
2932 			 */
2933 			if (--t->tbf_q_len == 0) {
2934 				t->tbf_t = NULL;
2935 			}
2936 			mrtstat.mrts_drop_sel++;
2937 			return (1);
2938 		}
2939 		np = &mp->b_next;
2940 		last = mp;
2941 	}
2942 	return (0);
2943 }
2944 
2945 /* Sends packet, 2 cases - encap tunnel, phyint.  */
2946 static void
2947 tbf_send_packet(struct vif *vifp, mblk_t *mp)
2948 {
2949 	ipif_t  *ipif;
2950 
2951 	/* If encap tunnel options */
2952 	if (vifp->v_flags & VIFF_TUNNEL)  {
2953 		if (ip_mrtdebug > 1) {
2954 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2955 			    "tbf_send_pkt: ENCAP tunnel vif %ld",
2956 			    (ptrdiff_t)(vifp - viftable));
2957 		}
2958 
2959 		/*
2960 		 * Feed into ip_wput which will set the ident field and
2961 		 * checksum the encapsulating header.
2962 		 * BSD gets the cached route vifp->v_route from ip_output()
2963 		 * to speed up route table lookups. Not necessary in SunOS 5.x.
2964 		 */
2965 		put(vifp->v_ipif->ipif_wq, mp);
2966 		return;
2967 
2968 		/* phyint */
2969 	} else {
2970 		/* Need to loop back to members on the outgoing interface. */
2971 		ipha_t  *ipha;
2972 		ipaddr_t    dst;
2973 		ipha  = (ipha_t *)mp->b_rptr;
2974 		dst  = ipha->ipha_dst;
2975 		ipif = vifp->v_ipif;
2976 
2977 		mutex_enter(&ipif->ipif_ill->ill_lock);
2978 		if (ilm_lookup_ipif(ipif, dst) != NULL) {
2979 			/*
2980 			 * The packet is not yet reassembled, thus we need to
2981 			 * pass it to ip_rput_local for checksum verification
2982 			 * and reassembly (and fanout the user stream).
2983 			 */
2984 			mblk_t 	*mp_loop;
2985 			ire_t	*ire;
2986 
2987 			mutex_exit(&ipif->ipif_ill->ill_lock);
2988 			if (ip_mrtdebug > 1) {
2989 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
2990 				    "tbf_send_pkt: loopback vif %ld",
2991 				    (ptrdiff_t)(vifp - viftable));
2992 			}
2993 			mp_loop = copymsg(mp);
2994 			ire = ire_ctable_lookup(~0, 0, IRE_BROADCAST, NULL,
2995 			    ALL_ZONES, MATCH_IRE_TYPE);
2996 
2997 			if (mp_loop != NULL && ire != NULL) {
2998 				IP_RPUT_LOCAL(ipif->ipif_rq, mp_loop,
2999 				    ((ipha_t *)mp_loop->b_rptr),
3000 				    ire, (ill_t *)ipif->ipif_rq->q_ptr);
3001 			} else {
3002 				/* Either copymsg failed or no ire */
3003 				(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3004 				    "tbf_send_pkt: mp_loop 0x%p, ire 0x%p "
3005 				    "vif %ld\n", mp_loop, ire,
3006 				    (ptrdiff_t)(vifp - viftable));
3007 			}
3008 			if (ire != NULL)
3009 				ire_refrele(ire);
3010 		} else {
3011 			mutex_exit(&ipif->ipif_ill->ill_lock);
3012 		}
3013 		if (ip_mrtdebug > 1) {
3014 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3015 			    "tbf_send_pkt: phyint forward  vif %ld dst = 0x%x",
3016 			    (ptrdiff_t)(vifp - viftable), ntohl(dst));
3017 		}
3018 		ip_rput_forward_multicast(dst, mp, ipif);
3019 	}
3020 }
3021 
3022 /*
3023  * Determine the current time and then the elapsed time (between the last time
3024  * and time now).  Update the no. of tokens in the bucket.
3025  */
3026 static void
3027 tbf_update_tokens(struct vif *vifp)
3028 {
3029 	timespec_t	tp;
3030 	hrtime_t	tm;
3031 	struct tbf	*t = vifp->v_tbf;
3032 
3033 	ASSERT(MUTEX_HELD(&t->tbf_lock));
3034 
3035 	/* Time in secs and nsecs, rate limit in kbits/sec */
3036 	gethrestime(&tp);
3037 
3038 	/*LINTED*/
3039 	TV_DELTA(tp, t->tbf_last_pkt_t, tm);
3040 
3041 	/*
3042 	 * This formula is actually
3043 	 * "time in seconds" * "bytes/second".  Scaled for nsec.
3044 	 * (tm/1000000000) * (v_rate_limit * 1000 * (1000/1024) /8)
3045 	 *
3046 	 * The (1000/1024) was introduced in add_vif to optimize
3047 	 * this divide into a shift.
3048 	 */
3049 	t->tbf_n_tok += (tm/1000) * vifp->v_rate_limit / 1024 / 8;
3050 	t->tbf_last_pkt_t = tp;
3051 
3052 	if (t->tbf_n_tok > MAX_BKT_SIZE)
3053 		t->tbf_n_tok = MAX_BKT_SIZE;
3054 	if (ip_mrtdebug > 1) {
3055 		(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3056 		    "tbf_update_tok: tm %lld tok %d vif %ld",
3057 		    tm, t->tbf_n_tok, (ptrdiff_t)(vifp - viftable));
3058 	}
3059 }
3060 
3061 /*
3062  * Priority currently is based on port nos.
3063  * Different forwarding mechanisms have different ways
3064  * of obtaining the port no. Hence, the vif must be
3065  * given along with the packet itself.
3066  *
3067  */
3068 static int
3069 priority(struct vif *vifp, ipha_t *ipha)
3070 {
3071 	int prio;
3072 
3073 	/* Temporary hack; may add general packet classifier some day */
3074 
3075 	ASSERT(MUTEX_HELD(&vifp->v_tbf->tbf_lock));
3076 
3077 	/*
3078 	 * The UDP port space is divided up into four priority ranges:
3079 	 * [0, 16384)	: unclassified - lowest priority
3080 	 * [16384, 32768)	: audio - highest priority
3081 	 * [32768, 49152)	: whiteboard - medium priority
3082 	 * [49152, 65536)	: video - low priority
3083 	 */
3084 
3085 	if (ipha->ipha_protocol == IPPROTO_UDP) {
3086 		struct udphdr *udp =
3087 		    (struct udphdr *)((char *)ipha + IPH_HDR_LENGTH(ipha));
3088 		switch (ntohs(udp->uh_dport) & 0xc000) {
3089 		case 0x4000:
3090 			prio = 70;
3091 			break;
3092 		case 0x8000:
3093 			prio = 60;
3094 			break;
3095 		case 0xc000:
3096 			prio = 55;
3097 			break;
3098 		default:
3099 			prio = 50;
3100 			break;
3101 		}
3102 		if (ip_mrtdebug > 1) {
3103 			(void) mi_strlog(ip_g_mrouter, 1, SL_TRACE,
3104 			    "priority: port %x prio %d\n",
3105 			    ntohs(udp->uh_dport), prio);
3106 		}
3107 	} else
3108 		prio = 50;  /* default priority */
3109 	return (prio);
3110 }
3111 
3112 /*
3113  * End of token bucket filter modifications
3114  */
3115 
3116 
3117 
3118 /*
3119  * Produces data for netstat -M.
3120  */
3121 int
3122 ip_mroute_stats(mblk_t *mp)
3123 {
3124 	mrtstat.mrts_vifctlSize = sizeof (struct vifctl);
3125 	mrtstat.mrts_mfcctlSize = sizeof (struct mfcctl);
3126 	if (!snmp_append_data(mp, (char *)&mrtstat, sizeof (mrtstat))) {
3127 		ip0dbg(("ip_mroute_stats: failed %ld bytes\n",
3128 		    (size_t)sizeof (mrtstat)));
3129 		return (0);
3130 	}
3131 	return (1);
3132 }
3133 
3134 /*
3135  * Sends info for SNMP's MIB.
3136  */
3137 int
3138 ip_mroute_vif(mblk_t *mp)
3139 {
3140 	struct vifctl 	vi;
3141 	vifi_t		vifi;
3142 
3143 	mutex_enter(&numvifs_mutex);
3144 	for (vifi = 0; vifi < numvifs; vifi++) {
3145 		if (viftable[vifi].v_lcl_addr.s_addr == 0)
3146 			continue;
3147 		/*
3148 		 * No locks here, an approximation is fine.
3149 		 */
3150 		vi.vifc_vifi = vifi;
3151 		vi.vifc_flags = viftable[vifi].v_flags;
3152 		vi.vifc_threshold = viftable[vifi].v_threshold;
3153 		vi.vifc_rate_limit	= viftable[vifi].v_rate_limit;
3154 		vi.vifc_lcl_addr	= viftable[vifi].v_lcl_addr;
3155 		vi.vifc_rmt_addr	= viftable[vifi].v_rmt_addr;
3156 		vi.vifc_pkt_in		= viftable[vifi].v_pkt_in;
3157 		vi.vifc_pkt_out		= viftable[vifi].v_pkt_out;
3158 
3159 		if (!snmp_append_data(mp, (char *)&vi, sizeof (vi))) {
3160 			ip0dbg(("ip_mroute_vif: failed %ld bytes\n",
3161 			    (size_t)sizeof (vi)));
3162 			return (0);
3163 		}
3164 	}
3165 	mutex_exit(&numvifs_mutex);
3166 	return (1);
3167 }
3168 
3169 /*
3170  * Called by ip_snmp_get to send up multicast routing table.
3171  */
3172 int
3173 ip_mroute_mrt(mblk_t *mp)
3174 {
3175 	int			i, j;
3176 	struct mfc		*rt;
3177 	struct mfcctl	mfcc;
3178 
3179 	/*
3180 	 * Make sure multicast has not been turned off.
3181 	 */
3182 	if (is_mrouter_off())
3183 		return (1);
3184 
3185 	/* Loop over all hash buckets and their chains */
3186 	for (i = 0; i < MFCTBLSIZ; i++) {
3187 		MFCB_REFHOLD(&mfctable[i]);
3188 		for (rt = mfctable[i].mfcb_mfc; rt; rt = rt->mfc_next) {
3189 			mutex_enter(&rt->mfc_mutex);
3190 			if (rt->mfc_rte != NULL ||
3191 			    (rt->mfc_marks & MFCB_MARK_CONDEMNED)) {
3192 				mutex_exit(&rt->mfc_mutex);
3193 				continue;
3194 			}
3195 			mfcc.mfcc_origin = rt->mfc_origin;
3196 			mfcc.mfcc_mcastgrp = rt->mfc_mcastgrp;
3197 			mfcc.mfcc_parent = rt->mfc_parent;
3198 			mfcc.mfcc_pkt_cnt = rt->mfc_pkt_cnt;
3199 			mutex_enter(&numvifs_mutex);
3200 			for (j = 0; j < (int)numvifs; j++)
3201 				mfcc.mfcc_ttls[j] = rt->mfc_ttls[j];
3202 			for (j = (int)numvifs; j < MAXVIFS; j++)
3203 				mfcc.mfcc_ttls[j] = 0;
3204 			mutex_exit(&numvifs_mutex);
3205 
3206 			mutex_exit(&rt->mfc_mutex);
3207 			if (!snmp_append_data(mp, (char *)&mfcc,
3208 			    sizeof (mfcc))) {
3209 				MFCB_REFRELE(&mfctable[i]);
3210 				ip0dbg(("ip_mroute_mrt: failed %ld bytes\n",
3211 				    (size_t)sizeof (mfcc)));
3212 				return (0);
3213 			}
3214 		}
3215 		MFCB_REFRELE(&mfctable[i]);
3216 	}
3217 	return (1);
3218 }
3219