xref: /titanic_52/usr/src/uts/common/inet/ip/igmp.c (revision 1cb6af97c6f66f456d4f726ef056e1ebc0f73305)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * Internet Group Management Protocol (IGMP) routines.
32  * Multicast Listener Discovery Protocol (MLD) routines.
33  *
34  * Written by Steve Deering, Stanford, May 1988.
35  * Modified by Rosen Sharma, Stanford, Aug 1994.
36  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
37  *
38  * MULTICAST 3.5.1.1
39  */
40 
41 
42 #include <sys/types.h>
43 #include <sys/stream.h>
44 #include <sys/dlpi.h>
45 #include <sys/stropts.h>
46 #include <sys/strlog.h>
47 #include <sys/strsun.h>
48 #include <sys/systm.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/cmn_err.h>
52 #include <sys/atomic.h>
53 #include <sys/zone.h>
54 
55 #include <sys/param.h>
56 #include <sys/socket.h>
57 #define	_SUN_TPI_VERSION	2
58 #include <sys/tihdr.h>
59 #include <inet/ipclassifier.h>
60 #include <net/if.h>
61 #include <net/if_arp.h>
62 #include <sys/sockio.h>
63 #include <net/route.h>
64 #include <netinet/in.h>
65 #include <netinet/igmp_var.h>
66 #include <netinet/ip6.h>
67 #include <netinet/icmp6.h>
68 
69 #include <inet/common.h>
70 #include <inet/mi.h>
71 #include <inet/nd.h>
72 #include <inet/arp.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/ip_multi.h>
76 #include <inet/ip_listutils.h>
77 
78 #include <netinet/igmp.h>
79 #include <inet/ip_if.h>
80 #include <net/pfkeyv2.h>
81 #include <inet/ipsec_info.h>
82 
83 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
84 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
85 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
86 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
87 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
88 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
89 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
90 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
91 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
92 		    slist_t *srclist, mrec_t *next);
93 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
94 		    mcast_record_t rtype, slist_t *flist);
95 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
96 
97 /* Following protected by igmp_timer_lock */
98 static int 	igmp_time_to_next;	/* Time since last timeout */
99 static int 	igmp_timer_fired_last;
100 uint_t		igmp_deferred_next = INFINITY;
101 timeout_id_t	igmp_timeout_id = 0;
102 kmutex_t	igmp_timer_lock;
103 
104 /* Protected by igmp_slowtimeout_lock */
105 timeout_id_t	igmp_slowtimeout_id = 0;
106 kmutex_t	igmp_slowtimeout_lock;
107 
108 /* Following protected by mld_timer_lock */
109 static int 	mld_time_to_next;	/* Time since last timeout */
110 static int 	mld_timer_fired_last;
111 uint_t		mld_deferred_next = INFINITY;
112 timeout_id_t	mld_timeout_id = 0;
113 kmutex_t	mld_timer_lock;
114 
115 /* Protected by mld_slowtimeout_lock */
116 timeout_id_t	mld_slowtimeout_id = 0;
117 kmutex_t	mld_slowtimeout_lock;
118 
119 /*
120  * Macros used to do timer len conversions.  Timer values are always
121  * stored and passed to the timer functions as milliseconds; but the
122  * default values and values from the wire may not be.
123  *
124  * And yes, it's obscure, but decisecond is easier to abbreviate than
125  * "tenths of a second".
126  */
127 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
128 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
129 
130 /*
131  * The first multicast join will trigger the igmp timers / mld timers
132  * The unit for next is milliseconds.
133  */
void
igmp_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by igmp_timer_lock */
	static  boolean_t igmp_timer_setter_active;
	int	ret;

	/* 'next' is in milliseconds; 0 and INFINITY are not valid requests */
	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&igmp_timer_lock);

	if (igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		mutex_exit(&igmp_timer_lock);
		return;
	} else {
		igmp_timer_setter_active = B_TRUE;
	}
	if (igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		igmp_time_to_next = next;
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = igmp_timer_fired_last +
	    MSEC_TO_TICK(igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer fires soon enough; nothing to do. */
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(): the timeout handler may be
	 * running and blocked on igmp_timer_lock, so holding the lock
	 * here could deadlock.  The setter-active flag keeps other
	 * setters from racing us while the lock is dropped.
	 */
	mutex_exit(&igmp_timer_lock);
	ret = untimeout(igmp_timeout_id);
	mutex_enter(&igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* untimeout() lost the race: handler already ran/cancelled */
		ASSERT(igmp_timeout_id == 0);
	} else {
		ASSERT(igmp_timeout_id != 0);
		igmp_timeout_id = 0;
	}
	/*
	 * NOTE(review): if igmp_time_to_next is 0 here, the setter-active
	 * flag is left B_TRUE on exit — confirm the timeout handler is
	 * responsible for that case before relying on this path.
	 */
	if (igmp_time_to_next != 0) {
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
		igmp_timer_setter_active = B_FALSE;
	}
	mutex_exit(&igmp_timer_lock);
}
212 
213 /*
214  * mld_start_timers:
215  * The unit for next is milliseconds.
216  */
void
mld_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by mld_timer_lock */
	static  boolean_t mld_timer_setter_active;
	int	ret;

	/* 'next' is in milliseconds; 0 and INFINITY are not valid requests */
	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&mld_timer_lock);
	if (mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		mld_time_to_next = MIN(mld_time_to_next, next);
		mutex_exit(&mld_timer_lock);
		return;
	} else {
		mld_timer_setter_active = B_TRUE;
	}
	if (mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		mld_time_to_next = next;
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = mld_timer_fired_last +
	    MSEC_TO_TICK(mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer fires soon enough; nothing to do. */
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(); the handler may be blocked
	 * on mld_timer_lock.  The setter-active flag serializes setters
	 * while the lock is dropped.
	 */
	mutex_exit(&mld_timer_lock);
	ret = untimeout(mld_timeout_id);
	mutex_enter(&mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* untimeout() lost the race: handler already ran/cancelled */
		ASSERT(mld_timeout_id == 0);
	} else {
		ASSERT(mld_timeout_id != 0);
		mld_timeout_id = 0;
	}
	if (mld_time_to_next != 0) {
		mld_time_to_next = MIN(mld_time_to_next, next);
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
		mld_timer_setter_active = B_FALSE;
	}
	mutex_exit(&mld_timer_lock);
}
294 
295 /*
296  * igmp_input:
297  * Return 0 if the message is OK and should be handed to "raw" receivers.
298  * Callers of igmp_input() may need to reinitialize variables that were copied
299  * from the mblk as this calls pullupmsg().
300  */
/* ARGSUSED */
int
igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	igmpa_t 	*igmpa;
	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
	int		iphlen, igmplen, mblklen;
	ilm_t 		*ilm;
	uint32_t	src, dst;
	uint32_t 	group;
	uint_t		next;
	ipif_t 		*ipif;

	ASSERT(ill != NULL);
	ASSERT(!ill->ill_isv6);
	++igmpstat.igps_rcv_total;

	/* First block must at least cover the IP header. */
	mblklen = MBLKL(mp);
	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
		++igmpstat.igps_rcv_tooshort;
		freemsg(mp);
		return (-1);
	}
	igmplen = ntohs(ipha->ipha_length) - iphlen;
	/*
	 * Since msg sizes are more variable with v3, just pullup the
	 * whole thing now.
	 */
	if (MBLKL(mp) < (igmplen + iphlen)) {
		mblk_t *mp1;
		if ((mp1 = msgpullup(mp, -1)) == NULL) {
			++igmpstat.igps_rcv_tooshort;
			freemsg(mp);
			return (-1);
		}
		freemsg(mp);
		mp = mp1;
		/* The pullup moved the data; re-derive the header pointer. */
		ipha = (ipha_t *)(mp->b_rptr);
	}

	/*
	 * Validate lengths
	 */
	if (igmplen < IGMP_MINLEN) {
		++igmpstat.igps_rcv_tooshort;
		freemsg(mp);
		return (-1);
	}
	/*
	 * Validate checksum
	 */
	if (IP_CSUM(mp, iphlen, 0)) {
		++igmpstat.igps_rcv_badsum;
		freemsg(mp);
		return (-1);
	}

	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	if (ip_debug > 1)
		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
		    (int)ntohl(src), (int)ntohl(dst),
		    ill->ill_name);

	switch (igmpa->igmpa_type) {
	case IGMP_MEMBERSHIP_QUERY:
		/*
		 * packet length differentiates between v1/v2 and v3
		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
		 */
		if (igmplen == IGMP_MINLEN) {
			next = igmp_query_in(ipha, igmpa, ill);
		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
			    igmplen);
		} else {
			++igmpstat.igps_rcv_tooshort;
			freemsg(mp);
			return (-1);
		}
		/* next == 0 means the query was malformed; drop it. */
		if (next == 0) {
			freemsg(mp);
			return (-1);
		}

		/* Schedule the (re)transmission of our reports. */
		if (next != INFINITY)
			igmp_start_timers(next);

		break;

	case IGMP_V1_MEMBERSHIP_REPORT:
	case IGMP_V2_MEMBERSHIP_REPORT:
		/*
		 * For fast leave to work, we have to know that we are the
		 * last person to send a report for this group. Reports
		 * generated by us are looped back since we could potentially
		 * be a multicast router, so discard reports sourced by me.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_lcl_addr == src) {
				if (ip_debug > 1) {
					(void) mi_strlog(ill->ill_rq,
					    1,
					    SL_TRACE,
					    "igmp_input: we are only "
					    "member src 0x%x ipif_local 0x%x",
					    (int)ntohl(src),
					    (int)
					    ntohl(ipif->ipif_lcl_addr));
				}
				mutex_exit(&ill->ill_lock);
				return (0);
			}
		}
		mutex_exit(&ill->ill_lock);

		++igmpstat.igps_rcv_reports;
		group = igmpa->igmpa_group;
		if (!CLASSD(group)) {
			/* Reported group is not a multicast address. */
			++igmpstat.igps_rcv_badreports;
			freemsg(mp);
			return (-1);
		}

		/*
		 * KLUDGE: if the IP source address of the report has an
		 * unspecified (i.e., zero) subnet number, as is allowed for
		 * a booting host, replace it with the correct subnet number
		 * so that a process-level multicast routing demon can
		 * determine which subnet it arrived from.  This is necessary
		 * to compensate for the lack of any way for a process to
		 * determine the arrival interface of an incoming packet.
		 *
		 * Requires that a copy of *this* message it passed up
		 * to the raw interface which is done by our caller.
		 */
		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
			/* Pick the first ipif on this ill */
			mutex_enter(&ill->ill_lock);
			src = ill->ill_ipif->ipif_subnet;
			mutex_exit(&ill->ill_lock);
			ip1dbg(("igmp_input: changed src to 0x%x\n",
			    (int)ntohl(src)));
			ipha->ipha_src = src;
		}

		/*
		 * If we belong to the group being reported, and
		 * we are a 'Delaying member' in the RFC terminology,
		 * stop our timer for that group and 'clear flag' i.e.
		 * mark as IGMP_OTHERMEMBER. Do this for all logical
		 * interfaces on the given physical interface.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			ilm = ilm_lookup_ipif(ipif, group);
			if (ilm != NULL) {
				++igmpstat.igps_rcv_ourreports;
				/* Another member answered; cancel our timer */
				ilm->ilm_timer = INFINITY;
				ilm->ilm_state = IGMP_OTHERMEMBER;
			}
		} /* for */
		mutex_exit(&ill->ill_lock);
		break;

	case IGMP_V3_MEMBERSHIP_REPORT:
		/*
		 * Currently nothing to do here; IGMP router is not
		 * implemented in ip, and v3 hosts don't pay attention
		 * to membership reports.
		 */
		break;
	}
	/*
	 * Pass all valid IGMP packets up to any process(es) listening
	 * on a raw IGMP socket. Do not free the packet.
	 */
	return (0);
}
485 
486 static uint_t
487 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
488 {
489 	ilm_t	*ilm;
490 	int	timer;
491 	uint_t	next;
492 
493 	++igmpstat.igps_rcv_queries;
494 
495 	/*
496 	 * In the IGMPv2 specification, there are 3 states and a flag.
497 	 *
498 	 * In Non-Member state, we simply don't have a membership record.
499 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
500 	 * < INFINITY).  In Idle Member state, our timer is not running
501 	 * (ilm->ilm_timer == INFINITY).
502 	 *
503 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
504 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
505 	 * if I sent the last report.
506 	 */
507 	if (igmpa->igmpa_code == 0) {
508 		/*
509 		 * Query from an old router.
510 		 * Remember that the querier on this interface is old,
511 		 * and set the timer to the value in RFC 1112.
512 		 */
513 
514 
515 		mutex_enter(&ill->ill_lock);
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 		mutex_exit(&ill->ill_lock);
525 
526 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
527 
528 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
529 		    igmpa->igmpa_group != 0) {
530 			++igmpstat.igps_rcv_badqueries;
531 			return (0);
532 		}
533 
534 	} else {
535 		in_addr_t group;
536 
537 		/*
538 		 * Query from a new router
539 		 * Simply do a validity check
540 		 */
541 		group = igmpa->igmpa_group;
542 		if (group != 0 && (!CLASSD(group))) {
543 			++igmpstat.igps_rcv_badqueries;
544 			return (0);
545 		}
546 
547 		/*
548 		 * Switch interface state to v2 on receipt of a v2 query
549 		 * ONLY IF current state is v3.  Let things be if current
550 		 * state if v1 but do reset the v2-querier-present timer.
551 		 */
552 		mutex_enter(&ill->ill_lock);
553 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
554 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
555 			    "to IGMP_V2_ROUTER", ill->ill_name));
556 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
557 			ill->ill_mcast_type = IGMP_V2_ROUTER;
558 		}
559 		ill->ill_mcast_v2_time = 0;
560 		ill->ill_mcast_v2_tset = 1;
561 		mutex_exit(&ill->ill_lock);
562 
563 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
564 	}
565 
566 	if (ip_debug > 1) {
567 		mutex_enter(&ill->ill_lock);
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 		mutex_exit(&ill->ill_lock);
573 	}
574 
575 	/*
576 	 * -Start the timers in all of our membership records
577 	 *  for the physical interface on which the query
578 	 *  arrived, excluding those that belong to the "all
579 	 *  hosts" group (224.0.0.1).
580 	 *
581 	 * -Restart any timer that is already running but has
582 	 *  a value longer than the requested timeout.
583 	 *
584 	 * -Use the value specified in the query message as
585 	 *  the maximum timeout.
586 	 */
587 	next = (unsigned)INFINITY;
588 	mutex_enter(&ill->ill_lock);
589 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
590 
591 		/*
592 		 * A multicast router joins INADDR_ANY address
593 		 * to enable promiscuous reception of all
594 		 * mcasts from the interface. This INADDR_ANY
595 		 * is stored in the ilm_v6addr as V6 unspec addr
596 		 */
597 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
598 			continue;
599 		if (ilm->ilm_addr == htonl(INADDR_ANY))
600 			continue;
601 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
602 		    (igmpa->igmpa_group == 0) ||
603 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
604 			if (ilm->ilm_timer > timer) {
605 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
606 				if (ilm->ilm_timer < next)
607 					next = ilm->ilm_timer;
608 			}
609 		}
610 	}
611 	mutex_exit(&ill->ill_lock);
612 
613 	return (next);
614 }
615 
static uint_t
igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
{
	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
	ilm_t		*ilm;
	ipaddr_t	*src_array;
	uint8_t		qrv;

	/* make sure numsrc matches packet size */
	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
		++igmpstat.igps_rcv_tooshort;
		return (0);
	}
	/* Source addresses immediately follow the fixed query header. */
	src_array = (ipaddr_t *)&igmp3qa[1];

	++igmpstat.igps_rcv_queries;

	/*
	 * Max response code: values >= 128 are floating-point encoded
	 * (mantissa/exponent); decode to tenths of a second.
	 */
	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
		mrd = (mant | 0x10) << (exp + 3);
	}
	if (mrd == 0)
		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
	timer = DSEC_TO_MSEC(mrd);
	/* Randomize our response delay within [0, timer]. */
	MCAST_RANDOM_DELAY(delay, timer);
	next = (unsigned)INFINITY;

	/* Querier's robustness variable; 0 means "use the default". */
	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/*
	 * Querier's query interval code; values >= 128 use the same
	 * floating-point encoding as the max response code.
	 */
	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (RFC3376 section 5.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending upon query type:
	 * general, group specific, or group/source specific.
	 */
	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time]).
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/* Skip v6, INADDR_ANY, all-hosts, and other groups */
			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
				continue;
			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
group_query:
				/*
				 * Group-specific response: clear any pending
				 * sources.  Also the fallback target when a
				 * source list cannot be recorded (below).
				 */
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all.  Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				pktl->sl_numsrc = numsrc;
				for (i = 0; i < numsrc; i++)
					IN6_IPADDR_TO_V4MAPPED(src_array[i],
					    &(pktl->sl_addr[i]));
				/* Merge this query's sources into pending */
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* choose soonest timer */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
747 
void
igmp_joingroup(ilm_t *ilm)
{
	ill_t	*ill;

	ill = ilm->ilm_ipif->ipif_ill;

	/* Caller must be the exclusive (ipsq) writer for this ill. */
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);

	mutex_enter(&ill->ill_lock);
	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
		/* The all-hosts group is never reported on the wire. */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		ip1dbg(("Querier mode %d, sending report, group %x\n",
		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
		/*
		 * Send the initial report matching the querier version on
		 * this interface.  The lock is dropped around each send
		 * and reacquired afterwards.
		 */
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *   Old State	New State	Report
			 *
			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough.
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ilm->ilm_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v3 and older versions.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call igmp_start_timers from
		 * here. igmp_start_timers needs to call untimeout, and we
		 * can't hold the ipsq across untimeout since
		 * igmp_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit.
		 */
		mutex_enter(&igmp_timer_lock);
		igmp_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    igmp_deferred_next);
		mutex_exit(&igmp_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
		    "igmp_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
829 
void
mld_joingroup(ilm_t *ilm)
{
	ill_t	*ill;

	ill = ilm->ilm_ill;

	/* Caller must be the exclusive (ipsq) writer for this ill. */
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);

	mutex_enter(&ill->ill_lock);
	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
		/* The all-hosts group is never reported on the wire. */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * Send the initial report matching the querier version on
		 * this interface; the lock is dropped around each send.
		 */
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *	Old State   New State	Report
			 *
			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			mldv2_sendrpt(ill, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v2 and v1.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
		    ilm->ilm_rtx.rtx_cnt > 0);
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call mld_start_timers from
		 * here. mld_start_timers needs to call untimeout, and we
		 * can't hold the ipsq (i.e. the lock) across untimeout
		 * since mld_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit
		 */
		mutex_enter(&mld_timer_lock);
		mld_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    mld_deferred_next);
		mutex_exit(&mld_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
		    "mld_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
907 
908 void
909 igmp_leavegroup(ilm_t *ilm)
910 {
911 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
912 
913 	ASSERT(ilm->ilm_ill == NULL);
914 	ASSERT(!ill->ill_isv6);
915 
916 	mutex_enter(&ill->ill_lock);
917 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
918 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
919 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
920 		mutex_exit(&ill->ill_lock);
921 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
922 		    (htonl(INADDR_ALLRTRS_GROUP)));
923 		return;
924 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
925 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
926 		mrec_t *rp;
927 		/*
928 		 * The possible state changes we need to handle here:
929 		 *	Old State	New State	Report
930 		 *
931 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
932 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
933 		 *
934 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
935 		 */
936 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
937 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
938 			    ilm->ilm_filter, NULL);
939 		} else {
940 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
941 			    NULL, NULL);
942 		}
943 		mutex_exit(&ill->ill_lock);
944 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
945 		return;
946 	}
947 	mutex_exit(&ill->ill_lock);
948 }
949 
950 void
951 mld_leavegroup(ilm_t *ilm)
952 {
953 	ill_t *ill = ilm->ilm_ill;
954 
955 	ASSERT(ilm->ilm_ipif == NULL);
956 	ASSERT(ill->ill_isv6);
957 
958 	mutex_enter(&ill->ill_lock);
959 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
960 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
961 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
962 		mutex_exit(&ill->ill_lock);
963 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
964 		return;
965 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
966 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
967 		mrec_t *rp;
968 		/*
969 		 * The possible state changes we need to handle here:
970 		 *	Old State	New State	Report
971 		 *
972 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
973 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
974 		 *
975 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
976 		 */
977 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
978 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
979 			    ilm->ilm_filter, NULL);
980 		} else {
981 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
982 			    NULL, NULL);
983 		}
984 		mutex_exit(&ill->ill_lock);
985 		mldv2_sendrpt(ill, rp);
986 		return;
987 	}
988 	mutex_exit(&ill->ill_lock);
989 }
990 
991 void
992 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
993 {
994 	ill_t *ill;
995 	mrec_t *rp;
996 
997 	ASSERT(ilm != NULL);
998 
999 	/* state change reports should only be sent if the router is v3 */
1000 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
1001 		return;
1002 
1003 	if (ilm->ilm_ill == NULL) {
1004 		ASSERT(ilm->ilm_ipif != NULL);
1005 		ill = ilm->ilm_ipif->ipif_ill;
1006 	} else {
1007 		ill = ilm->ilm_ill;
1008 	}
1009 
1010 	mutex_enter(&ill->ill_lock);
1011 
1012 	/*
1013 	 * Compare existing(old) state with the new state and prepare
1014 	 * State Change Report, according to the rules in RFC 3376:
1015 	 *
1016 	 *	Old State	New State	State Change Report
1017 	 *
1018 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1019 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1020 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1021 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1022 	 */
1023 
1024 	if (ilm->ilm_fmode == fmode) {
1025 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1026 		slist_t *allow, *block;
1027 		if (((a_minus_b = l_alloc()) == NULL) ||
1028 		    ((b_minus_a = l_alloc()) == NULL)) {
1029 			l_free(a_minus_b);
1030 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1031 				goto send_to_ex;
1032 			else
1033 				goto send_to_in;
1034 		}
1035 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1036 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1037 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1038 			allow = b_minus_a;
1039 			block = a_minus_b;
1040 		} else {
1041 			allow = a_minus_b;
1042 			block = b_minus_a;
1043 		}
1044 		rp = NULL;
1045 		if (!SLIST_IS_EMPTY(allow))
1046 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1047 			    allow, rp);
1048 		if (!SLIST_IS_EMPTY(block))
1049 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1050 			    block, rp);
1051 		l_free(a_minus_b);
1052 		l_free(b_minus_a);
1053 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1054 send_to_ex:
1055 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1056 		    NULL);
1057 	} else {
1058 send_to_in:
1059 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1060 		    NULL);
1061 	}
1062 
1063 	/*
1064 	 * Need to set up retransmission state; merge the new info with the
1065 	 * current state (which may be null).  If the timer is not currently
1066 	 * running, start it (need to do a delayed start of the timer as
1067 	 * we're currently in the sq).
1068 	 */
1069 	rp = mcast_merge_rtx(ilm, rp, flist);
1070 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1071 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1072 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1073 		mutex_enter(&igmp_timer_lock);
1074 		igmp_deferred_next = MIN(igmp_deferred_next,
1075 		    ilm->ilm_rtx.rtx_timer);
1076 		mutex_exit(&igmp_timer_lock);
1077 	}
1078 
1079 	mutex_exit(&ill->ill_lock);
1080 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1081 }
1082 
/*
 * Send an MLDv2 State Change Report on behalf of ilm in response to a
 * change of its filter mode (fmode) and/or source filter list (flist).
 * Also merges the change into the ilm's retransmission state, and
 * schedules the rtx timer (deferred) if it isn't already running.
 */
void
mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
{
	ill_t *ill;
	mrec_t *rp = NULL;

	ASSERT(ilm != NULL);

	ill = ilm->ilm_ill;

	/* only need to send if we have an mldv2-capable router */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Compare existing (old) state with the new state passed in
	 * and send appropriate MLDv2 State Change Report.
	 *
	 *	Old State	New State	State Change Report
	 *
	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
	 */
	if (ilm->ilm_fmode == fmode) {
		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
		slist_t *allow, *block;
		if (((a_minus_b = l_alloc()) == NULL) ||
		    ((b_minus_a = l_alloc()) == NULL)) {
			/*
			 * Couldn't get scratch lists for the source-list
			 * deltas; fall back to a filter mode change report,
			 * which needs no scratch space.  Only a_minus_b can
			 * be non-NULL here.
			 */
			l_free(a_minus_b);
			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
				goto send_to_ex;
			else
				goto send_to_in;
		}
		l_difference(ilm->ilm_filter, flist, a_minus_b);
		l_difference(flist, ilm->ilm_filter, b_minus_a);
		/* which delta is allowed vs. blocked depends on the mode */
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			allow = b_minus_a;
			block = a_minus_b;
		} else {
			allow = a_minus_b;
			block = b_minus_a;
		}
		if (!SLIST_IS_EMPTY(allow))
			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
			    allow, rp);
		if (!SLIST_IS_EMPTY(block))
			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
			    block, rp);
		l_free(a_minus_b);
		l_free(b_minus_a);
	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
send_to_ex:
		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	} else {
send_to_in:
		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	}

	/*
	 * Need to set up retransmission state; merge the new info with the
	 * current state (which may be null).  If the timer is not currently
	 * running, start it (need to do a deferred start of the timer as
	 * we're currently in the sq).
	 */
	rp = mcast_merge_rtx(ilm, rp, flist);
	/* mcast_merge_rtx is expected to have initialized the rtx count */
	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		mutex_enter(&mld_timer_lock);
		mld_deferred_next =
		    MIN(mld_deferred_next, ilm->ilm_rtx.rtx_timer);
		mutex_exit(&mld_timer_lock);
	}

	mutex_exit(&ill->ill_lock);
	mldv2_sendrpt(ill, rp);
}
1169 
/*
 * igmp_timeout_handler_per_ill:
 * Per-ill IGMP timer processing, called as ipsq writer from
 * igmp_timeout_handler() with the time that has elapsed since the
 * timers were last serviced.  Handles expiry of the ill's global
 * (general query response) timer, each ilm's group timer, and each
 * ilm's retransmission timer, sending whatever reports are due.
 * Returns the time until the next pending event on this ill
 * (INFINITY if none).  ill_lock is dropped and reacquired around
 * every send, since the send paths must run unlocked.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	uint_t	next = INFINITY;
	ilm_t	*ilm;
	ipif_t	*ipif;
	mrec_t	*rp = NULL;
	mrec_t	*rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			/* accumulate a current-state record on the ipif */
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		/* global timer still pending; age it and track the minimum */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

	/* Now age/expire the per-group report and retransmit timers. */
per_ilm_timer:
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d elap %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			/* v3 router: reply may be source-specific */
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			rp = NULL;
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		/* retransmit timer expired */
		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions to go; rearm the rtx timer */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* done retransmitting; clear the saved deltas */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1381 
1382 /*
1383  * igmp_timeout_handler:
1384  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1385  * Returns number of ticks to next event (or 0 if none).
1386  *
1387  * As part of multicast join and leave igmp we may need to send out an
1388  * igmp request. The igmp related state variables in the ilm are protected
1389  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1390  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1391  * starts the igmp timer if needed. It serializes multiple threads trying to
1392  * simultaneously start the timer using the igmp_timer_setter_active flag.
1393  *
1394  * igmp_input() receives igmp queries and responds to the queries
1395  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
 * Later the igmp_timer fires, the timeout handler igmp_timeout_handler()
1397  * performs the action exclusively after entering each ill's ipsq as writer.
1398  * The actual igmp timeout handler needs to run in the ipsq since it has to
1399  * access the ilm's and we don't want another exclusive operation like
1400  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1401  * another.
1402  *
1403  * The igmp_slowtimeo() function is called thru another timer.
1404  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1405  */
1406 
1407 /* ARGSUSED */
void
igmp_timeout_handler(void *arg)
{
	ill_t	*ill;
	int	elapsed;	/* Since last call */
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;

	/* Grab (and reset) the interval this firing accounts for. */
	mutex_enter(&igmp_timer_lock);
	ASSERT(igmp_timeout_id != 0);
	igmp_timer_fired_last = ddi_get_lbolt();
	elapsed = igmp_time_to_next;
	igmp_time_to_next = 0;
	mutex_exit(&igmp_timer_lock);

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(!ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		rw_exit(&ill_g_lock);
		success = ipsq_enter(ill, B_TRUE);
		if (success) {
			next = igmp_timeout_handler_per_ill(ill, elapsed);
			if (next < global_next)
				global_next = next;
			/*
			 * NOTE(review): the flags appear to be
			 * (start_igmp_timer, start_mld_timer); the igmp
			 * timer restart is handled below via
			 * igmp_start_timers() -- confirm against ipsq_exit().
			 */
			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
			    B_TRUE);
		}
		rw_enter(&ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ill_g_lock);

	/* Clear our timeout id; igmp_start_timers() may set a new one. */
	mutex_enter(&igmp_timer_lock);
	ASSERT(igmp_timeout_id != 0);
	igmp_timeout_id = 0;
	mutex_exit(&igmp_timer_lock);

	/* Re-arm for the earliest pending event across all ills. */
	if (global_next != INFINITY)
		igmp_start_timers(global_next);
}
1458 
/*
 * mld_timeout_handler_per_ill:
 * Per-ill MLD timer processing, called (as ipsq writer) from
 * mld_timeout_handler() for each v6 ill when there are timeout events.
 * Returns number of ticks to next event (INFINITY if none).
 */
1464 /* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			/* accumulate a current-state record for this group */
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/* must drop ill_lock across the send */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		/* global timer still pending; age it and track the minimum */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

	/* Now age/expire the per-group report and retransmit timers. */
per_ilm_timer:
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d elap %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			/*
			 * v2 router: just accumulate the record here; the
			 * combined report is sent once, after the loop.
			 */
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		/* retransmit timer expired */
		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions to go; rearm the rtx timer */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* done retransmitting; clear the saved deltas */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	/* for v2, send the accumulated current-state and rtx reports */
	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1652 
1653 /*
1654  * mld_timeout_handler:
1655  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1656  * Returns number of ticks to next event (or 0 if none).
1657  * MT issues are same as igmp_timeout_handler
1658  */
1659 /* ARGSUSED */
1660 void
1661 mld_timeout_handler(void *arg)
1662 {
1663 	ill_t	*ill;
1664 	int	elapsed;	/* Since last call */
1665 	uint_t  global_next = INFINITY;
1666 	uint_t  next;
1667 	ill_walk_context_t ctx;
1668 	boolean_t success;
1669 
1670 	mutex_enter(&mld_timer_lock);
1671 	ASSERT(mld_timeout_id != 0);
1672 	mld_timer_fired_last = ddi_get_lbolt();
1673 	elapsed = mld_time_to_next;
1674 	mld_time_to_next = 0;
1675 	mutex_exit(&mld_timer_lock);
1676 
1677 	rw_enter(&ill_g_lock, RW_READER);
1678 	ill = ILL_START_WALK_V6(&ctx);
1679 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1680 		ASSERT(ill->ill_isv6);
1681 		/*
1682 		 * We may not be able to refhold the ill if the ill/ipif
1683 		 * is changing. But we need to make sure that the ill will
1684 		 * not vanish. So we just bump up the ill_waiter count.
1685 		 */
1686 		if (!ill_waiter_inc(ill))
1687 			continue;
1688 		rw_exit(&ill_g_lock);
1689 		success = ipsq_enter(ill, B_TRUE);
1690 		if (success) {
1691 			next = mld_timeout_handler_per_ill(ill, elapsed);
1692 			if (next < global_next)
1693 				global_next = next;
1694 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
1695 			    B_FALSE);
1696 		}
1697 		rw_enter(&ill_g_lock, RW_READER);
1698 		ill_waiter_dcr(ill);
1699 	}
1700 	rw_exit(&ill_g_lock);
1701 
1702 	mutex_enter(&mld_timer_lock);
1703 	ASSERT(mld_timeout_id != 0);
1704 	mld_timeout_id = 0;
1705 	mutex_exit(&mld_timer_lock);
1706 
1707 	if (global_next != INFINITY)
1708 		mld_start_timers(global_next);
1709 }
1710 
1711 /*
1712  * Calculate the Older Version Querier Present timeout value, in number
1713  * of slowtimo intervals, for the given ill.
1714  */
1715 #define	OVQP(ill) \
1716 	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
1717 	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1718 
1719 /*
1720  * igmp_slowtimo:
 * - Resets to a newer router version if we didn't hear from the router
1722  *   in IGMP_AGE_THRESHOLD seconds.
1723  * - Resets slowtimeout.
1724  */
1725 /* ARGSUSED */
void
igmp_slowtimo(void *arg)
{
	ill_t	*ill;
	ill_if_t *ifp;
	avl_tree_t *avl_tree;

	/* Hold the ill_g_lock so that we can safely walk the ill list */
	rw_enter(&ill_g_lock, RW_READER);

	/*
	 * The ill_if_t list is circular, hence the odd loop parameters.
	 *
	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
	 * structure (allowing us to skip if none of the instances have timers
	 * running).
	 */
	for (ifp = IP_V4_ILL_G_LIST; ifp != (ill_if_t *)&IP_V4_ILL_G_LIST;
	    ifp = ifp->illif_next) {
		/*
		 * illif_mcast_v[12] are set using atomics. If an ill hears
		 * a V1 or V2 query now and we miss seeing the count now,
		 * we will see it the next time igmp_slowtimo is called.
		 */
		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
			continue;

		avl_tree = &ifp->illif_avl_by_ppa;
		for (ill = avl_first(avl_tree); ill != NULL;
		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
			mutex_enter(&ill->ill_lock);
			/* age the older-version querier present timers */
			if (ill->ill_mcast_v1_tset == 1)
				ill->ill_mcast_v1_time++;
			if (ill->ill_mcast_v2_tset == 1)
				ill->ill_mcast_v2_time++;
			if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
					/*
					 * V1 querier has aged out; fall back
					 * only as far as V2 if a V2 querier
					 * timer is still running, else V3.
					 */
					if (ill->ill_mcast_v2_tset > 0) {
						ip1dbg(("V1 query timer "
						    "expired on %s; switching "
						    "mode to IGMP_V2\n",
						    ill->ill_name));
						ill->ill_mcast_type =
						    IGMP_V2_ROUTER;
					} else {
						ip1dbg(("V1 query timer "
						    "expired on %s; switching "
						    "mode to IGMP_V3\n",
						    ill->ill_name));
						ill->ill_mcast_type =
						    IGMP_V3_ROUTER;
					}
					ill->ill_mcast_v1_time = 0;
					ill->ill_mcast_v1_tset = 0;
					atomic_add_16(&ifp->illif_mcast_v1, -1);
				}
			}
			/*
			 * Note this is not an else-if: an ill just switched
			 * to IGMP_V2 above gets its V2 timer checked in the
			 * same pass -- presumably intentional; confirm.
			 */
			if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
				if (ill->ill_mcast_v2_time >= OVQP(ill)) {
					ip1dbg(("V2 query timer expired on "
					    "%s; switching mode to IGMP_V3\n",
					    ill->ill_name));
					ill->ill_mcast_type = IGMP_V3_ROUTER;
					ill->ill_mcast_v2_time = 0;
					ill->ill_mcast_v2_tset = 0;
					atomic_add_16(&ifp->illif_mcast_v2, -1);
				}
			}
			mutex_exit(&ill->ill_lock);
		}

	}
	rw_exit(&ill_g_lock);
	/* re-arm ourselves for the next slow-timeout interval */
	mutex_enter(&igmp_slowtimeout_lock);
	igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
	mutex_exit(&igmp_slowtimeout_lock);
}
1805 
1806 /*
1807  * mld_slowtimo:
1808  * - Resets to newer version if we didn't hear from the older version router
1809  *   in MLD_AGE_THRESHOLD seconds.
1810  * - Restarts slowtimeout.
1811  */
1812 /* ARGSUSED */
1813 void
1814 mld_slowtimo(void *arg)
1815 {
1816 	ill_t *ill;
1817 	ill_if_t *ifp;
1818 	avl_tree_t *avl_tree;
1819 
1820 	/* See comments in igmp_slowtimo() above... */
1821 	rw_enter(&ill_g_lock, RW_READER);
1822 	for (ifp = IP_V6_ILL_G_LIST; ifp != (ill_if_t *)&IP_V6_ILL_G_LIST;
1823 	    ifp = ifp->illif_next) {
1824 
1825 		if (ifp->illif_mcast_v1 == 0)
1826 			continue;
1827 
1828 		avl_tree = &ifp->illif_avl_by_ppa;
1829 		for (ill = avl_first(avl_tree); ill != NULL;
1830 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1831 			mutex_enter(&ill->ill_lock);
1832 			if (ill->ill_mcast_v1_tset == 1)
1833 				ill->ill_mcast_v1_time++;
1834 			if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1835 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1836 					ip1dbg(("MLD query timer expired on"
1837 					    " %s; switching mode to MLD_V2\n",
1838 					    ill->ill_name));
1839 					ill->ill_mcast_type = MLD_V2_ROUTER;
1840 					ill->ill_mcast_v1_time = 0;
1841 					ill->ill_mcast_v1_tset = 0;
1842 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1843 				}
1844 			}
1845 			mutex_exit(&ill->ill_lock);
1846 		}
1847 	}
1848 	rw_exit(&ill_g_lock);
1849 	mutex_enter(&mld_slowtimeout_lock);
1850 	mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
1851 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1852 	mutex_exit(&mld_slowtimeout_lock);
1853 }
1854 
1855 /*
1856  * igmp_sendpkt:
1857  * This will send to ip_wput like icmp_inbound.
1858  * Note that the lower ill (on which the membership is kept) is used
1859  * as an upper ill to pass in the multicast parameters.
1860  */
/*
 * type is the IGMP message type (e.g. IGMP_V1_MEMBERSHIP_REPORT);
 * addr is the IP destination, or 0 to address the packet to the group
 * being reported (ilm->ilm_addr).
 */
static void
igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
{
	mblk_t	*mp;
	igmpa_t	*igmpa;
	uint8_t *rtralert;
	ipha_t	*ipha;
	/* IP header plus the 4-byte Router Alert option */
	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
	size_t	size  = hdrlen + sizeof (igmpa_t);
	ipif_t 	*ipif = ilm->ilm_ipif;
	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
	mblk_t	*first_mp;
	ipsec_out_t *io;

	/*
	 * We need to make sure this packet goes out on an ipif. If
	 * there is some global policy match in ip_wput_ire, we need
	 * to get to the right interface after IPSEC processing.
	 * To make sure this multicast packet goes out on the right
	 * interface, we attach an ipsec_out and initialize ill_index
	 * like we did in ip_wput. To make sure that this packet does
	 * not get forwarded on other interfaces or looped back, we
	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
	 * to B_FALSE.
	 *
	 * We also need to make sure that this does not get load balanced
	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
	 * here. If it gets load balanced, switches supporting igmp snooping
	 * will send the packet that it receives for this multicast group
	 * to the interface that we are sending on. As we have joined the
	 * multicast group on this ill, by sending the packet out on this
	 * ill, we receive all the packets back on this ill.
	 */
	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
	if (first_mp == NULL)
		return;		/* no memory; silently drop the report */

	/* build the leading M_CTL ipsec_out block described above */
	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	/* ipsec_out_secure is B_FALSE now */
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	io->ipsec_out_use_global_policy = B_TRUE;
	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_attach_if = B_TRUE;
	io->ipsec_out_multicast_loop = B_FALSE;
	io->ipsec_out_dontroute = B_TRUE;
	io->ipsec_out_zoneid = ilm->ilm_zoneid;

	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		freemsg(first_mp);
		return;
	}
	mp->b_wptr = mp->b_rptr + size;
	first_mp->b_cont = mp;

	/* lay out IP header, Router Alert option, then the IGMP header */
	ipha = (ipha_t *)mp->b_rptr;
	rtralert = (uint8_t *)&(ipha[1]);
	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
	igmpa->igmpa_type   = type;
	igmpa->igmpa_code   = 0;
	igmpa->igmpa_group  = ilm->ilm_addr;
	igmpa->igmpa_cksum  = 0;
	/* checksum is computed over the IGMP portion (starting at hdrlen) */
	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);

	/* Router Alert option: copied flag set, length 4, value 0 */
	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
	rtralert[1] = RTRALERT_LEN;
	rtralert[2] = 0;
	rtralert[3] = 0;

	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
	ipha->ipha_type_of_service 	= 0;
	ipha->ipha_length = htons(size);
	ipha->ipha_ident = 0;
	ipha->ipha_fragment_offset_and_flags = 0;
	ipha->ipha_ttl 		= IGMP_TTL;
	ipha->ipha_protocol 	= IPPROTO_IGMP;
	ipha->ipha_hdr_checksum 	= 0;
	/* default the destination to the group itself if none was given */
	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
	ipha->ipha_src 		= ipif->ipif_src_addr;
	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing demon can hear it.
	 */
	/*
	 * This will run multiple times for the same group if there are members
	 * on the same group for multiple ipif's on the same ill. The
	 * igmp_input code will suppress this due to the loopback thus we
	 * always loopback membership report.
	 */
	ASSERT(ill->ill_rq != NULL);
	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);

	ip_wput_multicast(ill->ill_wq, first_mp, ipif);

	++igmpstat.igps_snd_reports;
}
1962 
1963 /*
1964  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1965  * with the passed-in ipif.  The report will contain one group record
1966  * for each element of reclist.  If this causes packet length to
1967  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1968  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1969  * and those buffers are freed here.
1970  */
static void
igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
{
	ipsec_out_t *io;
	igmp3ra_t *igmp3ra;
	grphdra_t *grphdr;
	mblk_t *first_mp, *mp;
	ipha_t *ipha;
	uint8_t *rtralert;
	ipaddr_t *src_array;
	int i, j, numrec, more_src_cnt;
	size_t hdrsize, size, rsize;
	ill_t *ill = ipif->ipif_ill;
	mrec_t *rp, *cur_reclist;
	mrec_t *next_reclist = reclist;
	boolean_t morepkts;

	/* if there aren't any records, there's nothing to send */
	if (reclist == NULL)
		return;

	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
nextpkt:
	/*
	 * Size this packet: accumulate group records until the list is
	 * exhausted or adding another record would exceed the interface
	 * MTU (ill_max_frag).  If we stop early, morepkts is set and the
	 * leftovers (headed by next_reclist) are sent by looping back to
	 * nextpkt after this packet goes out.  numrec counts exactly the
	 * records included in THIS packet.
	 */
	size = hdrsize + sizeof (igmp3ra_t);
	morepkts = B_FALSE;
	more_src_cnt = 0;
	cur_reclist = next_reclist;
	numrec = 0;
	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
		rsize = sizeof (grphdra_t) +
		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
		if (size + rsize > ill->ill_max_frag) {
			if (rp == cur_reclist) {
				/*
				 * If the first mrec we looked at is too big
				 * to fit in a single packet (i.e the source
				 * list is too big), we must either truncate
				 * the list (if TO_EX or IS_EX), or send
				 * multiple reports for the same group (all
				 * other types).
				 */
				int srcspace, srcsperpkt;
				srcspace = ill->ill_max_frag - (size +
				    sizeof (grphdra_t));
				srcsperpkt = srcspace / sizeof (ipaddr_t);
				/*
				 * Increment size and numrec, because we will
				 * be sending a record for the mrec we're
				 * looking at now.
				 */
				size += sizeof (grphdra_t) +
				    (srcsperpkt * sizeof (ipaddr_t));
				numrec++;
				if (rp->mrec_type == MODE_IS_EXCLUDE ||
				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
					/*
					 * EXCLUDE-mode records are simply
					 * truncated to the sources that fit;
					 * the remainder is dropped.
					 */
					rp->mrec_srcs.sl_numsrc = srcsperpkt;
					if (rp->mrec_next == NULL) {
						/* no more packets to send */
						break;
					} else {
						/*
						 * more packets, but we're
						 * done with this mrec.
						 */
						next_reclist = rp->mrec_next;
					}
				} else {
					more_src_cnt = rp->mrec_srcs.sl_numsrc
					    - srcsperpkt;
					rp->mrec_srcs.sl_numsrc = srcsperpkt;
					/*
					 * We'll fix up this mrec (remove the
					 * srcs we've already sent) before
					 * returning to nextpkt above.
					 */
					next_reclist = rp;
				}
			} else {
				next_reclist = rp;
			}
			morepkts = B_TRUE;
			break;
		}
		size += rsize;
		numrec++;
	}

	/*
	 * See comments in igmp_sendpkt() about initializing for ipsec and
	 * load balancing requirements.
	 */
	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
	if (first_mp == NULL)
		goto free_reclist;

	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	/* ipsec_out_secure is B_FALSE now */
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	io->ipsec_out_use_global_policy = B_TRUE;
	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_attach_if = B_TRUE;
	io->ipsec_out_multicast_loop = B_FALSE;
	io->ipsec_out_dontroute = B_TRUE;
	io->ipsec_out_zoneid = ipif->ipif_zoneid;

	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		freemsg(first_mp);
		goto free_reclist;
	}
	bzero((char *)mp->b_rptr, size);
	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
	first_mp->b_cont = mp;

	/* Packet layout: [ipha | router alert option | igmp3ra | records] */
	ipha = (ipha_t *)mp->b_rptr;
	rtralert = (uint8_t *)&(ipha[1]);
	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
	grphdr = (grphdra_t *)&(igmp3ra[1]);

	/*
	 * Lay down one group record (header plus IPv4 source addresses)
	 * for each of the numrec records chosen by the sizing loop.
	 */
	rp = cur_reclist;
	for (i = 0; i < numrec; i++) {
		grphdr->grphdra_type = rp->mrec_type;
		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
		src_array = (ipaddr_t *)&(grphdr[1]);

		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);

		grphdr = (grphdra_t *)&(src_array[j]);
		rp = rp->mrec_next;
	}

	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
	igmp3ra->igmp3ra_numrec = htons(numrec);
	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);

	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
	rtralert[1] = RTRALERT_LEN;
	rtralert[2] = 0;
	rtralert[3] = 0;

	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
	ipha->ipha_length = htons(size);
	ipha->ipha_ttl = IGMP_TTL;
	ipha->ipha_protocol = IPPROTO_IGMP;
	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
	ipha->ipha_src = ipif->ipif_src_addr;

	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing daemon can hear it.
	 *
	 * This will run multiple times for the same group if there are
	 * members on the same group for multiple ipifs on the same ill.
	 * The igmp_input code will suppress this due to the loopback;
	 * thus we always loopback membership report.
	 *
	 * NOTE(review): igmp_sendpkt() loops back first_mp (the whole
	 * M_CTL chain) here, while this function loops back only the bare
	 * data block mp -- confirm which form is intended.
	 */
	ASSERT(ill->ill_rq != NULL);
	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);

	ip_wput_multicast(ill->ill_wq, first_mp, ipif);

	++igmpstat.igps_snd_reports;

	if (morepkts) {
		/*
		 * If the record at the head of the leftovers was split,
		 * slide its unsent sources down to the front of its
		 * source list before building the next packet.
		 */
		if (more_src_cnt > 0) {
			int index, mvsize;
			slist_t *sl = &next_reclist->mrec_srcs;
			index = sl->sl_numsrc;
			mvsize = more_src_cnt * sizeof (in6_addr_t);
			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
			    mvsize);
			sl->sl_numsrc = more_src_cnt;
		}
		goto nextpkt;
	}

free_reclist:
	/* mrec_t buffers from mcast_bldmrec() are always consumed here */
	while (reclist != NULL) {
		rp = reclist->mrec_next;
		mi_free(reclist);
		reclist = rp;
	}
}
2162 
/*
 * mld_input: handle an inbound MLD message (listener query, report, or
 * listener reduction) received on 'ill'.  Always consumes mp.
 */
2166 /* ARGSUSED */
void
mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
	mld_hdr_t	*mldh;
	ilm_t		*ilm;
	ipif_t		*ipif;
	uint16_t	hdr_length, exthdr_length;
	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
	uint_t		next;
	int		mldlen;

	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);

	/* Make sure the src address of the packet is link-local */
	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		freemsg(mp);
		return;
	}

	/* Valid MLD messages are sent with a hop limit of 1 */
	if (ip6h->ip6_hlim != 1) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
		freemsg(mp);
		return;
	}

	/* Get to the icmp header part */
	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
		hdr_length = ip_hdr_length_v6(mp, ip6h);
		exthdr_length = hdr_length - IPV6_HDR_LEN;
	} else {
		hdr_length = IPV6_HDR_LEN;
		exthdr_length = 0;
	}
	/* MLD payload length: IPv6 payload minus any extension headers */
	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;

	/* An MLD packet must at least be 24 octets to be valid */
	if (mldlen < MLD_MINLEN) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		freemsg(mp);
		return;
	}

	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);

	switch (mldh->mld_type) {
	case MLD_LISTENER_QUERY:
		/*
		 * packet length differentiates between v1 and v2.  v1
		 * query should be exactly 24 octets long; v2 is >= 28.
		 */
		if (mldlen == MLD_MINLEN) {
			next = mld_query_in(mldh, ill);
		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
		} else {
			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
			freemsg(mp);
			return;
		}
		/* a return of 0 means the query was malformed */
		if (next == 0) {
			freemsg(mp);
			return;
		}

		if (next != INFINITY)
			mld_start_timers(next);
		break;

	case MLD_LISTENER_REPORT: {

		ASSERT(ill->ill_ipif != NULL);
		/*
		 * For fast leave to work, we have to know that we are the
		 * last person to send a report for this group.  Reports
		 * generated by us are looped back since we could potentially
		 * be a multicast router, so discard reports sourced by me.
		 */
		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
			    lcladdr_ptr)) {
				if (ip_debug > 1) {
					char    buf1[INET6_ADDRSTRLEN];
					char	buf2[INET6_ADDRSTRLEN];

					(void) mi_strlog(ill->ill_rq,
					    1,
					    SL_TRACE,
					    "mld_input: we are only "
					    "member src %s ipif_local %s",
					    inet_ntop(AF_INET6, lcladdr_ptr,
					    buf1, sizeof (buf1)),
					    inet_ntop(AF_INET6,
					    &ipif->ipif_v6lcl_addr,
					    buf2, sizeof (buf2)));
				}
				mutex_exit(&ill->ill_lock);
				freemsg(mp);
				return;
			}
		}
		mutex_exit(&ill->ill_lock);
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);

		/* the reported address must be a multicast address */
		v6group_ptr = &mldh->mld_addr;
		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
			BUMP_MIB(ill->ill_icmp6_mib,
			    ipv6IfIcmpInGroupMembBadReports);
			freemsg(mp);
			return;
		}


		/*
		 * If we belong to the group being reported, and we are a
		 * 'Delaying member' per the RFC terminology, stop our timer
		 * for that group and 'clear flag' i.e. mark ilm_state as
		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
		 * membership entries for the same group address (one per zone)
		 * so we need to walk the ill_ilm list.
		 */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
			    continue;
			BUMP_MIB(ill->ill_icmp6_mib,
			    ipv6IfIcmpInGroupMembOurReports);

			/* another member reported, so we need not */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_OTHERMEMBER;
		}
		mutex_exit(&ill->ill_lock);
		break;
	}
	case MLD_LISTENER_REDUCTION:
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
		break;
	}
	/*
	 * All MLD packets have already been passed up to any
	 * process(es) listening on a ICMP6 raw socket. This
	 * has been accomplished in ip_deliver_local_v6 prior to
	 * this function call. It is assumed that the multicast daemon
	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumably use the
	 * ICMP6_FILTER socket option to only receive the MLD messages)
	 * Thus we can free the MLD message block here
	 */
	freemsg(mp);
}
2320 
2321 /*
2322  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2323  * (non-zero, unsigned) timer value to be set on success.
2324  */
2325 static uint_t
2326 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2327 {
2328 	ilm_t	*ilm;
2329 	int	timer;
2330 	uint_t	next;
2331 	in6_addr_t *v6group;
2332 
2333 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2334 
2335 	/*
2336 	 * In the MLD specification, there are 3 states and a flag.
2337 	 *
2338 	 * In Non-Listener state, we simply don't have a membership record.
2339 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2340 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2341 	 * INFINITY)
2342 	 *
2343 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2344 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2345 	 * if I sent the last report.
2346 	 */
2347 	v6group = &mldh->mld_addr;
2348 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2349 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2350 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2351 		return (0);
2352 	}
2353 
2354 	/* Need to do compatibility mode checking */
2355 	mutex_enter(&ill->ill_lock);
2356 	ill->ill_mcast_v1_time = 0;
2357 	ill->ill_mcast_v1_tset = 1;
2358 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2359 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2360 		    "MLD_V1_ROUTER\n", ill->ill_name));
2361 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2362 		ill->ill_mcast_type = MLD_V1_ROUTER;
2363 	}
2364 	mutex_exit(&ill->ill_lock);
2365 
2366 	timer = (int)ntohs(mldh->mld_maxdelay);
2367 	if (ip_debug > 1) {
2368 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2369 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2370 		    timer, (int)mldh->mld_type);
2371 	}
2372 
2373 	/*
2374 	 * -Start the timers in all of our membership records for
2375 	 * the physical interface on which the query arrived,
2376 	 * excl:
2377 	 *	1.  those that belong to the "all hosts" group,
2378 	 *	2.  those with 0 scope, or 1 node-local scope.
2379 	 *
2380 	 * -Restart any timer that is already running but has a value
2381 	 * longer that the requested timeout.
2382 	 * -Use the value specified in the query message as the
2383 	 * maximum timeout.
2384 	 */
2385 	next = INFINITY;
2386 	mutex_enter(&ill->ill_lock);
2387 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2388 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2389 
2390 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2391 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2392 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2393 			continue;
2394 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2395 		    &ipv6_all_hosts_mcast)) &&
2396 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2397 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2398 			if (timer == 0) {
2399 				/* Respond immediately */
2400 				ilm->ilm_timer = INFINITY;
2401 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2402 				mutex_exit(&ill->ill_lock);
2403 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2404 				mutex_enter(&ill->ill_lock);
2405 				break;
2406 			}
2407 			if (ilm->ilm_timer > timer) {
2408 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2409 				if (ilm->ilm_timer < next)
2410 					next = ilm->ilm_timer;
2411 			}
2412 			break;
2413 		}
2414 	}
2415 	mutex_exit(&ill->ill_lock);
2416 
2417 	return (next);
2418 }
2419 
2420 /*
2421  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2422  * returns the appropriate (non-zero, unsigned) timer value (which may
2423  * be INFINITY) to be set.
2424  */
static uint_t
mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
{
	ilm_t	*ilm;
	in6_addr_t *v6group, *src_array;
	uint_t	next, numsrc, i, mrd, delay, qqi;
	uint8_t	qrv;

	v6group = &mld2q->mld2q_addr;
	numsrc = ntohs(mld2q->mld2q_numsrc);

	/* make sure numsrc matches packet size */
	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		return (0);
	}
	/* the source addresses immediately follow the fixed query header */
	src_array = (in6_addr_t *)&mld2q[1];

	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);

	/* extract Maximum Response Delay from code in header */
	mrd = ntohs(mld2q->mld2q_mxrc);
	if (mrd >= MLD_V2_MAXRT_FPMIN) {
		/* value at/above the threshold is floating-point encoded */
		uint_t hdrval, mant, exp;
		hdrval = mrd;
		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
		mrd = (mant | 0x1000) << (exp + 3);
	}
	MCAST_RANDOM_DELAY(delay, mrd);
	next = (unsigned)INFINITY;

	/* Querier's Robustness Variable: 0 means use the default */
	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/*
	 * Querier's Query Interval Code, floating-point encoded above the
	 * threshold.
	 * NOTE(review): mld2q_qqic is an 8-bit field; if
	 * MLD_V2_QQI_EXP_MASK is 0x70 (RFC 3810 sec. 5.1.9), the exponent
	 * shift here should be 4, not 12 -- as written exp would always be
	 * 0.  Confirm the mask definition before changing.
	 */
	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
		uint_t mant, exp;
		mant = qqi & MLD_V2_QQI_MANT_MASK;
		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (MLDv2 draft section 6.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending on query type: general,
	 * group specific, or group/source specific.
	 */
	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time])
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			/* skip non-reportable and non-matching memberships */
			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
				continue;

			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
group_query:
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all. Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				pktl->sl_numsrc = numsrc;
				for (i = 0; i < numsrc; i++)
					pktl->sl_addr[i] = src_array[i];
				/* merge query sources into pending list */
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* set timer to soonest value */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
			break;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
2557 
2558 /*
2559  * Send MLDv1 response packet with hoplimit 1
2560  */
static void
mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
{
	mblk_t		*mp;
	mld_hdr_t	*mldh;
	ip6_t 		*ip6h;
	ip6_hbh_t	*ip6hbh;
	struct ip6_opt_router	*ip6router;
	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
	ipif_t		*ipif;
	ip6i_t		*ip6i;

	/*
	 * We need to place a router alert option in this packet.  The length
	 * of the options must be a multiple of 8.  The hbh option header is 2
	 * bytes followed by the 4 byte router alert option.  That leaves
	 * 2 bytes of pad for a total of 8 bytes.
	 */
	const int	router_alert_length = 8;

	ASSERT(ill->ill_isv6);

	/*
	 * We need to make sure that this packet does not get load balanced.
	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
	 * ip_newroute_ipif_v6 knows how to handle such packets.
	 * If it gets load balanced, switches supporting MLD snooping
	 * (in the future) will send the packet that it receives for this
	 * multicast group to the interface that we are sending on. As we have
	 * joined the multicast group on this ill, by sending the packet out
	 * on this ill, we receive all the packets back on this ill.
	 */
	size += sizeof (ip6i_t) + router_alert_length;
	mp = allocb(size, BPRI_HI);
	if (mp == NULL)
		return;
	bzero(mp->b_rptr, size);
	mp->b_wptr = mp->b_rptr + size;

	/* ip6i_t carries per-packet instructions for ip_wput_v6 */
	ip6i = (ip6i_t *)mp->b_rptr;
	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6i->ip6i_nxt = IPPROTO_RAW;
	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;

	/* Layout: [ip6i | ip6h | hbh option (8 bytes incl. pad) | mldh] */
	ip6h = (ip6_t *)&ip6i[1];
	ip6hbh = (struct ip6_hbh *)&ip6h[1];
	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
	/*
	 * A zero is a pad option of length 1.  The bzero of the whole packet
	 * above will pad between ip6router and mld.
	 */
	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);

	mldh->mld_type = type;
	mldh->mld_addr = ilm->ilm_v6addr;

	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
	ip6router->ip6or_len = 2;
	ip6router->ip6or_value[0] = 0;
	ip6router->ip6or_value[1] = IP6_ALERT_MLD;

	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
	ip6hbh->ip6h_len = 0;

	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
	ip6h->ip6_hops = MLD_HOP_LIMIT;
	/* a NULL v6addr means "address the message to the group itself" */
	if (v6addr == NULL)
		ip6h->ip6_dst =  ilm->ilm_v6addr;
	else
		ip6h->ip6_dst = *v6addr;

	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
		ip6h->ip6_src = ipif->ipif_v6src_addr;
		ipif_refrele(ipif);
	} else {
		/* Otherwise, use IPv6 default address selection. */
		ip6h->ip6_src = ipv6_all_zeros;
	}

	/*
	 * Prepare for checksum by putting icmp length in the icmp
	 * checksum field. The checksum is calculated in ip_wput_v6.
	 */
	mldh->mld_cksum = htons(sizeof (*mldh));

	/*
	 * ip_wput will automatically loopback the multicast packet to
	 * the conn if multicast loopback is enabled.
	 * The MIB stats corresponding to this outgoing MLD packet
	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
	 * ->icmp_update_out_mib_v6 function call.
	 */
	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
}
2660 
2661 /*
2662  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2663  * report will contain one multicast address record for each element of
2664  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2665  * multiple reports are sent.  reclist is assumed to be made up of
2666  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2667  */
2668 static void
2669 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2670 {
2671 	mblk_t		*mp;
2672 	mld2r_t		*mld2r;
2673 	mld2mar_t	*mld2mar;
2674 	in6_addr_t	*srcarray;
2675 	ip6_t		*ip6h;
2676 	ip6_hbh_t	*ip6hbh;
2677 	ip6i_t		*ip6i;
2678 	struct ip6_opt_router	*ip6router;
2679 	size_t		size, optlen, padlen, icmpsize, rsize;
2680 	ipif_t		*ipif;
2681 	int		i, numrec, more_src_cnt;
2682 	mrec_t		*rp, *cur_reclist;
2683 	mrec_t		*next_reclist = reclist;
2684 	boolean_t	morepkts;
2685 
2686 	/* If there aren't any records, there's nothing to send */
2687 	if (reclist == NULL)
2688 		return;
2689 
2690 	ASSERT(ill->ill_isv6);
2691 
2692 	/*
2693 	 * Total option length (optlen + padlen) must be a multiple of
2694 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2695 	 * length will be 8.  Assert this in case anything ever changes.
2696 	 */
2697 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2698 	ASSERT(optlen <= 8);
2699 	padlen = 8 - optlen;
2700 nextpkt:
2701 	icmpsize = sizeof (mld2r_t);
2702 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2703 	morepkts = B_FALSE;
2704 	more_src_cnt = 0;
2705 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2706 	    rp = rp->mrec_next, numrec++) {
2707 		rsize = sizeof (mld2mar_t) +
2708 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2709 		if (size + rsize > ill->ill_max_frag) {
2710 			if (rp == cur_reclist) {
2711 				/*
2712 				 * If the first mrec we looked at is too big
2713 				 * to fit in a single packet (i.e the source
2714 				 * list is too big), we must either truncate
2715 				 * the list (if TO_EX or IS_EX), or send
2716 				 * multiple reports for the same group (all
2717 				 * other types).
2718 				 */
2719 				int srcspace, srcsperpkt;
2720 				srcspace = ill->ill_max_frag -
2721 				    (size + sizeof (mld2mar_t));
2722 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2723 				/*
2724 				 * Increment icmpsize and size, because we will
2725 				 * be sending a record for the mrec we're
2726 				 * looking at now.
2727 				 */
2728 				rsize = sizeof (mld2mar_t) +
2729 				    (srcsperpkt * sizeof (in6_addr_t));
2730 				icmpsize += rsize;
2731 				size += rsize;
2732 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2733 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2734 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2735 					if (rp->mrec_next == NULL) {
2736 						/* no more packets to send */
2737 						break;
2738 					} else {
2739 						/*
2740 						 * more packets, but we're
2741 						 * done with this mrec.
2742 						 */
2743 						next_reclist = rp->mrec_next;
2744 					}
2745 				} else {
2746 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2747 					    - srcsperpkt;
2748 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2749 					/*
2750 					 * We'll fix up this mrec (remove the
2751 					 * srcs we've already sent) before
2752 					 * returning to nextpkt above.
2753 					 */
2754 					next_reclist = rp;
2755 				}
2756 			} else {
2757 				next_reclist = rp;
2758 			}
2759 			morepkts = B_TRUE;
2760 			break;
2761 		}
2762 		icmpsize += rsize;
2763 		size += rsize;
2764 	}
2765 
2766 	/*
2767 	 * We need to make sure that this packet does not get load balanced.
2768 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2769 	 * ip_newroute_ipif_v6 know how to handle such packets.
2770 	 * If it gets load balanced, switches supporting MLD snooping
2771 	 * (in the future) will send the packet that it receives for this
2772 	 * multicast group to the interface that we are sending on. As we have
2773 	 * joined the multicast group on this ill, by sending the packet out
2774 	 * on this ill, we receive all the packets back on this ill.
2775 	 */
2776 	size += sizeof (ip6i_t);
2777 	mp = allocb(size, BPRI_HI);
2778 	if (mp == NULL)
2779 		goto free_reclist;
2780 	bzero(mp->b_rptr, size);
2781 	mp->b_wptr = mp->b_rptr + size;
2782 
2783 	ip6i = (ip6i_t *)mp->b_rptr;
2784 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2785 	ip6i->ip6i_nxt = IPPROTO_RAW;
2786 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2787 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2788 
2789 	ip6h = (ip6_t *)&(ip6i[1]);
2790 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2791 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2792 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2793 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2794 
2795 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2796 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2797 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2798 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2799 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2800 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2801 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2802 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2803 		ipif_refrele(ipif);
2804 	} else {
2805 		/* otherwise, use IPv6 default address selection. */
2806 		ip6h->ip6_src = ipv6_all_zeros;
2807 	}
2808 
2809 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2810 	/*
2811 	 * ip6h_len is the number of 8-byte words, not including the first
2812 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2813 	 */
2814 	ip6hbh->ip6h_len = 0;
2815 
2816 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2817 	ip6router->ip6or_len = 2;
2818 	ip6router->ip6or_value[0] = 0;
2819 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2820 
2821 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2822 	mld2r->mld2r_nummar = htons(numrec);
2823 	/*
2824 	 * Prepare for the checksum by putting icmp length in the icmp
2825 	 * checksum field. The checksum is calculated in ip_wput_v6.
2826 	 */
2827 	mld2r->mld2r_cksum = htons(icmpsize);
2828 
2829 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2830 		mld2mar->mld2mar_type = rp->mrec_type;
2831 		mld2mar->mld2mar_auxlen = 0;
2832 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2833 		mld2mar->mld2mar_group = rp->mrec_group;
2834 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2835 
2836 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2837 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2838 
2839 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2840 	}
2841 
2842 	/*
2843 	 * ip_wput will automatically loopback the multicast packet to
2844 	 * the conn if multicast loopback is enabled.
2845 	 * The MIB stats corresponding to this outgoing MLD packet
2846 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2847 	 * ->icmp_update_out_mib_v6 function call.
2848 	 */
2849 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2850 
2851 	if (morepkts) {
2852 		if (more_src_cnt > 0) {
2853 			int index, mvsize;
2854 			slist_t *sl = &next_reclist->mrec_srcs;
2855 			index = sl->sl_numsrc;
2856 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2857 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2858 			    mvsize);
2859 			sl->sl_numsrc = more_src_cnt;
2860 		}
2861 		goto nextpkt;
2862 	}
2863 
2864 free_reclist:
2865 	while (reclist != NULL) {
2866 		rp = reclist->mrec_next;
2867 		mi_free(reclist);
2868 		reclist = rp;
2869 	}
2870 }
2871 
2872 static mrec_t *
2873 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2874     mrec_t *next)
2875 {
2876 	mrec_t *rp;
2877 	int i;
2878 
2879 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2880 	    SLIST_IS_EMPTY(srclist))
2881 		return (next);
2882 
2883 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2884 	if (rp == NULL)
2885 		return (next);
2886 
2887 	rp->mrec_next = next;
2888 	rp->mrec_type = type;
2889 	rp->mrec_auxlen = 0;
2890 	rp->mrec_group = *grp;
2891 	if (srclist == NULL) {
2892 		rp->mrec_srcs.sl_numsrc = 0;
2893 	} else {
2894 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2895 		for (i = 0; i < srclist->sl_numsrc; i++)
2896 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2897 	}
2898 
2899 	return (rp);
2900 }
2901 
2902 /*
2903  * Set up initial retransmit state.  If memory cannot be allocated for
2904  * the source lists, simply create as much state as is possible; memory
2905  * allocation failures are considered one type of transient error that
2906  * the retransmissions are designed to overcome (and if they aren't
2907  * transient, there are bigger problems than failing to notify the
2908  * router about multicast group membership state changes).
2909  */
2910 static void
2911 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2912     slist_t *flist)
2913 {
2914 	/*
2915 	 * There are only three possibilities for rtype:
2916 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2917 	 *	  => rtype is ALLOW_NEW_SOURCES
2918 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2919 	 *	  => rtype is CHANGE_TO_EXCLUDE
2920 	 *	State change that involves a filter mode change
2921 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2922 	 */
2923 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2924 	    rtype == ALLOW_NEW_SOURCES);
2925 
2926 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2927 
2928 	switch (rtype) {
2929 	case CHANGE_TO_EXCLUDE:
2930 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2931 		CLEAR_SLIST(rtxp->rtx_allow);
2932 		COPY_SLIST(flist, rtxp->rtx_block);
2933 		break;
2934 	case ALLOW_NEW_SOURCES:
2935 	case CHANGE_TO_INCLUDE:
2936 		rtxp->rtx_fmode_cnt =
2937 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2938 		CLEAR_SLIST(rtxp->rtx_block);
2939 		COPY_SLIST(flist, rtxp->rtx_allow);
2940 		break;
2941 	}
2942 }
2943 
2944 /*
2945  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2946  * RFC 3376 section 5.1, covers three cases:
2947  *	* The current state change is a filter mode change
2948  *		Set filter mode retransmit counter; set retransmit allow or
2949  *		block list to new source list as appropriate, and clear the
2950  *		retransmit list that was not set; send TO_IN or TO_EX with
2951  *		new source list.
2952  *	* The current state change is a source list change, but the filter
2953  *	  mode retransmit counter is > 0
2954  *		Decrement filter mode retransmit counter; set retransmit
2955  *		allow or block list to  new source list as appropriate,
2956  *		and clear the retransmit list that was not set; send TO_IN
2957  *		or TO_EX with new source list.
2958  *	* The current state change is a source list change, and the filter
2959  *	  mode retransmit counter is 0.
2960  *		Merge existing rtx allow and block lists with new state:
2961  *		  rtx_allow = (new allow + rtx_allow) - new block
2962  *		  rtx_block = (new block + rtx_block) - new allow
2963  *		Send ALLOW and BLOCK records for new retransmit lists;
2964  *		decrement retransmit counter.
2965  *
2966  * As is the case for mcast_init_rtx(), memory allocation failures are
2967  * acceptable; we just create as much state as we can.
2968  */
2969 static mrec_t *
2970 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2971 {
2972 	ill_t *ill;
2973 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2974 	mcast_record_t txtype;
2975 	mrec_t *rp, *rpnext, *rtnmrec;
2976 	boolean_t ovf;
2977 
2978 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
2979 
2980 	if (mreclist == NULL)
2981 		return (mreclist);
2982 
2983 	/*
2984 	 * A filter mode change is indicated by a single mrec, which is
2985 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2986 	 * retransmit state as if this were an initial join.  There is
2987 	 * no change to the mrec list.
2988 	 */
2989 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2990 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2991 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2992 		    &mreclist->mrec_srcs);
2993 		return (mreclist);
2994 	}
2995 
2996 	/*
2997 	 * Only the source list has changed
2998 	 */
2999 	rtxp->rtx_cnt = ill->ill_mcast_rv;
3000 	if (rtxp->rtx_fmode_cnt > 0) {
3001 		/* but we're still sending filter mode change reports */
3002 		rtxp->rtx_fmode_cnt--;
3003 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3004 			CLEAR_SLIST(rtxp->rtx_block);
3005 			COPY_SLIST(flist, rtxp->rtx_allow);
3006 			txtype = CHANGE_TO_INCLUDE;
3007 		} else {
3008 			CLEAR_SLIST(rtxp->rtx_allow);
3009 			COPY_SLIST(flist, rtxp->rtx_block);
3010 			txtype = CHANGE_TO_EXCLUDE;
3011 		}
3012 		/* overwrite first mrec with new info */
3013 		mreclist->mrec_type = txtype;
3014 		l_copy(flist, &mreclist->mrec_srcs);
3015 		/* then free any remaining mrecs */
3016 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3017 			rpnext = rp->mrec_next;
3018 			mi_free(rp);
3019 		}
3020 		mreclist->mrec_next = NULL;
3021 		rtnmrec = mreclist;
3022 	} else {
3023 		mrec_t *allow_mrec, *block_mrec;
3024 		/*
3025 		 * Just send the source change reports; but we need to
3026 		 * recalculate the ALLOW and BLOCK lists based on previous
3027 		 * state and new changes.
3028 		 */
3029 		rtnmrec = mreclist;
3030 		allow_mrec = block_mrec = NULL;
3031 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3032 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3033 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3034 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3035 				allow_mrec = rp;
3036 			else
3037 				block_mrec = rp;
3038 		}
3039 		/*
3040 		 * Perform calculations:
3041 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3042 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3043 		 *
3044 		 * Each calc requires two steps, for example:
3045 		 *   rtx_allow = rtx_allow - mrec_block;
3046 		 *   new_allow = mrec_allow + rtx_allow;
3047 		 *
3048 		 * Store results in mrec lists, and then copy into rtx lists.
3049 		 * We do it in this order in case the rtx list hasn't been
3050 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3051 		 * Overflows are also okay.
3052 		 */
3053 		if (block_mrec != NULL) {
3054 			l_difference_in_a(rtxp->rtx_allow,
3055 			    &block_mrec->mrec_srcs);
3056 		}
3057 		if (allow_mrec != NULL) {
3058 			l_difference_in_a(rtxp->rtx_block,
3059 			    &allow_mrec->mrec_srcs);
3060 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3061 			    &ovf);
3062 		}
3063 		if (block_mrec != NULL) {
3064 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3065 			    &ovf);
3066 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3067 		} else {
3068 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3069 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3070 		}
3071 		if (allow_mrec != NULL) {
3072 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3073 		} else {
3074 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3075 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3076 		}
3077 	}
3078 
3079 	return (rtnmrec);
3080 }
3081