xref: /titanic_52/usr/src/uts/common/inet/ip/igmp.c (revision f808c858fa61e7769218966759510a8b1190dfcf)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Internet Group Management Protocol (IGMP) routines.
31  * Multicast Listener Discovery Protocol (MLD) routines.
32  *
33  * Written by Steve Deering, Stanford, May 1988.
34  * Modified by Rosen Sharma, Stanford, Aug 1994.
35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
36  *
37  * MULTICAST 3.5.1.1
38  */
39 
40 #include <sys/types.h>
41 #include <sys/stream.h>
42 #include <sys/stropts.h>
43 #include <sys/strlog.h>
44 #include <sys/strsun.h>
45 #include <sys/systm.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/atomic.h>
50 #include <sys/zone.h>
51 
52 #include <sys/param.h>
53 #include <sys/socket.h>
54 #include <inet/ipclassifier.h>
55 #include <net/if.h>
56 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 
62 #include <inet/common.h>
63 #include <inet/mi.h>
64 #include <inet/nd.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_if.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 
75 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
76 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
77 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
78 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
79 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
80 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
81 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
82 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
83 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
84 		    slist_t *srclist, mrec_t *next);
85 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
86 		    mcast_record_t rtype, slist_t *flist);
87 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
88 
89 /* Following protected by igmp_timer_lock */
90 static int 	igmp_time_to_next;	/* Time since last timeout */
91 static int 	igmp_timer_fired_last;
92 uint_t		igmp_deferred_next = INFINITY;
93 timeout_id_t	igmp_timeout_id = 0;
94 kmutex_t	igmp_timer_lock;
95 
96 /* Protected by igmp_slowtimeout_lock */
97 timeout_id_t	igmp_slowtimeout_id = 0;
98 kmutex_t	igmp_slowtimeout_lock;
99 
100 /* Following protected by mld_timer_lock */
101 static int 	mld_time_to_next;	/* Time since last timeout */
102 static int 	mld_timer_fired_last;
103 uint_t		mld_deferred_next = INFINITY;
104 timeout_id_t	mld_timeout_id = 0;
105 kmutex_t	mld_timer_lock;
106 
107 /* Protected by mld_slowtimeout_lock */
108 timeout_id_t	mld_slowtimeout_id = 0;
109 kmutex_t	mld_slowtimeout_lock;
110 
111 /*
112  * Macros used to do timer len conversions.  Timer values are always
113  * stored and passed to the timer functions as milliseconds; but the
114  * default values and values from the wire may not be.
115  *
116  * And yes, it's obscure, but decisecond is easier to abbreviate than
117  * "tenths of a second".
118  */
119 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
120 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
121 
/*
 * The first multicast join will trigger the igmp timers / mld timers
 * The unit for next is milliseconds.
 *
 * Schedules (or reschedules) the global IGMP timeout so that
 * igmp_timeout_handler fires no later than 'next' ms from now.
 * Concurrent callers are serialized via igmp_timer_setter_active;
 * a caller that loses the race merely folds its deadline into
 * igmp_time_to_next and returns, trusting the active setter.
 */
void
igmp_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by igmp_timer_lock */
	static  boolean_t igmp_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&igmp_timer_lock);

	if (igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		mutex_exit(&igmp_timer_lock);
		return;
	} else {
		igmp_timer_setter_active = B_TRUE;
	}
	if (igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		igmp_time_to_next = next;
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = igmp_timer_fired_last +
	    MSEC_TO_TICK(igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer already fires soon enough; keep it. */
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(): the handler may already be
	 * running and could block on igmp_timer_lock, so calling
	 * untimeout() with the lock held risks deadlock.
	 */
	mutex_exit(&igmp_timer_lock);
	ret = untimeout(igmp_timeout_id);
	mutex_enter(&igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* Handler ran to completion; it cleared igmp_timeout_id. */
		ASSERT(igmp_timeout_id == 0);
	} else {
		/* We cancelled the pending timeout ourselves. */
		ASSERT(igmp_timeout_id != 0);
		igmp_timeout_id = 0;
	}
	if (igmp_time_to_next != 0) {
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
	}
	igmp_timer_setter_active = B_FALSE;
	mutex_exit(&igmp_timer_lock);
}
204 
/*
 * mld_start_timers:
 * The unit for next is milliseconds.
 *
 * IPv6 (MLD) analogue of igmp_start_timers(): ensure that
 * mld_timeout_handler fires no later than 'next' ms from now,
 * serializing concurrent setters via mld_timer_setter_active.
 */
void
mld_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by mld_timer_lock */
	static  boolean_t mld_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&mld_timer_lock);
	if (mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		mld_time_to_next = MIN(mld_time_to_next, next);
		mutex_exit(&mld_timer_lock);
		return;
	} else {
		mld_timer_setter_active = B_TRUE;
	}
	if (mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		mld_time_to_next = next;
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = mld_timer_fired_last +
	    MSEC_TO_TICK(mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer already fires soon enough; keep it. */
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(); the handler may be blocked
	 * on mld_timer_lock (see matching comment in igmp_start_timers).
	 */
	mutex_exit(&mld_timer_lock);
	ret = untimeout(mld_timeout_id);
	mutex_enter(&mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* Handler ran to completion; it cleared mld_timeout_id. */
		ASSERT(mld_timeout_id == 0);
	} else {
		/* We cancelled the pending timeout ourselves. */
		ASSERT(mld_timeout_id != 0);
		mld_timeout_id = 0;
	}
	if (mld_time_to_next != 0) {
		mld_time_to_next = MIN(mld_time_to_next, next);
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
	}
	mld_timer_setter_active = B_FALSE;
	mutex_exit(&mld_timer_lock);
}
286 
/*
 * igmp_input:
 * Return 0 if the message is OK and should be handed to "raw" receivers.
 * Callers of igmp_input() may need to reinitialize variables that were copied
 * from the mblk as this calls pullupmsg().
 *
 * Validates length and checksum, then dispatches on the IGMP type:
 * queries feed the report timers; v1/v2 reports from other hosts
 * suppress our own pending reports (IGMP_OTHERMEMBER).  Returns -1
 * (after freeing mp) on any validation failure.
 */
/* ARGSUSED */
int
igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	igmpa_t 	*igmpa;
	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
	int		iphlen, igmplen, mblklen;
	ilm_t 		*ilm;
	uint32_t	src, dst;
	uint32_t 	group;
	uint_t		next;
	ipif_t 		*ipif;

	ASSERT(ill != NULL);
	ASSERT(!ill->ill_isv6);
	++igmpstat.igps_rcv_total;

	mblklen = MBLKL(mp);
	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
		++igmpstat.igps_rcv_tooshort;
		freemsg(mp);
		return (-1);
	}
	igmplen = ntohs(ipha->ipha_length) - iphlen;
	/*
	 * Since msg sizes are more variable with v3, just pullup the
	 * whole thing now.
	 */
	if (MBLKL(mp) < (igmplen + iphlen)) {
		mblk_t *mp1;
		if ((mp1 = msgpullup(mp, -1)) == NULL) {
			++igmpstat.igps_rcv_tooshort;
			freemsg(mp);
			return (-1);
		}
		freemsg(mp);
		mp = mp1;
		/* pullup replaced the data block; re-derive the header ptr */
		ipha = (ipha_t *)(mp->b_rptr);
	}

	/*
	 * Validate lengths
	 */
	if (igmplen < IGMP_MINLEN) {
		++igmpstat.igps_rcv_tooshort;
		freemsg(mp);
		return (-1);
	}
	/*
	 * Validate checksum
	 */
	if (IP_CSUM(mp, iphlen, 0)) {
		++igmpstat.igps_rcv_badsum;
		freemsg(mp);
		return (-1);
	}

	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	if (ip_debug > 1)
		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
		    (int)ntohl(src), (int)ntohl(dst),
		    ill->ill_name);

	switch (igmpa->igmpa_type) {
	case IGMP_MEMBERSHIP_QUERY:
		/*
		 * packet length differentiates between v1/v2 and v3
		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
		 */
		if (igmplen == IGMP_MINLEN) {
			next = igmp_query_in(ipha, igmpa, ill);
		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
			    igmplen);
		} else {
			++igmpstat.igps_rcv_tooshort;
			freemsg(mp);
			return (-1);
		}
		if (next == 0) {
			/* query was judged invalid by the handler */
			freemsg(mp);
			return (-1);
		}

		if (next != INFINITY)
			igmp_start_timers(next);

		break;

	case IGMP_V1_MEMBERSHIP_REPORT:
	case IGMP_V2_MEMBERSHIP_REPORT:
		/*
		 * For fast leave to work, we have to know that we are the
		 * last person to send a report for this group. Reports
		 * generated by us are looped back since we could potentially
		 * be a multicast router, so discard reports sourced by me.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_lcl_addr == src) {
				if (ip_debug > 1) {
					(void) mi_strlog(ill->ill_rq,
					    1,
					    SL_TRACE,
					    "igmp_input: we are only "
					    "member src 0x%x ipif_local 0x%x",
					    (int)ntohl(src),
					    (int)
					    ntohl(ipif->ipif_lcl_addr));
				}
				mutex_exit(&ill->ill_lock);
				return (0);
			}
		}
		mutex_exit(&ill->ill_lock);

		++igmpstat.igps_rcv_reports;
		group = igmpa->igmpa_group;
		if (!CLASSD(group)) {
			/* reported group is not a multicast address */
			++igmpstat.igps_rcv_badreports;
			freemsg(mp);
			return (-1);
		}

		/*
		 * KLUDGE: if the IP source address of the report has an
		 * unspecified (i.e., zero) subnet number, as is allowed for
		 * a booting host, replace it with the correct subnet number
		 * so that a process-level multicast routing demon can
		 * determine which subnet it arrived from.  This is necessary
		 * to compensate for the lack of any way for a process to
		 * determine the arrival interface of an incoming packet.
		 *
		 * Requires that a copy of *this* message is passed up
		 * to the raw interface which is done by our caller.
		 */
		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
			/* Pick the first ipif on this ill */
			mutex_enter(&ill->ill_lock);
			src = ill->ill_ipif->ipif_subnet;
			mutex_exit(&ill->ill_lock);
			ip1dbg(("igmp_input: changed src to 0x%x\n",
			    (int)ntohl(src)));
			ipha->ipha_src = src;
		}

		/*
		 * If we belong to the group being reported, and
		 * we are a 'Delaying member' in the RFC terminology,
		 * stop our timer for that group and 'clear flag' i.e.
		 * mark as IGMP_OTHERMEMBER. Do this for all logical
		 * interfaces on the given physical interface.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			ilm = ilm_lookup_ipif(ipif, group);
			if (ilm != NULL) {
				++igmpstat.igps_rcv_ourreports;
				ilm->ilm_timer = INFINITY;
				ilm->ilm_state = IGMP_OTHERMEMBER;
			}
		} /* for */
		mutex_exit(&ill->ill_lock);
		break;

	case IGMP_V3_MEMBERSHIP_REPORT:
		/*
		 * Currently nothing to do here; IGMP router is not
		 * implemented in ip, and v3 hosts don't pay attention
		 * to membership reports.
		 */
		break;
	}
	/*
	 * Pass all valid IGMP packets up to any process(es) listening
	 * on a raw IGMP socket. Do not free the packet.
	 */
	return (0);
}
477 
478 static uint_t
479 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
480 {
481 	ilm_t	*ilm;
482 	int	timer;
483 	uint_t	next;
484 
485 	++igmpstat.igps_rcv_queries;
486 
487 	/*
488 	 * In the IGMPv2 specification, there are 3 states and a flag.
489 	 *
490 	 * In Non-Member state, we simply don't have a membership record.
491 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
492 	 * < INFINITY).  In Idle Member state, our timer is not running
493 	 * (ilm->ilm_timer == INFINITY).
494 	 *
495 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
496 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
497 	 * if I sent the last report.
498 	 */
499 	if (igmpa->igmpa_code == 0) {
500 		/*
501 		 * Query from an old router.
502 		 * Remember that the querier on this interface is old,
503 		 * and set the timer to the value in RFC 1112.
504 		 */
505 
506 
507 		mutex_enter(&ill->ill_lock);
508 		ill->ill_mcast_v1_time = 0;
509 		ill->ill_mcast_v1_tset = 1;
510 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
511 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
512 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
513 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
514 			ill->ill_mcast_type = IGMP_V1_ROUTER;
515 		}
516 		mutex_exit(&ill->ill_lock);
517 
518 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
519 
520 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
521 		    igmpa->igmpa_group != 0) {
522 			++igmpstat.igps_rcv_badqueries;
523 			return (0);
524 		}
525 
526 	} else {
527 		in_addr_t group;
528 
529 		/*
530 		 * Query from a new router
531 		 * Simply do a validity check
532 		 */
533 		group = igmpa->igmpa_group;
534 		if (group != 0 && (!CLASSD(group))) {
535 			++igmpstat.igps_rcv_badqueries;
536 			return (0);
537 		}
538 
539 		/*
540 		 * Switch interface state to v2 on receipt of a v2 query
541 		 * ONLY IF current state is v3.  Let things be if current
542 		 * state if v1 but do reset the v2-querier-present timer.
543 		 */
544 		mutex_enter(&ill->ill_lock);
545 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
546 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
547 			    "to IGMP_V2_ROUTER", ill->ill_name));
548 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
549 			ill->ill_mcast_type = IGMP_V2_ROUTER;
550 		}
551 		ill->ill_mcast_v2_time = 0;
552 		ill->ill_mcast_v2_tset = 1;
553 		mutex_exit(&ill->ill_lock);
554 
555 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
556 	}
557 
558 	if (ip_debug > 1) {
559 		mutex_enter(&ill->ill_lock);
560 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
561 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
562 		    (int)ntohs(igmpa->igmpa_code),
563 		    (int)ntohs(igmpa->igmpa_type));
564 		mutex_exit(&ill->ill_lock);
565 	}
566 
567 	/*
568 	 * -Start the timers in all of our membership records
569 	 *  for the physical interface on which the query
570 	 *  arrived, excluding those that belong to the "all
571 	 *  hosts" group (224.0.0.1).
572 	 *
573 	 * -Restart any timer that is already running but has
574 	 *  a value longer than the requested timeout.
575 	 *
576 	 * -Use the value specified in the query message as
577 	 *  the maximum timeout.
578 	 */
579 	next = (unsigned)INFINITY;
580 	mutex_enter(&ill->ill_lock);
581 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
582 
583 		/*
584 		 * A multicast router joins INADDR_ANY address
585 		 * to enable promiscuous reception of all
586 		 * mcasts from the interface. This INADDR_ANY
587 		 * is stored in the ilm_v6addr as V6 unspec addr
588 		 */
589 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
590 			continue;
591 		if (ilm->ilm_addr == htonl(INADDR_ANY))
592 			continue;
593 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
594 		    (igmpa->igmpa_group == 0) ||
595 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
596 			if (ilm->ilm_timer > timer) {
597 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
598 				if (ilm->ilm_timer < next)
599 					next = ilm->ilm_timer;
600 			}
601 		}
602 	}
603 	mutex_exit(&ill->ill_lock);
604 
605 	return (next);
606 }
607 
/*
 * igmpv3_query_in:
 * Process an IGMPv3 membership query (RFC 3376 section 5.2).  Decodes the
 * fixed-point Max Resp Code and QQIC fields, updates the interface's
 * robustness/query-interval variables, and schedules either a general
 * response (ill_global_timer) or per-group responses with pending source
 * lists.  Returns the soonest scheduled timer in ms, INFINITY if nothing
 * was scheduled, or 0 if the packet is too short for its source count.
 */
static uint_t
igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
{
	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
	ilm_t		*ilm;
	ipaddr_t	*src_array;
	uint8_t		qrv;

	/* make sure numsrc matches packet size */
	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
		++igmpstat.igps_rcv_tooshort;
		return (0);
	}
	/* source addresses immediately follow the fixed query header */
	src_array = (ipaddr_t *)&igmp3qa[1];

	++igmpstat.igps_rcv_queries;

	/*
	 * Max Resp Code >= 128 is floating point: 1 exponent nibble,
	 * 4 mantissa bits, value = (mant | 0x10) << (exp + 3) dsec.
	 */
	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
		mrd = (mant | 0x10) << (exp + 3);
	}
	if (mrd == 0)
		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
	timer = DSEC_TO_MSEC(mrd);
	/* delay is a random value in [0, timer] */
	MCAST_RANDOM_DELAY(delay, timer);
	next = (unsigned)INFINITY;

	/*
	 * Adopt the querier's robustness variable (low 3 bits of sqrv).
	 * NOTE(review): ill_mcast_rv and ill_mcast_qi are updated here
	 * without ill_lock, unlike the timer fields below — presumably
	 * benign single-word stores, but verify against other readers.
	 */
	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/* QQIC uses the same fixed-point encoding, in seconds */
	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (RFC3376 section 5.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending upon query type:
	 * general, group specific, or group/source specific.
	 */
	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time]).
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
				continue;
			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
group_query:
				/* empty pendsrcs == group-specific response */
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all.  Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				pktl->sl_numsrc = numsrc;
				/* store sources as v4-mapped v6 addresses */
				for (i = 0; i < numsrc; i++)
					IN6_IPADDR_TO_V4MAPPED(src_array[i],
					    &(pktl->sl_addr[i]));
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* choose soonest timer */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
739 
/*
 * igmp_joingroup:
 * Send the initial unsolicited membership report for a newly joined IPv4
 * group (format chosen by the interface's querier-compatibility mode) and
 * arm the retransmission timer.  The all-hosts group (224.0.0.1) is never
 * reported.  Runs as the exclusive ipsq writer; ill_lock is dropped around
 * each wire send since the send paths must not be entered with it held.
 */
void
igmp_joingroup(ilm_t *ilm)
{
	ill_t	*ill;

	ill = ilm->ilm_ipif->ipif_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);

	mutex_enter(&ill->ill_lock);
	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
		/* all-hosts membership is never reported on the wire */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		ip1dbg(("Querier mode %d, sending report, group %x\n",
		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *   Old State	New State	Report
			 *
			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough.
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ilm->ilm_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v3 and older versions.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call igmp_start_timers from
		 * here. igmp_start_timers needs to call untimeout, and we
		 * can't hold the ipsq across untimeout since
		 * igmp_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit.
		 */
		mutex_enter(&igmp_timer_lock);
		igmp_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    igmp_deferred_next);
		mutex_exit(&igmp_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
		    "igmp_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
821 
/*
 * mld_joingroup:
 * IPv6 analogue of igmp_joingroup(): send the initial unsolicited MLD
 * listener report for a newly joined group (v1 or v2 format depending on
 * the interface's querier mode) and arm the retransmission timer.  The
 * all-hosts (ff02::1) membership is never reported.  Runs as the
 * exclusive ipsq writer; ill_lock is dropped around each wire send.
 */
void
mld_joingroup(ilm_t *ilm)
{
	ill_t	*ill;

	ill = ilm->ilm_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);

	mutex_enter(&ill->ill_lock);
	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
		/* all-hosts membership is never reported on the wire */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *	Old State   New State	Report
			 *
			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			mldv2_sendrpt(ill, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v2 and v1.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
		    ilm->ilm_rtx.rtx_cnt > 0);
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call mld_start_timers from
		 * here. mld_start_timers needs to call untimeout, and we
		 * can't hold the ipsq (i.e. the lock) across untimeout
		 * since mld_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit
		 */
		mutex_enter(&mld_timer_lock);
		mld_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    mld_deferred_next);
		mutex_exit(&mld_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
		    "mld_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
899 
900 void
901 igmp_leavegroup(ilm_t *ilm)
902 {
903 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
904 
905 	ASSERT(ilm->ilm_ill == NULL);
906 	ASSERT(!ill->ill_isv6);
907 
908 	mutex_enter(&ill->ill_lock);
909 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
910 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
911 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
912 		mutex_exit(&ill->ill_lock);
913 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
914 		    (htonl(INADDR_ALLRTRS_GROUP)));
915 		return;
916 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
917 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
918 		mrec_t *rp;
919 		/*
920 		 * The possible state changes we need to handle here:
921 		 *	Old State	New State	Report
922 		 *
923 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
924 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
925 		 *
926 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
927 		 */
928 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
929 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
930 			    ilm->ilm_filter, NULL);
931 		} else {
932 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
933 			    NULL, NULL);
934 		}
935 		mutex_exit(&ill->ill_lock);
936 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
937 		return;
938 	}
939 	mutex_exit(&ill->ill_lock);
940 }
941 
942 void
943 mld_leavegroup(ilm_t *ilm)
944 {
945 	ill_t *ill = ilm->ilm_ill;
946 
947 	ASSERT(ilm->ilm_ipif == NULL);
948 	ASSERT(ill->ill_isv6);
949 
950 	mutex_enter(&ill->ill_lock);
951 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
952 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
953 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
954 		mutex_exit(&ill->ill_lock);
955 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
956 		return;
957 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
958 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
959 		mrec_t *rp;
960 		/*
961 		 * The possible state changes we need to handle here:
962 		 *	Old State	New State	Report
963 		 *
964 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
965 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
966 		 *
967 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
968 		 */
969 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
970 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
971 			    ilm->ilm_filter, NULL);
972 		} else {
973 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
974 			    NULL, NULL);
975 		}
976 		mutex_exit(&ill->ill_lock);
977 		mldv2_sendrpt(ill, rp);
978 		return;
979 	}
980 	mutex_exit(&ill->ill_lock);
981 }
982 
983 void
984 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
985 {
986 	ill_t *ill;
987 	mrec_t *rp;
988 
989 	ASSERT(ilm != NULL);
990 
991 	/* state change reports should only be sent if the router is v3 */
992 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
993 		return;
994 
995 	if (ilm->ilm_ill == NULL) {
996 		ASSERT(ilm->ilm_ipif != NULL);
997 		ill = ilm->ilm_ipif->ipif_ill;
998 	} else {
999 		ill = ilm->ilm_ill;
1000 	}
1001 
1002 	mutex_enter(&ill->ill_lock);
1003 
1004 	/*
1005 	 * Compare existing(old) state with the new state and prepare
1006 	 * State Change Report, according to the rules in RFC 3376:
1007 	 *
1008 	 *	Old State	New State	State Change Report
1009 	 *
1010 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1011 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1012 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1013 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1014 	 */
1015 
1016 	if (ilm->ilm_fmode == fmode) {
1017 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1018 		slist_t *allow, *block;
1019 		if (((a_minus_b = l_alloc()) == NULL) ||
1020 		    ((b_minus_a = l_alloc()) == NULL)) {
1021 			l_free(a_minus_b);
1022 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1023 				goto send_to_ex;
1024 			else
1025 				goto send_to_in;
1026 		}
1027 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1028 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1029 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1030 			allow = b_minus_a;
1031 			block = a_minus_b;
1032 		} else {
1033 			allow = a_minus_b;
1034 			block = b_minus_a;
1035 		}
1036 		rp = NULL;
1037 		if (!SLIST_IS_EMPTY(allow))
1038 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1039 			    allow, rp);
1040 		if (!SLIST_IS_EMPTY(block))
1041 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1042 			    block, rp);
1043 		l_free(a_minus_b);
1044 		l_free(b_minus_a);
1045 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1046 send_to_ex:
1047 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1048 		    NULL);
1049 	} else {
1050 send_to_in:
1051 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1052 		    NULL);
1053 	}
1054 
1055 	/*
1056 	 * Need to set up retransmission state; merge the new info with the
1057 	 * current state (which may be null).  If the timer is not currently
1058 	 * running, start it (need to do a delayed start of the timer as
1059 	 * we're currently in the sq).
1060 	 */
1061 	rp = mcast_merge_rtx(ilm, rp, flist);
1062 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1063 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1064 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1065 		mutex_enter(&igmp_timer_lock);
1066 		igmp_deferred_next = MIN(igmp_deferred_next,
1067 		    ilm->ilm_rtx.rtx_timer);
1068 		mutex_exit(&igmp_timer_lock);
1069 	}
1070 
1071 	mutex_exit(&ill->ill_lock);
1072 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1073 }
1074 
/*
 * Send an MLDv2 State Change Report on behalf of ilm, reflecting a
 * change of its filter mode to fmode and/or its source list to flist.
 * Also merges the change into the ilm's retransmission state and arms
 * the retransmit timer if it is not already running.  No-op unless the
 * querier on the ill is MLDv2.
 */
void
mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
{
	ill_t *ill;
	mrec_t *rp = NULL;

	ASSERT(ilm != NULL);

	ill = ilm->ilm_ill;

	/* only need to send if we have an mldv2-capable router */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Compare existing (old) state with the new state passed in
	 * and send appropriate MLDv2 State Change Report.
	 *
	 *	Old State	New State	State Change Report
	 *
	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
	 */
	if (ilm->ilm_fmode == fmode) {
		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
		slist_t *allow, *block;
		if (((a_minus_b = l_alloc()) == NULL) ||
		    ((b_minus_a = l_alloc()) == NULL)) {
			/*
			 * Out of memory; fall back to a filter-mode-change
			 * report, which needs no scratch source lists.
			 */
			l_free(a_minus_b);
			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
				goto send_to_ex;
			else
				goto send_to_in;
		}
		l_difference(ilm->ilm_filter, flist, a_minus_b);
		l_difference(flist, ilm->ilm_filter, b_minus_a);
		/* which difference is ALLOW vs BLOCK depends on the mode */
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			allow = b_minus_a;
			block = a_minus_b;
		} else {
			allow = a_minus_b;
			block = b_minus_a;
		}
		if (!SLIST_IS_EMPTY(allow))
			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
			    allow, rp);
		if (!SLIST_IS_EMPTY(block))
			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
			    block, rp);
		l_free(a_minus_b);
		l_free(b_minus_a);
	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
send_to_ex:
		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	} else {
send_to_in:
		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	}

	/*
	 * Need to set up retransmission state; merge the new info with the
	 * current state (which may be null).  If the timer is not currently
	 * running, start it (need to do a deferred start of the timer as
	 * we're currently in the sq).
	 */
	rp = mcast_merge_rtx(ilm, rp, flist);
	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		mutex_enter(&mld_timer_lock);
		mld_deferred_next =
		    MIN(mld_deferred_next, ilm->ilm_rtx.rtx_timer);
		mutex_exit(&mld_timer_lock);
	}

	mutex_exit(&ill->ill_lock);
	mldv2_sendrpt(ill, rp);
}
1161 
/*
 * igmp_timeout_handler_per_ill:
 * Age all of this ill's IGMP timers by `elapsed' and send whatever
 * reports have come due, in order: the interface's global (general query
 * response) timer, each ilm's group timer, then each ilm's retransmit
 * timer.  Must be called as the exclusive (ipsq) writer on the ill;
 * ill_lock is dropped and re-taken around each actual transmit.
 * Returns the time remaining until the next pending event on this ill,
 * or INFINITY if no timers remain armed.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	uint_t	next = INFINITY;
	ilm_t	*ilm;
	ipif_t	*ipif;
	mrec_t	*rp = NULL;
	mrec_t	*rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			/* accumulate one group record per ilm on its ipif */
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			/* can't transmit while holding ill_lock */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		/* global timer not yet due; just age it */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			/* not yet due; age it and track the minimum */
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d elap %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		/* reply format depends on the querier's IGMP version */
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			rp = NULL;
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			/* retransmit not yet due; age it */
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmits pending; re-arm with jitter */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* retransmission complete; discard the state */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1373 
/*
 * igmp_timeout_handler:
 * Called when there are timeout events, every next * TIMEOUT_INTERVAL (tick).
 * Computes the time to the next event across all ills and, if there is
 * one, restarts the igmp timer via igmp_start_timers().
 *
 * As part of multicast join and leave igmp we may need to send out an
 * igmp request. The igmp related state variables in the ilm are protected
 * by ill_lock. A single global igmp timer is used to track igmp timeouts.
 * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
 * starts the igmp timer if needed. It serializes multiple threads trying to
 * simultaneously start the timer using the igmp_timer_setter_active flag.
 *
 * igmp_input() receives igmp queries and responds to the queries
 * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
 * Later the igmp timer fires, and the timeout handler igmp_timeout_handler()
 * performs the action exclusively after entering each ill's ipsq as writer.
 * The actual igmp timeout handler needs to run in the ipsq since it has to
 * access the ilm's and we don't want another exclusive operation like
 * say an IPMP failover to be simultaneously moving the ilms from one ill to
 * another.
 *
 * The igmp_slowtimo() function is called through another timer.
 * igmp_slowtimeout_lock protects the igmp_slowtimeout_id.
 */
1398 
/* ARGSUSED */
void
igmp_timeout_handler(void *arg)
{
	ill_t	*ill;
	int	elapsed;	/* Since last call */
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;

	/* snapshot and reset the accumulated elapsed time under the lock */
	mutex_enter(&igmp_timer_lock);
	ASSERT(igmp_timeout_id != 0);
	igmp_timer_fired_last = ddi_get_lbolt();
	elapsed = igmp_time_to_next;
	igmp_time_to_next = 0;
	mutex_exit(&igmp_timer_lock);

	rw_enter(&ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(!ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		rw_exit(&ill_g_lock);
		/* per-ill work must be done as the exclusive ipsq writer */
		success = ipsq_enter(ill, B_TRUE);
		if (success) {
			next = igmp_timeout_handler_per_ill(ill, elapsed);
			if (next < global_next)
				global_next = next;
			/*
			 * NOTE(review): the boolean args here differ from
			 * mld_timeout_handler()'s ipsq_exit() call --
			 * confirm the intended flag order for each.
			 */
			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
			    B_TRUE);
		}
		rw_enter(&ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ill_g_lock);

	/* mark the timeout as no longer pending before possibly re-arming */
	mutex_enter(&igmp_timer_lock);
	ASSERT(igmp_timeout_id != 0);
	igmp_timeout_id = 0;
	mutex_exit(&igmp_timer_lock);

	/* re-arm for the soonest event found on any ill */
	if (global_next != INFINITY)
		igmp_start_timers(global_next);
}
1450 
/*
 * mld_timeout_handler_per_ill:
 * Per-ill worker for mld_timeout_handler(); ages the ill's MLD timers by
 * `elapsed' and sends any reports that have come due.
 * Returns the time to the next event on this ill (or INFINITY if none).
 */
1456 /* ARGSUSED */
/*
 * Age all of this ill's MLD timers by `elapsed' and send whatever reports
 * have come due, in order: the interface's global (general query response)
 * timer, each ilm's group timer, then each ilm's retransmit timer.  Must
 * be called as the exclusive (ipsq) writer on the ill; ill_lock is dropped
 * and re-taken around each actual transmit.  For MLDv2, per-group and
 * retransmit records are accumulated across the loop and sent at the end.
 * Returns the time remaining until the next pending event on this ill,
 * or INFINITY if no timers remain armed.
 */
uint_t
mld_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/* can't transmit while holding ill_lock */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		/* global timer not yet due; just age it */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			/* not yet due; age it and track the minimum */
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d elap %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			/* retransmit not yet due; age it */
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmits pending; re-arm with jitter */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* retransmission complete; discard the state */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	/*
	 * The accumulated v2 records (rp/rtxrp) are only sent if the
	 * querier is MLDv2; mldv2_sendrpt() handles NULL record lists.
	 */
	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1644 
/*
 * mld_timeout_handler:
 * Called when there are timeout events, every next * TIMEOUT_INTERVAL (tick).
 * Computes the time to the next event across all ills and, if there is
 * one, restarts the mld timer via mld_start_timers().
 * MT issues are the same as for igmp_timeout_handler.
 */
1651 /* ARGSUSED */
1652 void
1653 mld_timeout_handler(void *arg)
1654 {
1655 	ill_t	*ill;
1656 	int	elapsed;	/* Since last call */
1657 	uint_t  global_next = INFINITY;
1658 	uint_t  next;
1659 	ill_walk_context_t ctx;
1660 	boolean_t success;
1661 
1662 	mutex_enter(&mld_timer_lock);
1663 	ASSERT(mld_timeout_id != 0);
1664 	mld_timer_fired_last = ddi_get_lbolt();
1665 	elapsed = mld_time_to_next;
1666 	mld_time_to_next = 0;
1667 	mutex_exit(&mld_timer_lock);
1668 
1669 	rw_enter(&ill_g_lock, RW_READER);
1670 	ill = ILL_START_WALK_V6(&ctx);
1671 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1672 		ASSERT(ill->ill_isv6);
1673 		/*
1674 		 * We may not be able to refhold the ill if the ill/ipif
1675 		 * is changing. But we need to make sure that the ill will
1676 		 * not vanish. So we just bump up the ill_waiter count.
1677 		 */
1678 		if (!ill_waiter_inc(ill))
1679 			continue;
1680 		rw_exit(&ill_g_lock);
1681 		success = ipsq_enter(ill, B_TRUE);
1682 		if (success) {
1683 			next = mld_timeout_handler_per_ill(ill, elapsed);
1684 			if (next < global_next)
1685 				global_next = next;
1686 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
1687 			    B_FALSE);
1688 		}
1689 		rw_enter(&ill_g_lock, RW_READER);
1690 		ill_waiter_dcr(ill);
1691 	}
1692 	rw_exit(&ill_g_lock);
1693 
1694 	mutex_enter(&mld_timer_lock);
1695 	ASSERT(mld_timeout_id != 0);
1696 	mld_timeout_id = 0;
1697 	mutex_exit(&mld_timer_lock);
1698 
1699 	if (global_next != INFINITY)
1700 		mld_start_timers(global_next);
1701 }
1702 
/*
 * Calculate the Older Version Querier Present timeout value, in number
 * of slowtimo intervals, for the given ill.
 * NOTE(review): ill_mcast_rv and ill_mcast_qi appear to be the robustness
 * variable and query interval in seconds (hence the 1000x scaling to ms
 * before dividing by MCAST_SLOWTIMO_INTERVAL) -- confirm against ill.h.
 */
#define	OVQP(ill) \
	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1710 
/*
 * igmp_slowtimo:
 * - Reverts to a newer IGMP version if we didn't hear from the
 *   older-version router within the Older Version Querier Present timeout.
 * - Restarts the slow timeout.
 */
/* ARGSUSED */
void
igmp_slowtimo(void *arg)
{
	ill_t	*ill;
	ill_if_t *ifp;
	avl_tree_t *avl_tree;

	/* Hold the ill_g_lock so that we can safely walk the ill list */
	rw_enter(&ill_g_lock, RW_READER);

	/*
	 * The ill_if_t list is circular, hence the odd loop parameters.
	 *
	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
	 * structure (allowing us to skip if none of the instances have timers
	 * running).
	 */
	for (ifp = IP_V4_ILL_G_LIST; ifp != (ill_if_t *)&IP_V4_ILL_G_LIST;
	    ifp = ifp->illif_next) {
		/*
		 * illif_mcast_v[12] are set using atomics. If an ill hears
		 * a V1 or V2 query now and we miss seeing the count now,
		 * we will see it the next time igmp_slowtimo is called.
		 */
		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
			continue;

		avl_tree = &ifp->illif_avl_by_ppa;
		for (ill = avl_first(avl_tree); ill != NULL;
		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
			mutex_enter(&ill->ill_lock);
			/*
			 * Age the older-version-querier timers; the _tset
			 * flags presumably mark which timers are running --
			 * TODO confirm against the ill structure definition.
			 */
			if (ill->ill_mcast_v1_tset == 1)
				ill->ill_mcast_v1_time++;
			if (ill->ill_mcast_v2_tset == 1)
				ill->ill_mcast_v2_time++;
			if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
					/*
					 * V1 querier is gone; fall back to
					 * V2 if a V2 querier is still being
					 * tracked, otherwise all the way to
					 * V3.
					 */
					if (ill->ill_mcast_v2_tset > 0) {
						ip1dbg(("V1 query timer "
						    "expired on %s; switching "
						    "mode to IGMP_V2\n",
						    ill->ill_name));
						ill->ill_mcast_type =
						    IGMP_V2_ROUTER;
					} else {
						ip1dbg(("V1 query timer "
						    "expired on %s; switching "
						    "mode to IGMP_V3\n",
						    ill->ill_name));
						ill->ill_mcast_type =
						    IGMP_V3_ROUTER;
					}
					ill->ill_mcast_v1_time = 0;
					ill->ill_mcast_v1_tset = 0;
					atomic_add_16(&ifp->illif_mcast_v1, -1);
				}
			}
			if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
				if (ill->ill_mcast_v2_time >= OVQP(ill)) {
					ip1dbg(("V2 query timer expired on "
					    "%s; switching mode to IGMP_V3\n",
					    ill->ill_name));
					ill->ill_mcast_type = IGMP_V3_ROUTER;
					ill->ill_mcast_v2_time = 0;
					ill->ill_mcast_v2_tset = 0;
					atomic_add_16(&ifp->illif_mcast_v2, -1);
				}
			}
			mutex_exit(&ill->ill_lock);
		}

	}
	rw_exit(&ill_g_lock);
	/* re-arm ourselves for the next slowtimo interval */
	mutex_enter(&igmp_slowtimeout_lock);
	igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
	mutex_exit(&igmp_slowtimeout_lock);
}
1797 
1798 /*
1799  * mld_slowtimo:
1800  * - Resets to newer version if we didn't hear from the older version router
1801  *   in MLD_AGE_THRESHOLD seconds.
1802  * - Restarts slowtimeout.
1803  */
/* ARGSUSED */
void
mld_slowtimo(void *arg)
{
	ill_t *ill;
	ill_if_t *ifp;
	avl_tree_t *avl_tree;

	/* See comments in igmp_slowtimo() above... */
	rw_enter(&ill_g_lock, RW_READER);
	for (ifp = IP_V6_ILL_G_LIST; ifp != (ill_if_t *)&IP_V6_ILL_G_LIST;
	    ifp = ifp->illif_next) {

		/* skip interface groups with no MLDv1 queriers tracked */
		if (ifp->illif_mcast_v1 == 0)
			continue;

		avl_tree = &ifp->illif_avl_by_ppa;
		for (ill = avl_first(avl_tree); ill != NULL;
		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
			mutex_enter(&ill->ill_lock);
			/* age the older-version-querier timer if running */
			if (ill->ill_mcast_v1_tset == 1)
				ill->ill_mcast_v1_time++;
			if (ill->ill_mcast_type == MLD_V1_ROUTER) {
				/* v1 querier gone; revert to MLDv2 */
				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
					ip1dbg(("MLD query timer expired on"
					    " %s; switching mode to MLD_V2\n",
					    ill->ill_name));
					ill->ill_mcast_type = MLD_V2_ROUTER;
					ill->ill_mcast_v1_time = 0;
					ill->ill_mcast_v1_tset = 0;
					atomic_add_16(&ifp->illif_mcast_v1, -1);
				}
			}
			mutex_exit(&ill->ill_lock);
		}
	}
	rw_exit(&ill_g_lock);
	/* re-arm ourselves for the next slowtimo interval */
	mutex_enter(&mld_slowtimeout_lock);
	mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
	mutex_exit(&mld_slowtimeout_lock);
}
1846 
1847 /*
1848  * igmp_sendpkt:
1849  * This will send to ip_wput like icmp_inbound.
1850  * Note that the lower ill (on which the membership is kept) is used
1851  * as an upper ill to pass in the multicast parameters.
1852  */
/*
 * Build and transmit one IGMP message of `type' for the group described
 * by ilm.  The IP destination is `addr' if non-zero, otherwise the group
 * address itself.  The packet carries a Router Alert IP option, is looped
 * back so a local routing daemon can hear it, and is sent out the ilm's
 * ipif via ip_wput_multicast().
 */
static void
igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
{
	mblk_t	*mp;
	igmpa_t	*igmpa;
	uint8_t *rtralert;
	ipha_t	*ipha;
	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
	size_t	size  = hdrlen + sizeof (igmpa_t);
	ipif_t 	*ipif = ilm->ilm_ipif;
	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
	mblk_t	*first_mp;
	ipsec_out_t *io;
	zoneid_t zoneid;

	/*
	 * We need to make sure this packet goes out on an ipif. If
	 * there is some global policy match in ip_wput_ire, we need
	 * to get to the right interface after IPSEC processing.
	 * To make sure this multicast packet goes out on the right
	 * interface, we attach an ipsec_out and initialize ill_index
	 * like we did in ip_wput. To make sure that this packet does
	 * not get forwarded on other interfaces or looped back, we
	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
	 * to B_FALSE.
	 *
	 * We also need to make sure that this does not get load balanced
	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
	 * here. If it gets load balanced, switches supporting igmp snooping
	 * will send the packet that it receives for this multicast group
	 * to the interface that we are sending on. As we have joined the
	 * multicast group on this ill, by sending the packet out on this
	 * ill, we receive all the packets back on this ill.
	 */
	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
	if (first_mp == NULL)
		return;

	/* the M_CTL ipsec_out block precedes the actual IGMP packet */
	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	/* ipsec_out_secure is B_FALSE now */
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	io->ipsec_out_use_global_policy = B_TRUE;
	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_attach_if = B_TRUE;
	io->ipsec_out_multicast_loop = B_FALSE;
	io->ipsec_out_dontroute = B_TRUE;
	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;
	io->ipsec_out_zoneid = zoneid;

	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		freemsg(first_mp);
		return;
	}
	mp->b_wptr = mp->b_rptr + size;
	first_mp->b_cont = mp;

	/* layout: IP header, then router-alert option, then IGMP */
	ipha = (ipha_t *)mp->b_rptr;
	rtralert = (uint8_t *)&(ipha[1]);
	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
	igmpa->igmpa_type   = type;
	igmpa->igmpa_code   = 0;
	igmpa->igmpa_group  = ilm->ilm_addr;
	igmpa->igmpa_cksum  = 0;
	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
	/*
	 * NOTE(review): a computed checksum of 0 is sent as 0xffff (the
	 * other ones'-complement representation of zero) -- confirm this
	 * matches what receivers expect for IGMP.
	 */
	if (igmpa->igmpa_cksum == 0)
		igmpa->igmpa_cksum = 0xffff;

	/* Router Alert option: copy flag set, length 4, value 0 */
	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
	rtralert[1] = RTRALERT_LEN;
	rtralert[2] = 0;
	rtralert[3] = 0;

	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
	ipha->ipha_type_of_service 	= 0;
	ipha->ipha_length = htons(size);
	ipha->ipha_ident = 0;
	ipha->ipha_fragment_offset_and_flags = 0;
	ipha->ipha_ttl 		= IGMP_TTL;
	ipha->ipha_protocol 	= IPPROTO_IGMP;
	ipha->ipha_hdr_checksum 	= 0;
	/* addr == 0 means "send to the group address itself" */
	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
	ipha->ipha_src 		= ipif->ipif_src_addr;
	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing demon can hear it.
	 */
	/*
	 * This will run multiple times for the same group if there are members
	 * on the same group for multiple ipif's on the same ill. The
	 * igmp_input code will suppress this due to the loopback thus we
	 * always loopback membership report.
	 */
	ASSERT(ill->ill_rq != NULL);
	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);

	ip_wput_multicast(ill->ill_wq, first_mp, ipif);

	++igmpstat.igps_snd_reports;
}
1959 
1960 /*
1961  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1962  * with the passed-in ipif.  The report will contain one group record
1963  * for each element of reclist.  If this causes packet length to
1964  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1965  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1966  * and those buffers are freed here.
1967  */
1968 static void
1969 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1970 {
1971 	ipsec_out_t *io;
1972 	igmp3ra_t *igmp3ra;
1973 	grphdra_t *grphdr;
1974 	mblk_t *first_mp, *mp;
1975 	ipha_t *ipha;
1976 	uint8_t *rtralert;
1977 	ipaddr_t *src_array;
1978 	int i, j, numrec, more_src_cnt;
1979 	size_t hdrsize, size, rsize;
1980 	ill_t *ill = ipif->ipif_ill;
1981 	mrec_t *rp, *cur_reclist;
1982 	mrec_t *next_reclist = reclist;
1983 	boolean_t morepkts;
1984 	zoneid_t zoneid;
1985 
1986 	/* if there aren't any records, there's nothing to send */
1987 	if (reclist == NULL)
1988 		return;
1989 
1990 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1991 nextpkt:
1992 	size = hdrsize + sizeof (igmp3ra_t);
1993 	morepkts = B_FALSE;
1994 	more_src_cnt = 0;
1995 	cur_reclist = next_reclist;
1996 	numrec = 0;
1997 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1998 		rsize = sizeof (grphdra_t) +
1999 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
2000 		if (size + rsize > ill->ill_max_frag) {
2001 			if (rp == cur_reclist) {
2002 				/*
2003 				 * If the first mrec we looked at is too big
2004 				 * to fit in a single packet (i.e the source
2005 				 * list is too big), we must either truncate
2006 				 * the list (if TO_EX or IS_EX), or send
2007 				 * multiple reports for the same group (all
2008 				 * other types).
2009 				 */
2010 				int srcspace, srcsperpkt;
2011 				srcspace = ill->ill_max_frag - (size +
2012 				    sizeof (grphdra_t));
2013 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2014 				/*
2015 				 * Increment size and numrec, because we will
2016 				 * be sending a record for the mrec we're
2017 				 * looking at now.
2018 				 */
2019 				size += sizeof (grphdra_t) +
2020 				    (srcsperpkt * sizeof (ipaddr_t));
2021 				numrec++;
2022 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2023 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2024 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2025 					if (rp->mrec_next == NULL) {
2026 						/* no more packets to send */
2027 						break;
2028 					} else {
2029 						/*
2030 						 * more packets, but we're
2031 						 * done with this mrec.
2032 						 */
2033 						next_reclist = rp->mrec_next;
2034 					}
2035 				} else {
2036 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2037 					    - srcsperpkt;
2038 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2039 					/*
2040 					 * We'll fix up this mrec (remove the
2041 					 * srcs we've already sent) before
2042 					 * returning to nextpkt above.
2043 					 */
2044 					next_reclist = rp;
2045 				}
2046 			} else {
2047 				next_reclist = rp;
2048 			}
2049 			morepkts = B_TRUE;
2050 			break;
2051 		}
2052 		size += rsize;
2053 		numrec++;
2054 	}
2055 
2056 	/*
2057 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2058 	 * load balancing requirements.
2059 	 */
2060 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2061 	if (first_mp == NULL)
2062 		goto free_reclist;
2063 
2064 	first_mp->b_datap->db_type = M_CTL;
2065 	first_mp->b_wptr += sizeof (ipsec_info_t);
2066 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2067 	/* ipsec_out_secure is B_FALSE now */
2068 	io = (ipsec_out_t *)first_mp->b_rptr;
2069 	io->ipsec_out_type = IPSEC_OUT;
2070 	io->ipsec_out_len = sizeof (ipsec_out_t);
2071 	io->ipsec_out_use_global_policy = B_TRUE;
2072 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2073 	io->ipsec_out_attach_if = B_TRUE;
2074 	io->ipsec_out_multicast_loop = B_FALSE;
2075 	io->ipsec_out_dontroute = B_TRUE;
2076 	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
2077 		zoneid = GLOBAL_ZONEID;
2078 	io->ipsec_out_zoneid = zoneid;
2079 
2080 	mp = allocb(size, BPRI_HI);
2081 	if (mp == NULL) {
2082 		freemsg(first_mp);
2083 		goto free_reclist;
2084 	}
2085 	bzero((char *)mp->b_rptr, size);
2086 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2087 	first_mp->b_cont = mp;
2088 
2089 	ipha = (ipha_t *)mp->b_rptr;
2090 	rtralert = (uint8_t *)&(ipha[1]);
2091 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2092 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2093 
2094 	rp = cur_reclist;
2095 	for (i = 0; i < numrec; i++) {
2096 		grphdr->grphdra_type = rp->mrec_type;
2097 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2098 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2099 		src_array = (ipaddr_t *)&(grphdr[1]);
2100 
2101 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2102 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2103 
2104 		grphdr = (grphdra_t *)&(src_array[j]);
2105 		rp = rp->mrec_next;
2106 	}
2107 
2108 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2109 	igmp3ra->igmp3ra_numrec = htons(numrec);
2110 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2111 
2112 	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
2113 	rtralert[1] = RTRALERT_LEN;
2114 	rtralert[2] = 0;
2115 	rtralert[3] = 0;
2116 
2117 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2118 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2119 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2120 	ipha->ipha_length = htons(size);
2121 	ipha->ipha_ttl = IGMP_TTL;
2122 	ipha->ipha_protocol = IPPROTO_IGMP;
2123 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2124 	ipha->ipha_src = ipif->ipif_src_addr;
2125 
2126 	/*
2127 	 * Request loopback of the report if we are acting as a multicast
2128 	 * router, so that the process-level routing daemon can hear it.
2129 	 *
2130 	 * This will run multiple times for the same group if there are
2131 	 * members on the same group for multiple ipifs on the same ill.
2132 	 * The igmp_input code will suppress this due to the loopback;
2133 	 * thus we always loopback membership report.
2134 	 */
2135 	ASSERT(ill->ill_rq != NULL);
2136 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2137 
2138 	ip_wput_multicast(ill->ill_wq, first_mp, ipif);
2139 
2140 	++igmpstat.igps_snd_reports;
2141 
2142 	if (morepkts) {
2143 		if (more_src_cnt > 0) {
2144 			int index, mvsize;
2145 			slist_t *sl = &next_reclist->mrec_srcs;
2146 			index = sl->sl_numsrc;
2147 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2148 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2149 			    mvsize);
2150 			sl->sl_numsrc = more_src_cnt;
2151 		}
2152 		goto nextpkt;
2153 	}
2154 
2155 free_reclist:
2156 	while (reclist != NULL) {
2157 		rp = reclist->mrec_next;
2158 		mi_free(reclist);
2159 		reclist = rp;
2160 	}
2161 }
2162 
2163 /*
2164  * mld_input:
2165  */
2166 /* ARGSUSED */
2167 void
2168 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2169 {
2170 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2171 	mld_hdr_t	*mldh;
2172 	ilm_t		*ilm;
2173 	ipif_t		*ipif;
2174 	uint16_t	hdr_length, exthdr_length;
2175 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2176 	uint_t		next;
2177 	int		mldlen;
2178 
2179 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2180 
2181 	/* Make sure the src address of the packet is link-local */
2182 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2183 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2184 		freemsg(mp);
2185 		return;
2186 	}
2187 
2188 	if (ip6h->ip6_hlim != 1) {
2189 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2190 		freemsg(mp);
2191 		return;
2192 	}
2193 
2194 	/* Get to the icmp header part */
2195 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2196 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2197 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2198 	} else {
2199 		hdr_length = IPV6_HDR_LEN;
2200 		exthdr_length = 0;
2201 	}
2202 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2203 
2204 	/* An MLD packet must at least be 24 octets to be valid */
2205 	if (mldlen < MLD_MINLEN) {
2206 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2207 		freemsg(mp);
2208 		return;
2209 	}
2210 
2211 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2212 
2213 	switch (mldh->mld_type) {
2214 	case MLD_LISTENER_QUERY:
2215 		/*
2216 		 * packet length differentiates between v1 and v2.  v1
2217 		 * query should be exactly 24 octets long; v2 is >= 28.
2218 		 */
2219 		if (mldlen == MLD_MINLEN) {
2220 			next = mld_query_in(mldh, ill);
2221 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2222 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2223 		} else {
2224 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2225 			freemsg(mp);
2226 			return;
2227 		}
2228 		if (next == 0) {
2229 			freemsg(mp);
2230 			return;
2231 		}
2232 
2233 		if (next != INFINITY)
2234 			mld_start_timers(next);
2235 		break;
2236 
2237 	case MLD_LISTENER_REPORT: {
2238 
2239 		ASSERT(ill->ill_ipif != NULL);
2240 		/*
2241 		 * For fast leave to work, we have to know that we are the
2242 		 * last person to send a report for this group.  Reports
2243 		 * generated by us are looped back since we could potentially
2244 		 * be a multicast router, so discard reports sourced by me.
2245 		 */
2246 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2247 		mutex_enter(&ill->ill_lock);
2248 		for (ipif = ill->ill_ipif; ipif != NULL;
2249 		    ipif = ipif->ipif_next) {
2250 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2251 			    lcladdr_ptr)) {
2252 				if (ip_debug > 1) {
2253 					char    buf1[INET6_ADDRSTRLEN];
2254 					char	buf2[INET6_ADDRSTRLEN];
2255 
2256 					(void) mi_strlog(ill->ill_rq,
2257 					    1,
2258 					    SL_TRACE,
2259 					    "mld_input: we are only "
2260 					    "member src %s ipif_local %s",
2261 					    inet_ntop(AF_INET6, lcladdr_ptr,
2262 					    buf1, sizeof (buf1)),
2263 					    inet_ntop(AF_INET6,
2264 					    &ipif->ipif_v6lcl_addr,
2265 					    buf2, sizeof (buf2)));
2266 				}
2267 				mutex_exit(&ill->ill_lock);
2268 				freemsg(mp);
2269 				return;
2270 			}
2271 		}
2272 		mutex_exit(&ill->ill_lock);
2273 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2274 
2275 		v6group_ptr = &mldh->mld_addr;
2276 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2277 			BUMP_MIB(ill->ill_icmp6_mib,
2278 			    ipv6IfIcmpInGroupMembBadReports);
2279 			freemsg(mp);
2280 			return;
2281 		}
2282 
2283 
2284 		/*
2285 		 * If we belong to the group being reported, and we are a
2286 		 * 'Delaying member' per the RFC terminology, stop our timer
2287 		 * for that group and 'clear flag' i.e. mark ilm_state as
2288 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2289 		 * membership entries for the same group address (one per zone)
2290 		 * so we need to walk the ill_ilm list.
2291 		 */
2292 		mutex_enter(&ill->ill_lock);
2293 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2294 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2295 			    continue;
2296 			BUMP_MIB(ill->ill_icmp6_mib,
2297 			    ipv6IfIcmpInGroupMembOurReports);
2298 
2299 			ilm->ilm_timer = INFINITY;
2300 			ilm->ilm_state = IGMP_OTHERMEMBER;
2301 		}
2302 		mutex_exit(&ill->ill_lock);
2303 		break;
2304 	}
2305 	case MLD_LISTENER_REDUCTION:
2306 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2307 		break;
2308 	}
2309 	/*
2310 	 * All MLD packets have already been passed up to any
2311 	 * process(es) listening on a ICMP6 raw socket. This
2312 	 * has been accomplished in ip_deliver_local_v6 prior to
2313 	 * this function call. It is assumed that the multicast daemon
2314 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2315 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2316 	 * Thus we can free the MLD message block here
2317 	 */
2318 	freemsg(mp);
2319 }
2320 
2321 /*
2322  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2323  * (non-zero, unsigned) timer value to be set on success.
2324  */
2325 static uint_t
2326 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2327 {
2328 	ilm_t	*ilm;
2329 	int	timer;
2330 	uint_t	next;
2331 	in6_addr_t *v6group;
2332 
2333 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2334 
2335 	/*
2336 	 * In the MLD specification, there are 3 states and a flag.
2337 	 *
2338 	 * In Non-Listener state, we simply don't have a membership record.
2339 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2340 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2341 	 * INFINITY)
2342 	 *
2343 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2344 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2345 	 * if I sent the last report.
2346 	 */
2347 	v6group = &mldh->mld_addr;
2348 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2349 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2350 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2351 		return (0);
2352 	}
2353 
2354 	/* Need to do compatibility mode checking */
2355 	mutex_enter(&ill->ill_lock);
2356 	ill->ill_mcast_v1_time = 0;
2357 	ill->ill_mcast_v1_tset = 1;
2358 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2359 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2360 		    "MLD_V1_ROUTER\n", ill->ill_name));
2361 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2362 		ill->ill_mcast_type = MLD_V1_ROUTER;
2363 	}
2364 	mutex_exit(&ill->ill_lock);
2365 
2366 	timer = (int)ntohs(mldh->mld_maxdelay);
2367 	if (ip_debug > 1) {
2368 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2369 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2370 		    timer, (int)mldh->mld_type);
2371 	}
2372 
2373 	/*
2374 	 * -Start the timers in all of our membership records for
2375 	 * the physical interface on which the query arrived,
2376 	 * excl:
2377 	 *	1.  those that belong to the "all hosts" group,
2378 	 *	2.  those with 0 scope, or 1 node-local scope.
2379 	 *
2380 	 * -Restart any timer that is already running but has a value
2381 	 * longer that the requested timeout.
2382 	 * -Use the value specified in the query message as the
2383 	 * maximum timeout.
2384 	 */
2385 	next = INFINITY;
2386 	mutex_enter(&ill->ill_lock);
2387 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2388 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2389 
2390 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2391 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2392 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2393 			continue;
2394 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2395 		    &ipv6_all_hosts_mcast)) &&
2396 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2397 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2398 			if (timer == 0) {
2399 				/* Respond immediately */
2400 				ilm->ilm_timer = INFINITY;
2401 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2402 				mutex_exit(&ill->ill_lock);
2403 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2404 				mutex_enter(&ill->ill_lock);
2405 				break;
2406 			}
2407 			if (ilm->ilm_timer > timer) {
2408 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2409 				if (ilm->ilm_timer < next)
2410 					next = ilm->ilm_timer;
2411 			}
2412 			break;
2413 		}
2414 	}
2415 	mutex_exit(&ill->ill_lock);
2416 
2417 	return (next);
2418 }
2419 
2420 /*
2421  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2422  * returns the appropriate (non-zero, unsigned) timer value (which may
2423  * be INFINITY) to be set.
2424  */
2425 static uint_t
2426 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2427 {
2428 	ilm_t	*ilm;
2429 	in6_addr_t *v6group, *src_array;
2430 	uint_t	next, numsrc, i, mrd, delay, qqi;
2431 	uint8_t	qrv;
2432 
2433 	v6group = &mld2q->mld2q_addr;
2434 	numsrc = ntohs(mld2q->mld2q_numsrc);
2435 
2436 	/* make sure numsrc matches packet size */
2437 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2438 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2439 		return (0);
2440 	}
2441 	src_array = (in6_addr_t *)&mld2q[1];
2442 
2443 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2444 
2445 	/* extract Maximum Response Delay from code in header */
2446 	mrd = ntohs(mld2q->mld2q_mxrc);
2447 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2448 		uint_t hdrval, mant, exp;
2449 		hdrval = mrd;
2450 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2451 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2452 		mrd = (mant | 0x1000) << (exp + 3);
2453 	}
2454 	MCAST_RANDOM_DELAY(delay, mrd);
2455 	next = (unsigned)INFINITY;
2456 
2457 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2458 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2459 	else
2460 		ill->ill_mcast_rv = qrv;
2461 
2462 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2463 		uint_t mant, exp;
2464 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2465 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2466 		qqi = (mant | 0x10) << (exp + 3);
2467 	}
2468 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2469 
2470 	/*
2471 	 * If we have a pending general query response that's scheduled
2472 	 * sooner than the delay we calculated for this response, then
2473 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2474 	 */
2475 	mutex_enter(&ill->ill_lock);
2476 	if (ill->ill_global_timer < delay) {
2477 		mutex_exit(&ill->ill_lock);
2478 		return (next);
2479 	}
2480 	mutex_exit(&ill->ill_lock);
2481 
2482 	/*
2483 	 * Now take action depending on query type: general,
2484 	 * group specific, or group/source specific.
2485 	 */
2486 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2487 		/*
2488 		 * general query
2489 		 * We know global timer is either not running or is
2490 		 * greater than our calculated delay, so reset it to
2491 		 * our delay (random value in range [0, response time])
2492 		 */
2493 		mutex_enter(&ill->ill_lock);
2494 		ill->ill_global_timer = delay;
2495 		next = ill->ill_global_timer;
2496 		mutex_exit(&ill->ill_lock);
2497 
2498 	} else {
2499 		/* group or group/source specific query */
2500 		mutex_enter(&ill->ill_lock);
2501 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2502 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2503 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2504 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2505 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2506 				continue;
2507 
2508 			/*
2509 			 * If the query is group specific or we have a
2510 			 * pending group specific query, the response is
2511 			 * group specific (pending sources list should be
2512 			 * empty).  Otherwise, need to update the pending
2513 			 * sources list for the group and source specific
2514 			 * response.
2515 			 */
2516 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2517 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2518 group_query:
2519 				FREE_SLIST(ilm->ilm_pendsrcs);
2520 				ilm->ilm_pendsrcs = NULL;
2521 			} else {
2522 				boolean_t overflow;
2523 				slist_t *pktl;
2524 				if (numsrc > MAX_FILTER_SIZE ||
2525 				    (ilm->ilm_pendsrcs == NULL &&
2526 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2527 					/*
2528 					 * We've been sent more sources than
2529 					 * we can deal with; or we can't deal
2530 					 * with a source list at all. Revert
2531 					 * to a group specific query.
2532 					 */
2533 					goto group_query;
2534 				}
2535 				if ((pktl = l_alloc()) == NULL)
2536 					goto group_query;
2537 				pktl->sl_numsrc = numsrc;
2538 				for (i = 0; i < numsrc; i++)
2539 					pktl->sl_addr[i] = src_array[i];
2540 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2541 				    &overflow);
2542 				l_free(pktl);
2543 				if (overflow)
2544 					goto group_query;
2545 			}
2546 			/* set timer to soonest value */
2547 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2548 			if (ilm->ilm_timer < next)
2549 				next = ilm->ilm_timer;
2550 			break;
2551 		}
2552 		mutex_exit(&ill->ill_lock);
2553 	}
2554 
2555 	return (next);
2556 }
2557 
2558 /*
2559  * Send MLDv1 response packet with hoplimit 1
2560  */
2561 static void
2562 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2563 {
2564 	mblk_t		*mp;
2565 	mld_hdr_t	*mldh;
2566 	ip6_t 		*ip6h;
2567 	ip6_hbh_t	*ip6hbh;
2568 	struct ip6_opt_router	*ip6router;
2569 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2570 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2571 	ipif_t		*ipif;
2572 	ip6i_t		*ip6i;
2573 
2574 	/*
2575 	 * We need to place a router alert option in this packet.  The length
2576 	 * of the options must be a multiple of 8.  The hbh option header is 2
2577 	 * bytes followed by the 4 byte router alert option.  That leaves
2578 	 * 2 bytes of pad for a total of 8 bytes.
2579 	 */
2580 	const int	router_alert_length = 8;
2581 
2582 	ASSERT(ill->ill_isv6);
2583 
2584 	/*
2585 	 * We need to make sure that this packet does not get load balanced.
2586 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2587 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2588 	 * If it gets load balanced, switches supporting MLD snooping
2589 	 * (in the future) will send the packet that it receives for this
2590 	 * multicast group to the interface that we are sending on. As we have
2591 	 * joined the multicast group on this ill, by sending the packet out
2592 	 * on this ill, we receive all the packets back on this ill.
2593 	 */
2594 	size += sizeof (ip6i_t) + router_alert_length;
2595 	mp = allocb(size, BPRI_HI);
2596 	if (mp == NULL)
2597 		return;
2598 	bzero(mp->b_rptr, size);
2599 	mp->b_wptr = mp->b_rptr + size;
2600 
2601 	ip6i = (ip6i_t *)mp->b_rptr;
2602 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2603 	ip6i->ip6i_nxt = IPPROTO_RAW;
2604 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2605 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2606 
2607 	ip6h = (ip6_t *)&ip6i[1];
2608 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2609 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2610 	/*
2611 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2612 	 * above will pad between ip6router and mld.
2613 	 */
2614 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2615 
2616 	mldh->mld_type = type;
2617 	mldh->mld_addr = ilm->ilm_v6addr;
2618 
2619 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2620 	ip6router->ip6or_len = 2;
2621 	ip6router->ip6or_value[0] = 0;
2622 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2623 
2624 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2625 	ip6hbh->ip6h_len = 0;
2626 
2627 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2628 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2629 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2630 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2631 	if (v6addr == NULL)
2632 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2633 	else
2634 		ip6h->ip6_dst = *v6addr;
2635 
2636 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2637 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2638 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2639 		ipif_refrele(ipif);
2640 	} else {
2641 		/* Otherwise, use IPv6 default address selection. */
2642 		ip6h->ip6_src = ipv6_all_zeros;
2643 	}
2644 
2645 	/*
2646 	 * Prepare for checksum by putting icmp length in the icmp
2647 	 * checksum field. The checksum is calculated in ip_wput_v6.
2648 	 */
2649 	mldh->mld_cksum = htons(sizeof (*mldh));
2650 
2651 	/*
2652 	 * ip_wput will automatically loopback the multicast packet to
2653 	 * the conn if multicast loopback is enabled.
2654 	 * The MIB stats corresponding to this outgoing MLD packet
2655 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2656 	 * ->icmp_update_out_mib_v6 function call.
2657 	 */
2658 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2659 }
2660 
2661 /*
2662  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2663  * report will contain one multicast address record for each element of
2664  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2665  * multiple reports are sent.  reclist is assumed to be made up of
2666  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2667  */
2668 static void
2669 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2670 {
2671 	mblk_t		*mp;
2672 	mld2r_t		*mld2r;
2673 	mld2mar_t	*mld2mar;
2674 	in6_addr_t	*srcarray;
2675 	ip6_t		*ip6h;
2676 	ip6_hbh_t	*ip6hbh;
2677 	ip6i_t		*ip6i;
2678 	struct ip6_opt_router	*ip6router;
2679 	size_t		size, optlen, padlen, icmpsize, rsize;
2680 	ipif_t		*ipif;
2681 	int		i, numrec, more_src_cnt;
2682 	mrec_t		*rp, *cur_reclist;
2683 	mrec_t		*next_reclist = reclist;
2684 	boolean_t	morepkts;
2685 
2686 	/* If there aren't any records, there's nothing to send */
2687 	if (reclist == NULL)
2688 		return;
2689 
2690 	ASSERT(ill->ill_isv6);
2691 
2692 	/*
2693 	 * Total option length (optlen + padlen) must be a multiple of
2694 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2695 	 * length will be 8.  Assert this in case anything ever changes.
2696 	 */
2697 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2698 	ASSERT(optlen <= 8);
2699 	padlen = 8 - optlen;
2700 nextpkt:
2701 	icmpsize = sizeof (mld2r_t);
2702 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2703 	morepkts = B_FALSE;
2704 	more_src_cnt = 0;
2705 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2706 	    rp = rp->mrec_next, numrec++) {
2707 		rsize = sizeof (mld2mar_t) +
2708 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2709 		if (size + rsize > ill->ill_max_frag) {
2710 			if (rp == cur_reclist) {
2711 				/*
2712 				 * If the first mrec we looked at is too big
2713 				 * to fit in a single packet (i.e the source
2714 				 * list is too big), we must either truncate
2715 				 * the list (if TO_EX or IS_EX), or send
2716 				 * multiple reports for the same group (all
2717 				 * other types).
2718 				 */
2719 				int srcspace, srcsperpkt;
2720 				srcspace = ill->ill_max_frag -
2721 				    (size + sizeof (mld2mar_t));
2722 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2723 				/*
2724 				 * Increment icmpsize and size, because we will
2725 				 * be sending a record for the mrec we're
2726 				 * looking at now.
2727 				 */
2728 				rsize = sizeof (mld2mar_t) +
2729 				    (srcsperpkt * sizeof (in6_addr_t));
2730 				icmpsize += rsize;
2731 				size += rsize;
2732 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2733 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2734 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2735 					if (rp->mrec_next == NULL) {
2736 						/* no more packets to send */
2737 						break;
2738 					} else {
2739 						/*
2740 						 * more packets, but we're
2741 						 * done with this mrec.
2742 						 */
2743 						next_reclist = rp->mrec_next;
2744 					}
2745 				} else {
2746 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2747 					    - srcsperpkt;
2748 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2749 					/*
2750 					 * We'll fix up this mrec (remove the
2751 					 * srcs we've already sent) before
2752 					 * returning to nextpkt above.
2753 					 */
2754 					next_reclist = rp;
2755 				}
2756 			} else {
2757 				next_reclist = rp;
2758 			}
2759 			morepkts = B_TRUE;
2760 			break;
2761 		}
2762 		icmpsize += rsize;
2763 		size += rsize;
2764 	}
2765 
2766 	/*
2767 	 * We need to make sure that this packet does not get load balanced.
2768 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2769 	 * ip_newroute_ipif_v6 know how to handle such packets.
2770 	 * If it gets load balanced, switches supporting MLD snooping
2771 	 * (in the future) will send the packet that it receives for this
2772 	 * multicast group to the interface that we are sending on. As we have
2773 	 * joined the multicast group on this ill, by sending the packet out
2774 	 * on this ill, we receive all the packets back on this ill.
2775 	 */
2776 	size += sizeof (ip6i_t);
2777 	mp = allocb(size, BPRI_HI);
2778 	if (mp == NULL)
2779 		goto free_reclist;
2780 	bzero(mp->b_rptr, size);
2781 	mp->b_wptr = mp->b_rptr + size;
2782 
2783 	ip6i = (ip6i_t *)mp->b_rptr;
2784 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2785 	ip6i->ip6i_nxt = IPPROTO_RAW;
2786 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2787 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2788 
2789 	ip6h = (ip6_t *)&(ip6i[1]);
2790 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2791 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2792 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2793 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2794 
2795 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2796 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2797 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2798 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2799 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2800 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2801 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2802 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2803 		ipif_refrele(ipif);
2804 	} else {
2805 		/* otherwise, use IPv6 default address selection. */
2806 		ip6h->ip6_src = ipv6_all_zeros;
2807 	}
2808 
2809 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2810 	/*
2811 	 * ip6h_len is the number of 8-byte words, not including the first
2812 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2813 	 */
2814 	ip6hbh->ip6h_len = 0;
2815 
2816 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2817 	ip6router->ip6or_len = 2;
2818 	ip6router->ip6or_value[0] = 0;
2819 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2820 
2821 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2822 	mld2r->mld2r_nummar = htons(numrec);
2823 	/*
2824 	 * Prepare for the checksum by putting icmp length in the icmp
2825 	 * checksum field. The checksum is calculated in ip_wput_v6.
2826 	 */
2827 	mld2r->mld2r_cksum = htons(icmpsize);
2828 
2829 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2830 		mld2mar->mld2mar_type = rp->mrec_type;
2831 		mld2mar->mld2mar_auxlen = 0;
2832 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2833 		mld2mar->mld2mar_group = rp->mrec_group;
2834 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2835 
2836 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2837 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2838 
2839 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2840 	}
2841 
2842 	/*
2843 	 * ip_wput will automatically loopback the multicast packet to
2844 	 * the conn if multicast loopback is enabled.
2845 	 * The MIB stats corresponding to this outgoing MLD packet
2846 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2847 	 * ->icmp_update_out_mib_v6 function call.
2848 	 */
2849 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2850 
2851 	if (morepkts) {
2852 		if (more_src_cnt > 0) {
2853 			int index, mvsize;
2854 			slist_t *sl = &next_reclist->mrec_srcs;
2855 			index = sl->sl_numsrc;
2856 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2857 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2858 			    mvsize);
2859 			sl->sl_numsrc = more_src_cnt;
2860 		}
2861 		goto nextpkt;
2862 	}
2863 
2864 free_reclist:
2865 	while (reclist != NULL) {
2866 		rp = reclist->mrec_next;
2867 		mi_free(reclist);
2868 		reclist = rp;
2869 	}
2870 }
2871 
2872 static mrec_t *
2873 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2874     mrec_t *next)
2875 {
2876 	mrec_t *rp;
2877 	int i;
2878 
2879 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2880 	    SLIST_IS_EMPTY(srclist))
2881 		return (next);
2882 
2883 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2884 	if (rp == NULL)
2885 		return (next);
2886 
2887 	rp->mrec_next = next;
2888 	rp->mrec_type = type;
2889 	rp->mrec_auxlen = 0;
2890 	rp->mrec_group = *grp;
2891 	if (srclist == NULL) {
2892 		rp->mrec_srcs.sl_numsrc = 0;
2893 	} else {
2894 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2895 		for (i = 0; i < srclist->sl_numsrc; i++)
2896 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2897 	}
2898 
2899 	return (rp);
2900 }
2901 
2902 /*
2903  * Set up initial retransmit state.  If memory cannot be allocated for
2904  * the source lists, simply create as much state as is possible; memory
2905  * allocation failures are considered one type of transient error that
2906  * the retransmissions are designed to overcome (and if they aren't
2907  * transient, there are bigger problems than failing to notify the
2908  * router about multicast group membership state changes).
2909  */
2910 static void
2911 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2912     slist_t *flist)
2913 {
2914 	/*
2915 	 * There are only three possibilities for rtype:
2916 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2917 	 *	  => rtype is ALLOW_NEW_SOURCES
2918 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2919 	 *	  => rtype is CHANGE_TO_EXCLUDE
2920 	 *	State change that involves a filter mode change
2921 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2922 	 */
2923 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2924 	    rtype == ALLOW_NEW_SOURCES);
2925 
2926 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2927 
2928 	switch (rtype) {
2929 	case CHANGE_TO_EXCLUDE:
2930 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2931 		CLEAR_SLIST(rtxp->rtx_allow);
2932 		COPY_SLIST(flist, rtxp->rtx_block);
2933 		break;
2934 	case ALLOW_NEW_SOURCES:
2935 	case CHANGE_TO_INCLUDE:
2936 		rtxp->rtx_fmode_cnt =
2937 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2938 		CLEAR_SLIST(rtxp->rtx_block);
2939 		COPY_SLIST(flist, rtxp->rtx_allow);
2940 		break;
2941 	}
2942 }
2943 
2944 /*
2945  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2946  * RFC 3376 section 5.1, covers three cases:
2947  *	* The current state change is a filter mode change
2948  *		Set filter mode retransmit counter; set retransmit allow or
2949  *		block list to new source list as appropriate, and clear the
2950  *		retransmit list that was not set; send TO_IN or TO_EX with
2951  *		new source list.
2952  *	* The current state change is a source list change, but the filter
2953  *	  mode retransmit counter is > 0
2954  *		Decrement filter mode retransmit counter; set retransmit
 *		allow or block list to new source list as appropriate,
2956  *		and clear the retransmit list that was not set; send TO_IN
2957  *		or TO_EX with new source list.
2958  *	* The current state change is a source list change, and the filter
2959  *	  mode retransmit counter is 0.
2960  *		Merge existing rtx allow and block lists with new state:
2961  *		  rtx_allow = (new allow + rtx_allow) - new block
2962  *		  rtx_block = (new block + rtx_block) - new allow
2963  *		Send ALLOW and BLOCK records for new retransmit lists;
2964  *		decrement retransmit counter.
2965  *
2966  * As is the case for mcast_init_rtx(), memory allocation failures are
2967  * acceptable; we just create as much state as we can.
2968  */
static mrec_t *
mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
{
	ill_t *ill;
	rtx_state_t *rtxp = &ilm->ilm_rtx;
	mcast_record_t txtype;
	mrec_t *rp, *rpnext, *rtnmrec;
	boolean_t ovf;	/* slist overflow flag; deliberately unchecked */

	/* the ilm hangs off either an ill directly or off an ipif */
	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);

	if (mreclist == NULL)
		return (mreclist);

	/*
	 * A filter mode change is indicated by a single mrec, which is
	 * either TO_IN or TO_EX.  In this case, we just need to set new
	 * retransmit state as if this were an initial join.  There is
	 * no change to the mrec list.
	 */
	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
		    &mreclist->mrec_srcs);
		return (mreclist);
	}

	/*
	 * Only the source list has changed; restart the full retransmit
	 * count for this round of reports.
	 */
	rtxp->rtx_cnt = ill->ill_mcast_rv;
	if (rtxp->rtx_fmode_cnt > 0) {
		/* but we're still sending filter mode change reports */
		rtxp->rtx_fmode_cnt--;
		/*
		 * Reset the rtx allow/block state from the new filter list,
		 * and resend a TO_IN/TO_EX matching the current filter mode
		 * rather than the individual source change records.
		 */
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			CLEAR_SLIST(rtxp->rtx_block);
			COPY_SLIST(flist, rtxp->rtx_allow);
			txtype = CHANGE_TO_INCLUDE;
		} else {
			CLEAR_SLIST(rtxp->rtx_allow);
			COPY_SLIST(flist, rtxp->rtx_block);
			txtype = CHANGE_TO_EXCLUDE;
		}
		/* overwrite first mrec with new info */
		mreclist->mrec_type = txtype;
		l_copy(flist, &mreclist->mrec_srcs);
		/* then free any remaining mrecs */
		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
			rpnext = rp->mrec_next;
			mi_free(rp);
		}
		mreclist->mrec_next = NULL;
		rtnmrec = mreclist;
	} else {
		mrec_t *allow_mrec, *block_mrec;
		/*
		 * Just send the source change reports; but we need to
		 * recalculate the ALLOW and BLOCK lists based on previous
		 * state and new changes.
		 */
		rtnmrec = mreclist;
		allow_mrec = block_mrec = NULL;
		/*
		 * Remember the ALLOW and BLOCK records in the list; if
		 * more than one of a type were present, the last would win.
		 */
		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
			    rp->mrec_type == BLOCK_OLD_SOURCES);
			if (rp->mrec_type == ALLOW_NEW_SOURCES)
				allow_mrec = rp;
			else
				block_mrec = rp;
		}
		/*
		 * Perform calculations:
		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
		 *   new_block = mrec_block + (rtx_block - mrec_allow)
		 *
		 * Each calc requires two steps, for example:
		 *   rtx_allow = rtx_allow - mrec_block;
		 *   new_allow = mrec_allow + rtx_allow;
		 *
		 * Store results in mrec lists, and then copy into rtx lists.
		 * We do it in this order in case the rtx list hasn't been
		 * alloc'd yet; if it hasn't and our alloc fails, that's
		 * okay.  Overflows are also okay.
		 */
		if (block_mrec != NULL) {
			l_difference_in_a(rtxp->rtx_allow,
			    &block_mrec->mrec_srcs);
		}
		if (allow_mrec != NULL) {
			l_difference_in_a(rtxp->rtx_block,
			    &allow_mrec->mrec_srcs);
			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
			    &ovf);
		}
		if (block_mrec != NULL) {
			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
			    &ovf);
			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
		} else {
			/*
			 * No BLOCK record among the new changes; build one
			 * from the remaining rtx block state (mcast_bldmrec
			 * is a no-op for an empty source list).
			 */
			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
		}
		if (allow_mrec != NULL) {
			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
		} else {
			/* likewise for a missing ALLOW record */
			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
		}
	}

	return (rtnmrec);
}
3081