xref: /titanic_44/usr/src/uts/common/inet/ip/igmp.c (revision 7c8de9202c10c8c49a901bff2e373864b545bd57)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /* Copyright (c) 1990 Mentat Inc. */
27 
28 #pragma ident	"%Z%%M%	%I%	%E% SMI"
29 
30 /*
31  * Internet Group Management Protocol (IGMP) routines.
32  * Multicast Listener Discovery Protocol (MLD) routines.
33  *
34  * Written by Steve Deering, Stanford, May 1988.
35  * Modified by Rosen Sharma, Stanford, Aug 1994.
36  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
37  *
38  * MULTICAST 3.5.1.1
39  */
40 
41 
42 #include <sys/types.h>
43 #include <sys/stream.h>
44 #include <sys/dlpi.h>
45 #include <sys/stropts.h>
46 #include <sys/strlog.h>
47 #include <sys/strsun.h>
48 #include <sys/systm.h>
49 #include <sys/ddi.h>
50 #include <sys/sunddi.h>
51 #include <sys/cmn_err.h>
52 #include <sys/atomic.h>
53 #include <sys/zone.h>
54 
55 #include <sys/param.h>
56 #include <sys/socket.h>
57 #define	_SUN_TPI_VERSION	2
58 #include <sys/tihdr.h>
59 #include <inet/ipclassifier.h>
60 #include <net/if.h>
61 #include <net/if_arp.h>
62 #include <sys/sockio.h>
63 #include <net/route.h>
64 #include <netinet/in.h>
65 #include <netinet/igmp_var.h>
66 #include <netinet/ip6.h>
67 #include <netinet/icmp6.h>
68 
69 #include <inet/common.h>
70 #include <inet/mi.h>
71 #include <inet/nd.h>
72 #include <inet/arp.h>
73 #include <inet/ip.h>
74 #include <inet/ip6.h>
75 #include <inet/ip_multi.h>
76 #include <inet/ip_listutils.h>
77 
78 #include <netinet/igmp.h>
79 #include <inet/ip_if.h>
80 #include <net/pfkeyv2.h>
81 #include <inet/ipsec_info.h>
82 
/*
 * Forward declarations for functions private to this file.
 *
 * The *_query_in() routines return a uint_t; for the two IGMP variants
 * defined below that value is 0 for a rejected query, otherwise the
 * soonest timer (in ms) that was armed, or INFINITY if none was.
 */
static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
		    slist_t *srclist, mrec_t *next);
static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
		    mcast_record_t rtype, slist_t *flist);
static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
96 
/*
 * Global state for the single IGMP timeout.
 * Following protected by igmp_timer_lock
 */
/* Delay, in ms, for which the current IGMP timeout was scheduled */
static int 	igmp_time_to_next;
/*
 * lbolt-based reference point used by igmp_start_timers() to work out how
 * much of the current timeout is left.
 * NOTE(review): presumably updated by the timeout handler - confirm there.
 */
static int 	igmp_timer_fired_last;
/*
 * Soonest timer (ms) requested by igmp_joingroup() while it could not call
 * igmp_start_timers() itself; INFINITY when nothing is deferred.
 */
uint_t		igmp_deferred_next = INFINITY;
/* Id of the pending timeout(); 0 means no IGMP timeout is scheduled */
timeout_id_t	igmp_timeout_id = 0;
kmutex_t	igmp_timer_lock;

/* Protected by igmp_slowtimeout_lock */
timeout_id_t	igmp_slowtimeout_id = 0;
kmutex_t	igmp_slowtimeout_lock;

/*
 * Global state for the single MLD timeout; mirrors the IGMP set above.
 * Following protected by mld_timer_lock
 */
/* Delay, in ms, for which the current MLD timeout was scheduled */
static int 	mld_time_to_next;
/*
 * lbolt-based reference point used by mld_start_timers() to work out how
 * much of the current timeout is left.
 * NOTE(review): presumably updated by the timeout handler - confirm there.
 */
static int 	mld_timer_fired_last;
/*
 * Soonest timer (ms) requested by mld_joingroup() while it could not call
 * mld_start_timers() itself; INFINITY when nothing is deferred.
 */
uint_t		mld_deferred_next = INFINITY;
/* Id of the pending timeout(); 0 means no MLD timeout is scheduled */
timeout_id_t	mld_timeout_id = 0;
kmutex_t	mld_timer_lock;

/* Protected by mld_slowtimeout_lock */
timeout_id_t	mld_slowtimeout_id = 0;
kmutex_t	mld_slowtimeout_lock;
118 
/*
 * Macros used to do timer length conversions.  Timer values are always
 * stored and passed to the timer functions as milliseconds; but the
 * default values and values from the wire may not be.
 *
 * And yes, it's obscure, but decisecond is easier to abbreviate than
 * "tenths of a second".
 *
 * Both macros are plain multiplications in the type of their argument;
 * no range check is performed.
 */
#define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
#define	SEC_TO_MSEC(sec)	((sec) * 1000)
129 
/*
 * igmp_start_timers:
 * The first multicast join will trigger the igmp timers / mld timers
 * The unit for next is milliseconds.
 *
 * Makes sure the single IGMP timeout is scheduled to run
 * igmp_timeout_handler within 'next' ms:
 *  - if another thread is already in the middle of setting the timer,
 *    only the requested delay is recorded and that thread finishes the job;
 *  - if no timeout is pending, one is started for 'next' ms;
 *  - if a timeout is pending and will fire soon enough, nothing changes;
 *  - otherwise the pending timeout is cancelled and rescheduled.
 *
 * 'next' must be neither 0 nor INFINITY.  igmp_timer_lock is dropped
 * around untimeout(), so callers must not hold it.
 */
void
igmp_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by igmp_timer_lock */
	static  boolean_t igmp_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&igmp_timer_lock);

	if (igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		mutex_exit(&igmp_timer_lock);
		return;
	} else {
		igmp_timer_setter_active = B_TRUE;
	}
	if (igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		igmp_time_to_next = next;
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	/* time_left is in ticks, not ms */
	time_left = igmp_timer_fired_last +
	    MSEC_TO_TICK(igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Pending timeout already fires early enough; keep it */
		igmp_timer_setter_active = B_FALSE;
		mutex_exit(&igmp_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(); igmp_timer_setter_active stays
	 * set, so concurrent callers only record their request above.
	 */
	mutex_exit(&igmp_timer_lock);
	ret = untimeout(igmp_timeout_id);
	mutex_enter(&igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/*
		 * Nothing was cancelled.
		 * NOTE(review): the assertion relies on the handler having
		 * cleared igmp_timeout_id - confirm in igmp_timeout_handler.
		 */
		ASSERT(igmp_timeout_id == 0);
	} else {
		ASSERT(igmp_timeout_id != 0);
		igmp_timeout_id = 0;
	}
	/*
	 * NOTE(review): a zero igmp_time_to_next is taken to mean that no
	 * timeout is wanted any more; it is never set to 0 in this function,
	 * so that presumably happens in the handler - confirm.
	 */
	if (igmp_time_to_next != 0) {
		igmp_time_to_next = MIN(igmp_time_to_next, next);
		igmp_timeout_id = timeout(igmp_timeout_handler, NULL,
		    MSEC_TO_TICK(igmp_time_to_next));
	}
	igmp_timer_setter_active = B_FALSE;
	mutex_exit(&igmp_timer_lock);
}
212 
/*
 * mld_start_timers:
 * The unit for next is milliseconds.
 *
 * MLD counterpart of igmp_start_timers(): makes sure the single MLD
 * timeout is scheduled to run mld_timeout_handler within 'next' ms:
 *  - if another thread is already in the middle of setting the timer,
 *    only the requested delay is recorded and that thread finishes the job;
 *  - if no timeout is pending, one is started for 'next' ms;
 *  - if a timeout is pending and will fire soon enough, nothing changes;
 *  - otherwise the pending timeout is cancelled and rescheduled.
 *
 * 'next' must be neither 0 nor INFINITY.  mld_timer_lock is dropped
 * around untimeout(), so callers must not hold it.
 */
void
mld_start_timers(unsigned next)
{
	int	time_left;
	/* Protected by mld_timer_lock */
	static  boolean_t mld_timer_setter_active;
	int	ret;

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&mld_timer_lock);
	if (mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		mld_time_to_next = MIN(mld_time_to_next, next);
		mutex_exit(&mld_timer_lock);
		return;
	} else {
		mld_timer_setter_active = B_TRUE;
	}
	if (mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		mld_time_to_next = next;
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	/* time_left is in ticks, not ms */
	time_left = mld_timer_fired_last +
	    MSEC_TO_TICK(mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Pending timeout already fires early enough; keep it */
		mld_timer_setter_active = B_FALSE;
		mutex_exit(&mld_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(); mld_timer_setter_active stays
	 * set, so concurrent callers only record their request above.
	 */
	mutex_exit(&mld_timer_lock);
	ret = untimeout(mld_timeout_id);
	mutex_enter(&mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/*
		 * Nothing was cancelled.
		 * NOTE(review): the assertion relies on the handler having
		 * cleared mld_timeout_id - confirm in mld_timeout_handler.
		 */
		ASSERT(mld_timeout_id == 0);
	} else {
		ASSERT(mld_timeout_id != 0);
		mld_timeout_id = 0;
	}
	/*
	 * NOTE(review): a zero mld_time_to_next is taken to mean that no
	 * timeout is wanted any more; it is never set to 0 in this function,
	 * so that presumably happens in the handler - confirm.
	 */
	if (mld_time_to_next != 0) {
		mld_time_to_next = MIN(mld_time_to_next, next);
		mld_timeout_id = timeout(mld_timeout_handler, NULL,
		    MSEC_TO_TICK(mld_time_to_next));
	}
	mld_timer_setter_active = B_FALSE;
	mutex_exit(&mld_timer_lock);
}
294 
295 /*
296  * igmp_input:
297  * Return 0 if the message is OK and should be handed to "raw" receivers.
298  * Callers of igmp_input() may need to reinitialize variables that were copied
299  * from the mblk as this calls pullupmsg().
300  */
301 /* ARGSUSED */
302 int
303 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
304 {
305 	igmpa_t 	*igmpa;
306 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
307 	int		iphlen, igmplen, mblklen;
308 	ilm_t 		*ilm;
309 	uint32_t	src, dst;
310 	uint32_t 	group;
311 	uint_t		next;
312 	ipif_t 		*ipif;
313 
314 	ASSERT(ill != NULL);
315 	ASSERT(!ill->ill_isv6);
316 	++igmpstat.igps_rcv_total;
317 
318 	mblklen = MBLKL(mp);
319 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
320 		++igmpstat.igps_rcv_tooshort;
321 		freemsg(mp);
322 		return (-1);
323 	}
324 	igmplen = ntohs(ipha->ipha_length) - iphlen;
325 	/*
326 	 * Since msg sizes are more variable with v3, just pullup the
327 	 * whole thing now.
328 	 */
329 	if (MBLKL(mp) < (igmplen + iphlen)) {
330 		mblk_t *mp1;
331 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
332 			++igmpstat.igps_rcv_tooshort;
333 			freemsg(mp);
334 			return (-1);
335 		}
336 		freemsg(mp);
337 		mp = mp1;
338 		ipha = (ipha_t *)(mp->b_rptr);
339 	}
340 
341 	/*
342 	 * Validate lengths
343 	 */
344 	if (igmplen < IGMP_MINLEN) {
345 		++igmpstat.igps_rcv_tooshort;
346 		freemsg(mp);
347 		return (-1);
348 	}
349 	/*
350 	 * Validate checksum
351 	 */
352 	if (IP_CSUM(mp, iphlen, 0)) {
353 		++igmpstat.igps_rcv_badsum;
354 		freemsg(mp);
355 		return (-1);
356 	}
357 
358 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
359 	src = ipha->ipha_src;
360 	dst = ipha->ipha_dst;
361 	if (ip_debug > 1)
362 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
363 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
364 		    (int)ntohl(src), (int)ntohl(dst),
365 		    ill->ill_name);
366 
367 	switch (igmpa->igmpa_type) {
368 	case IGMP_MEMBERSHIP_QUERY:
369 		/*
370 		 * packet length differentiates between v1/v2 and v3
371 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
372 		 */
373 		if (igmplen == IGMP_MINLEN) {
374 			next = igmp_query_in(ipha, igmpa, ill);
375 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
376 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
377 			    igmplen);
378 		} else {
379 			++igmpstat.igps_rcv_tooshort;
380 			freemsg(mp);
381 			return (-1);
382 		}
383 		if (next == 0) {
384 			freemsg(mp);
385 			return (-1);
386 		}
387 
388 		if (next != INFINITY)
389 			igmp_start_timers(next);
390 
391 		break;
392 
393 	case IGMP_V1_MEMBERSHIP_REPORT:
394 	case IGMP_V2_MEMBERSHIP_REPORT:
395 		/*
396 		 * For fast leave to work, we have to know that we are the
397 		 * last person to send a report for this group. Reports
398 		 * generated by us are looped back since we could potentially
399 		 * be a multicast router, so discard reports sourced by me.
400 		 */
401 		mutex_enter(&ill->ill_lock);
402 		for (ipif = ill->ill_ipif; ipif != NULL;
403 		    ipif = ipif->ipif_next) {
404 			if (ipif->ipif_lcl_addr == src) {
405 				if (ip_debug > 1) {
406 					(void) mi_strlog(ill->ill_rq,
407 					    1,
408 					    SL_TRACE,
409 					    "igmp_input: we are only "
410 					    "member src 0x%x ipif_local 0x%x",
411 					    (int)ntohl(src),
412 					    (int)
413 					    ntohl(ipif->ipif_lcl_addr));
414 				}
415 				mutex_exit(&ill->ill_lock);
416 				return (0);
417 			}
418 		}
419 		mutex_exit(&ill->ill_lock);
420 
421 		++igmpstat.igps_rcv_reports;
422 		group = igmpa->igmpa_group;
423 		if (!CLASSD(group)) {
424 			++igmpstat.igps_rcv_badreports;
425 			freemsg(mp);
426 			return (-1);
427 		}
428 
429 		/*
430 		 * KLUDGE: if the IP source address of the report has an
431 		 * unspecified (i.e., zero) subnet number, as is allowed for
432 		 * a booting host, replace it with the correct subnet number
433 		 * so that a process-level multicast routing demon can
434 		 * determine which subnet it arrived from.  This is necessary
435 		 * to compensate for the lack of any way for a process to
436 		 * determine the arrival interface of an incoming packet.
437 		 *
438 		 * Requires that a copy of *this* message it passed up
439 		 * to the raw interface which is done by our caller.
440 		 */
441 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
442 			/* Pick the first ipif on this ill */
443 			mutex_enter(&ill->ill_lock);
444 			src = ill->ill_ipif->ipif_subnet;
445 			mutex_exit(&ill->ill_lock);
446 			ip1dbg(("igmp_input: changed src to 0x%x\n",
447 			    (int)ntohl(src)));
448 			ipha->ipha_src = src;
449 		}
450 
451 		/*
452 		 * If we belong to the group being reported, and
453 		 * we are a 'Delaying member' in the RFC terminology,
454 		 * stop our timer for that group and 'clear flag' i.e.
455 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
456 		 * interfaces on the given physical interface.
457 		 */
458 		mutex_enter(&ill->ill_lock);
459 		for (ipif = ill->ill_ipif; ipif != NULL;
460 		    ipif = ipif->ipif_next) {
461 			ilm = ilm_lookup_ipif(ipif, group);
462 			if (ilm != NULL) {
463 				++igmpstat.igps_rcv_ourreports;
464 				ilm->ilm_timer = INFINITY;
465 				ilm->ilm_state = IGMP_OTHERMEMBER;
466 			}
467 		} /* for */
468 		mutex_exit(&ill->ill_lock);
469 		break;
470 
471 	case IGMP_V3_MEMBERSHIP_REPORT:
472 		/*
473 		 * Currently nothing to do here; IGMP router is not
474 		 * implemented in ip, and v3 hosts don't pay attention
475 		 * to membership reports.
476 		 */
477 		break;
478 	}
479 	/*
480 	 * Pass all valid IGMP packets up to any process(es) listening
481 	 * on a raw IGMP socket. Do not free the packet.
482 	 */
483 	return (0);
484 }
485 
486 static uint_t
487 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
488 {
489 	ilm_t	*ilm;
490 	int	timer;
491 	uint_t	next;
492 
493 	++igmpstat.igps_rcv_queries;
494 
495 	/*
496 	 * In the IGMPv2 specification, there are 3 states and a flag.
497 	 *
498 	 * In Non-Member state, we simply don't have a membership record.
499 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
500 	 * < INFINITY).  In Idle Member state, our timer is not running
501 	 * (ilm->ilm_timer == INFINITY).
502 	 *
503 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
504 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
505 	 * if I sent the last report.
506 	 */
507 	if (igmpa->igmpa_code == 0) {
508 		/*
509 		 * Query from an old router.
510 		 * Remember that the querier on this interface is old,
511 		 * and set the timer to the value in RFC 1112.
512 		 */
513 
514 
515 		mutex_enter(&ill->ill_lock);
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 		mutex_exit(&ill->ill_lock);
525 
526 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
527 
528 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
529 		    igmpa->igmpa_group != 0) {
530 			++igmpstat.igps_rcv_badqueries;
531 			return (0);
532 		}
533 
534 	} else {
535 		in_addr_t group;
536 
537 		/*
538 		 * Query from a new router
539 		 * Simply do a validity check
540 		 */
541 		group = igmpa->igmpa_group;
542 		if (group != 0 && (!CLASSD(group))) {
543 			++igmpstat.igps_rcv_badqueries;
544 			return (0);
545 		}
546 
547 		/*
548 		 * Switch interface state to v2 on receipt of a v2 query
549 		 * ONLY IF current state is v3.  Let things be if current
550 		 * state if v1 but do reset the v2-querier-present timer.
551 		 */
552 		mutex_enter(&ill->ill_lock);
553 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
554 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
555 			    "to IGMP_V2_ROUTER", ill->ill_name));
556 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
557 			ill->ill_mcast_type = IGMP_V2_ROUTER;
558 		}
559 		ill->ill_mcast_v2_time = 0;
560 		ill->ill_mcast_v2_tset = 1;
561 		mutex_exit(&ill->ill_lock);
562 
563 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
564 	}
565 
566 	if (ip_debug > 1) {
567 		mutex_enter(&ill->ill_lock);
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 		mutex_exit(&ill->ill_lock);
573 	}
574 
575 	/*
576 	 * -Start the timers in all of our membership records
577 	 *  for the physical interface on which the query
578 	 *  arrived, excluding those that belong to the "all
579 	 *  hosts" group (224.0.0.1).
580 	 *
581 	 * -Restart any timer that is already running but has
582 	 *  a value longer than the requested timeout.
583 	 *
584 	 * -Use the value specified in the query message as
585 	 *  the maximum timeout.
586 	 */
587 	next = (unsigned)INFINITY;
588 	mutex_enter(&ill->ill_lock);
589 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
590 
591 		/*
592 		 * A multicast router joins INADDR_ANY address
593 		 * to enable promiscuous reception of all
594 		 * mcasts from the interface. This INADDR_ANY
595 		 * is stored in the ilm_v6addr as V6 unspec addr
596 		 */
597 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
598 			continue;
599 		if (ilm->ilm_addr == htonl(INADDR_ANY))
600 			continue;
601 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
602 		    (igmpa->igmpa_group == 0) ||
603 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
604 			if (ilm->ilm_timer > timer) {
605 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
606 				if (ilm->ilm_timer < next)
607 					next = ilm->ilm_timer;
608 			}
609 		}
610 	}
611 	mutex_exit(&ill->ill_lock);
612 
613 	return (next);
614 }
615 
616 static uint_t
617 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
618 {
619 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
620 	ilm_t		*ilm;
621 	ipaddr_t	*src_array;
622 	uint8_t		qrv;
623 
624 	/* make sure numsrc matches packet size */
625 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
626 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
627 		++igmpstat.igps_rcv_tooshort;
628 		return (0);
629 	}
630 	src_array = (ipaddr_t *)&igmp3qa[1];
631 
632 	++igmpstat.igps_rcv_queries;
633 
634 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
635 		uint_t hdrval, mant, exp;
636 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
637 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
638 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
639 		mrd = (mant | 0x10) << (exp + 3);
640 	}
641 	if (mrd == 0)
642 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
643 	timer = DSEC_TO_MSEC(mrd);
644 	MCAST_RANDOM_DELAY(delay, timer);
645 	next = (unsigned)INFINITY;
646 
647 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
648 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
649 	else
650 		ill->ill_mcast_rv = qrv;
651 
652 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
653 		uint_t hdrval, mant, exp;
654 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
655 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
656 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
657 		qqi = (mant | 0x10) << (exp + 3);
658 	}
659 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
660 
661 	/*
662 	 * If we have a pending general query response that's scheduled
663 	 * sooner than the delay we calculated for this response, then
664 	 * no action is required (RFC3376 section 5.2 rule 1)
665 	 */
666 	mutex_enter(&ill->ill_lock);
667 	if (ill->ill_global_timer < delay) {
668 		mutex_exit(&ill->ill_lock);
669 		return (next);
670 	}
671 	mutex_exit(&ill->ill_lock);
672 
673 	/*
674 	 * Now take action depending upon query type:
675 	 * general, group specific, or group/source specific.
676 	 */
677 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
678 		/*
679 		 * general query
680 		 * We know global timer is either not running or is
681 		 * greater than our calculated delay, so reset it to
682 		 * our delay (random value in range [0, response time]).
683 		 */
684 		mutex_enter(&ill->ill_lock);
685 		ill->ill_global_timer = delay;
686 		next = ill->ill_global_timer;
687 		mutex_exit(&ill->ill_lock);
688 
689 	} else {
690 		/* group or group/source specific query */
691 		mutex_enter(&ill->ill_lock);
692 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
693 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
694 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
695 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
696 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
697 				continue;
698 			/*
699 			 * If the query is group specific or we have a
700 			 * pending group specific query, the response is
701 			 * group specific (pending sources list should be
702 			 * empty).  Otherwise, need to update the pending
703 			 * sources list for the group and source specific
704 			 * response.
705 			 */
706 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
707 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
708 group_query:
709 				FREE_SLIST(ilm->ilm_pendsrcs);
710 				ilm->ilm_pendsrcs = NULL;
711 			} else {
712 				boolean_t overflow;
713 				slist_t *pktl;
714 				if (numsrc > MAX_FILTER_SIZE ||
715 				    (ilm->ilm_pendsrcs == NULL &&
716 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
717 					/*
718 					 * We've been sent more sources than
719 					 * we can deal with; or we can't deal
720 					 * with a source list at all.  Revert
721 					 * to a group specific query.
722 					 */
723 					goto group_query;
724 				}
725 				if ((pktl = l_alloc()) == NULL)
726 					goto group_query;
727 				pktl->sl_numsrc = numsrc;
728 				for (i = 0; i < numsrc; i++)
729 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
730 					    &(pktl->sl_addr[i]));
731 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
732 				    &overflow);
733 				l_free(pktl);
734 				if (overflow)
735 					goto group_query;
736 			}
737 			/* choose soonest timer */
738 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
739 			if (ilm->ilm_timer < next)
740 				next = ilm->ilm_timer;
741 		}
742 		mutex_exit(&ill->ill_lock);
743 	}
744 
745 	return (next);
746 }
747 
748 void
749 igmp_joingroup(ilm_t *ilm)
750 {
751 	ill_t	*ill;
752 
753 	ill = ilm->ilm_ipif->ipif_ill;
754 
755 	ASSERT(IAM_WRITER_ILL(ill));
756 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
757 
758 	mutex_enter(&ill->ill_lock);
759 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
760 		ilm->ilm_rtx.rtx_timer = INFINITY;
761 		ilm->ilm_state = IGMP_OTHERMEMBER;
762 		mutex_exit(&ill->ill_lock);
763 	} else {
764 		ip1dbg(("Querier mode %d, sending report, group %x\n",
765 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
766 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
767 			mutex_exit(&ill->ill_lock);
768 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
769 			mutex_enter(&ill->ill_lock);
770 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
771 			mutex_exit(&ill->ill_lock);
772 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
773 			mutex_enter(&ill->ill_lock);
774 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
775 			mrec_t *rp;
776 			mcast_record_t rtype;
777 			/*
778 			 * The possible state changes we need to handle here:
779 			 *   Old State	New State	Report
780 			 *
781 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
782 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
783 			 *
784 			 * No need to send the BLOCK(0) report; ALLOW(X)
785 			 * is enough.
786 			 */
787 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
788 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
789 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
790 			    ilm->ilm_filter, NULL);
791 			mutex_exit(&ill->ill_lock);
792 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
793 			mutex_enter(&ill->ill_lock);
794 			/*
795 			 * Set up retransmission state.  Timer is set below,
796 			 * for both v3 and older versions.
797 			 */
798 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
799 			    ilm->ilm_filter);
800 		}
801 
802 		/* Set the ilm timer value */
803 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
804 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
805 		ilm->ilm_state = IGMP_IREPORTEDLAST;
806 		mutex_exit(&ill->ill_lock);
807 
808 		/*
809 		 * To avoid deadlock, we don't call igmp_start_timers from
810 		 * here. igmp_start_timers needs to call untimeout, and we
811 		 * can't hold the ipsq across untimeout since
812 		 * igmp_timeout_handler could be blocking trying to
813 		 * acquire the ipsq. Instead we start the timer after we get
814 		 * out of the ipsq in ipsq_exit.
815 		 */
816 		mutex_enter(&igmp_timer_lock);
817 		igmp_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
818 		    igmp_deferred_next);
819 		mutex_exit(&igmp_timer_lock);
820 	}
821 
822 	if (ip_debug > 1) {
823 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
824 		    "igmp_joingroup: multicast_type %d timer %d",
825 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
826 		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
827 	}
828 }
829 
830 void
831 mld_joingroup(ilm_t *ilm)
832 {
833 	ill_t	*ill;
834 
835 	ill = ilm->ilm_ill;
836 
837 	ASSERT(IAM_WRITER_ILL(ill));
838 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
839 
840 	mutex_enter(&ill->ill_lock);
841 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
842 		ilm->ilm_rtx.rtx_timer = INFINITY;
843 		ilm->ilm_state = IGMP_OTHERMEMBER;
844 		mutex_exit(&ill->ill_lock);
845 	} else {
846 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
847 			mutex_exit(&ill->ill_lock);
848 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
849 			mutex_enter(&ill->ill_lock);
850 		} else {
851 			mrec_t *rp;
852 			mcast_record_t rtype;
853 			/*
854 			 * The possible state changes we need to handle here:
855 			 *	Old State   New State	Report
856 			 *
857 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
858 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
859 			 *
860 			 * No need to send the BLOCK(0) report; ALLOW(X)
861 			 * is enough
862 			 */
863 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
864 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
865 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
866 			    ilm->ilm_filter, NULL);
867 			mutex_exit(&ill->ill_lock);
868 			mldv2_sendrpt(ill, rp);
869 			mutex_enter(&ill->ill_lock);
870 			/*
871 			 * Set up retransmission state.  Timer is set below,
872 			 * for both v2 and v1.
873 			 */
874 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
875 			    ilm->ilm_filter);
876 		}
877 
878 		/* Set the ilm timer value */
879 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
880 		    ilm->ilm_rtx.rtx_cnt > 0);
881 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
882 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
883 		ilm->ilm_state = IGMP_IREPORTEDLAST;
884 		mutex_exit(&ill->ill_lock);
885 
886 		/*
887 		 * To avoid deadlock, we don't call mld_start_timers from
888 		 * here. mld_start_timers needs to call untimeout, and we
889 		 * can't hold the ipsq (i.e. the lock) across untimeout
890 		 * since mld_timeout_handler could be blocking trying to
891 		 * acquire the ipsq. Instead we start the timer after we get
892 		 * out of the ipsq in ipsq_exit
893 		 */
894 		mutex_enter(&mld_timer_lock);
895 		mld_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
896 		    mld_deferred_next);
897 		mutex_exit(&mld_timer_lock);
898 	}
899 
900 	if (ip_debug > 1) {
901 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
902 		    "mld_joingroup: multicast_type %d timer %d",
903 		    (ilm->ilm_ill->ill_mcast_type),
904 		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
905 	}
906 }
907 
908 void
909 igmp_leavegroup(ilm_t *ilm)
910 {
911 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
912 
913 	ASSERT(ilm->ilm_ill == NULL);
914 	ASSERT(!ill->ill_isv6);
915 
916 	mutex_enter(&ill->ill_lock);
917 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
918 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
919 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
920 		mutex_exit(&ill->ill_lock);
921 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
922 		    (htonl(INADDR_ALLRTRS_GROUP)));
923 		return;
924 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
925 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
926 		mrec_t *rp;
927 		/*
928 		 * The possible state changes we need to handle here:
929 		 *	Old State	New State	Report
930 		 *
931 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
932 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
933 		 *
934 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
935 		 */
936 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
937 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
938 			    ilm->ilm_filter, NULL);
939 		} else {
940 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
941 			    NULL, NULL);
942 		}
943 		mutex_exit(&ill->ill_lock);
944 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
945 		return;
946 	}
947 	mutex_exit(&ill->ill_lock);
948 }
949 
950 void
951 mld_leavegroup(ilm_t *ilm)
952 {
953 	ill_t *ill = ilm->ilm_ill;
954 
955 	ASSERT(ilm->ilm_ipif == NULL);
956 	ASSERT(ill->ill_isv6);
957 
958 	mutex_enter(&ill->ill_lock);
959 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
960 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
961 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
962 		mutex_exit(&ill->ill_lock);
963 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
964 		return;
965 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
966 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
967 		mrec_t *rp;
968 		/*
969 		 * The possible state changes we need to handle here:
970 		 *	Old State	New State	Report
971 		 *
972 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
973 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
974 		 *
975 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
976 		 */
977 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
978 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
979 			    ilm->ilm_filter, NULL);
980 		} else {
981 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
982 			    NULL, NULL);
983 		}
984 		mutex_exit(&ill->ill_lock);
985 		mldv2_sendrpt(ill, rp);
986 		return;
987 	}
988 	mutex_exit(&ill->ill_lock);
989 }
990 
991 void
992 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
993 {
994 	ill_t *ill;
995 	mrec_t *rp;
996 
997 	ASSERT(ilm != NULL);
998 
999 	/* state change reports should only be sent if the router is v3 */
1000 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
1001 		return;
1002 
1003 	if (ilm->ilm_ill == NULL) {
1004 		ASSERT(ilm->ilm_ipif != NULL);
1005 		ill = ilm->ilm_ipif->ipif_ill;
1006 	} else {
1007 		ill = ilm->ilm_ill;
1008 	}
1009 
1010 	mutex_enter(&ill->ill_lock);
1011 
1012 	/*
1013 	 * Compare existing(old) state with the new state and prepare
1014 	 * State Change Report, according to the rules in RFC 3376:
1015 	 *
1016 	 *	Old State	New State	State Change Report
1017 	 *
1018 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1019 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1020 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1021 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1022 	 */
1023 
1024 	if (ilm->ilm_fmode == fmode) {
1025 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1026 		slist_t *allow, *block;
1027 		if (((a_minus_b = l_alloc()) == NULL) ||
1028 		    ((b_minus_a = l_alloc()) == NULL)) {
1029 			l_free(a_minus_b);
1030 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1031 				goto send_to_ex;
1032 			else
1033 				goto send_to_in;
1034 		}
1035 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1036 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1037 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1038 			allow = b_minus_a;
1039 			block = a_minus_b;
1040 		} else {
1041 			allow = a_minus_b;
1042 			block = b_minus_a;
1043 		}
1044 		rp = NULL;
1045 		if (!SLIST_IS_EMPTY(allow))
1046 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1047 			    allow, rp);
1048 		if (!SLIST_IS_EMPTY(block))
1049 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1050 			    block, rp);
1051 		l_free(a_minus_b);
1052 		l_free(b_minus_a);
1053 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1054 send_to_ex:
1055 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1056 		    NULL);
1057 	} else {
1058 send_to_in:
1059 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1060 		    NULL);
1061 	}
1062 
1063 	/*
1064 	 * Need to set up retransmission state; merge the new info with the
1065 	 * current state (which may be null).  If the timer is not currently
1066 	 * running, start it (need to do a delayed start of the timer as
1067 	 * we're currently in the sq).
1068 	 */
1069 	rp = mcast_merge_rtx(ilm, rp, flist);
1070 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1071 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1072 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1073 		mutex_enter(&igmp_timer_lock);
1074 		igmp_deferred_next = MIN(igmp_deferred_next,
1075 		    ilm->ilm_rtx.rtx_timer);
1076 		mutex_exit(&igmp_timer_lock);
1077 	}
1078 
1079 	mutex_exit(&ill->ill_lock);
1080 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1081 }
1082 
1083 void
1084 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1085 {
1086 	ill_t *ill;
1087 	mrec_t *rp = NULL;
1088 
1089 	ASSERT(ilm != NULL);
1090 
1091 	ill = ilm->ilm_ill;
1092 
1093 	/* only need to send if we have an mldv2-capable router */
1094 	mutex_enter(&ill->ill_lock);
1095 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1096 		mutex_exit(&ill->ill_lock);
1097 		return;
1098 	}
1099 
1100 	/*
1101 	 * Compare existing (old) state with the new state passed in
1102 	 * and send appropriate MLDv2 State Change Report.
1103 	 *
1104 	 *	Old State	New State	State Change Report
1105 	 *
1106 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1107 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1108 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1109 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1110 	 */
1111 	if (ilm->ilm_fmode == fmode) {
1112 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1113 		slist_t *allow, *block;
1114 		if (((a_minus_b = l_alloc()) == NULL) ||
1115 		    ((b_minus_a = l_alloc()) == NULL)) {
1116 			l_free(a_minus_b);
1117 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1118 				goto send_to_ex;
1119 			else
1120 				goto send_to_in;
1121 		}
1122 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1123 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1124 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1125 			allow = b_minus_a;
1126 			block = a_minus_b;
1127 		} else {
1128 			allow = a_minus_b;
1129 			block = b_minus_a;
1130 		}
1131 		if (!SLIST_IS_EMPTY(allow))
1132 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1133 			    allow, rp);
1134 		if (!SLIST_IS_EMPTY(block))
1135 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1136 			    block, rp);
1137 		l_free(a_minus_b);
1138 		l_free(b_minus_a);
1139 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1140 send_to_ex:
1141 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1142 		    NULL);
1143 	} else {
1144 send_to_in:
1145 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1146 		    NULL);
1147 	}
1148 
1149 	/*
1150 	 * Need to set up retransmission state; merge the new info with the
1151 	 * current state (which may be null).  If the timer is not currently
1152 	 * running, start it (need to do a deferred start of the timer as
1153 	 * we're currently in the sq).
1154 	 */
1155 	rp = mcast_merge_rtx(ilm, rp, flist);
1156 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1157 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1158 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1159 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1160 		mutex_enter(&mld_timer_lock);
1161 		mld_deferred_next =
1162 		    MIN(mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1163 		mutex_exit(&mld_timer_lock);
1164 	}
1165 
1166 	mutex_exit(&ill->ill_lock);
1167 	mldv2_sendrpt(ill, rp);
1168 }
1169 
/*
 * igmp_timeout_handler_per_ill:
 * Age the IGMP timers kept on one ill by `elapsed' and send the reports
 * whose timers have expired.  Three kinds of timer are handled, in order:
 * the ill-wide global timer, each ilm's query-response timer (ilm_timer),
 * and each ilm's retransmit timer (ilm_rtx.rtx_timer).
 *
 * The caller must be writer on the ill (asserted).  ill_lock is taken
 * here and is dropped around every call that transmits, then re-taken.
 *
 * Returns the smallest timer value still pending on this ill after
 * aging, or INFINITY if no timer is left running.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	uint_t	next = INFINITY;
	ilm_t	*ilm;
	ipif_t	*ipif;
	mrec_t	*rp = NULL;
	mrec_t	*rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			/* chain this group's record onto its ipif's list */
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		/* not yet due: age it and remember it as a candidate */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	/*
	 * Each pass handles the ilm's query-response timer first and then
	 * always falls into the retransmit timer check at per_ilm_rtxtimer.
	 */
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			/*
			 * NOTE(review): ilm_timer is compared and decremented
			 * as a plain count just above, so the ntohl() in this
			 * trace message looks unintended -- confirm.
			 */
			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d elap %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			/*
			 * NOTE(review): this reply is sent via ill->ill_ipif,
			 * whereas the retransmit path below and the global
			 * timer path above use the ilm's own ipif -- confirm
			 * that is intended.
			 */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/* start the next ilm's reply with an empty list */
			rp = NULL;
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		/* the retransmit timer has expired */
		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions owed: re-arm the timer */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* last retransmission: the saved lists are done with */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1381 
/*
 * igmp_timeout_handler:
 * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
 * Does not return a value; if any timer is still pending afterwards it
 * restarts the igmp timer for the earliest one via igmp_start_timers().
 *
 * As part of multicast join and leave, igmp may need to send out an
 * igmp request. The igmp related state variables in the ilm are protected
 * by ill_lock. A single global igmp timer is used to track igmp timeouts.
 * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
 * starts the igmp timer if needed. It serializes multiple threads trying to
 * simultaneously start the timer using the igmp_timer_setter_active flag.
 *
 * igmp_input() receives igmp queries and responds to the queries
 * in a delayed fashion by posting a timer, i.e. it calls igmp_start_timers().
 * Later the igmp_timer fires, and the timeout handler igmp_timeout_handler()
 * performs the action exclusively after entering each ill's ipsq as writer.
 * The actual igmp timeout handler needs to run in the ipsq since it has to
 * access the ilm's and we don't want another exclusive operation like
 * say an IPMP failover to be simultaneously moving the ilms from one ill to
 * another.
 *
 * The igmp_slowtimo() function is called through another timer.
 * igmp_slowtimeout_lock protects the igmp_slowtimeout_id.
 */
1406 
1407 /* ARGSUSED */
1408 void
1409 igmp_timeout_handler(void *arg)
1410 {
1411 	ill_t	*ill;
1412 	int	elapsed;	/* Since last call */
1413 	uint_t  global_next = INFINITY;
1414 	uint_t  next;
1415 	ill_walk_context_t ctx;
1416 	boolean_t success;
1417 
1418 	mutex_enter(&igmp_timer_lock);
1419 	ASSERT(igmp_timeout_id != 0);
1420 	igmp_timer_fired_last = ddi_get_lbolt();
1421 	elapsed = igmp_time_to_next;
1422 	igmp_time_to_next = 0;
1423 	mutex_exit(&igmp_timer_lock);
1424 
1425 	rw_enter(&ill_g_lock, RW_READER);
1426 	ill = ILL_START_WALK_V4(&ctx);
1427 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1428 		ASSERT(!ill->ill_isv6);
1429 		/*
1430 		 * We may not be able to refhold the ill if the ill/ipif
1431 		 * is changing. But we need to make sure that the ill will
1432 		 * not vanish. So we just bump up the ill_waiter count.
1433 		 */
1434 		if (!ill_waiter_inc(ill))
1435 			continue;
1436 		rw_exit(&ill_g_lock);
1437 		success = ipsq_enter(ill, B_TRUE);
1438 		if (success) {
1439 			next = igmp_timeout_handler_per_ill(ill, elapsed);
1440 			if (next < global_next)
1441 				global_next = next;
1442 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
1443 			    B_TRUE);
1444 		}
1445 		rw_enter(&ill_g_lock, RW_READER);
1446 		ill_waiter_dcr(ill);
1447 	}
1448 	rw_exit(&ill_g_lock);
1449 
1450 	mutex_enter(&igmp_timer_lock);
1451 	ASSERT(igmp_timeout_id != 0);
1452 	igmp_timeout_id = 0;
1453 	mutex_exit(&igmp_timer_lock);
1454 
1455 	if (global_next != INFINITY)
1456 		igmp_start_timers(global_next);
1457 }
1458 
/*
 * mld_timeout_handler_per_ill:
 * Age the MLD timers kept on one ill by `elapsed' and build/send the
 * reports whose timers have expired.  Three kinds of timer are handled,
 * in order: the ill-wide global timer, each ilm's query-response timer
 * (ilm_timer), and each ilm's retransmit timer (ilm_rtx.rtx_timer).
 *
 * The caller must be writer on the ill (asserted).  ill_lock is taken
 * here and is dropped around every call that transmits.
 *
 * Returns the smallest timer value still pending on this ill after
 * aging, or INFINITY if no timer is left running.
 */
/* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		/* not yet due: age it and remember it as a candidate */
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	/*
	 * rp collects the query-response records and rtxrp the retransmit
	 * records for all ilms; both lists are handed to mldv2_sendrpt()
	 * after the loop rather than per ilm.
	 */
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			/*
			 * NOTE(review): ilm_timer is compared and decremented
			 * as a plain count just above, so the ntohl() in this
			 * trace message looks unintended -- confirm.
			 */
			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d elap %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either there is no pending source list, or
				 * rsp could not be allocated: reply with the
				 * ilm's full current state instead.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		/* the retransmit timer has expired */
		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions owed: re-arm the timer */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			/* last retransmission: the saved lists are done with */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	/*
	 * NOTE(review): the accumulated rp/rtxrp lists are handed to
	 * mldv2_sendrpt() only when the ill is in MLD_V2_ROUTER mode at this
	 * point.  ill_lock is dropped inside the loop for the MLDv1 sends, so
	 * confirm the mode cannot change while records are outstanding;
	 * otherwise they would not be passed on here.
	 */
	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1652 
1653 /*
1654  * mld_timeout_handler:
1655  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1656  * Returns number of ticks to next event (or 0 if none).
1657  * MT issues are same as igmp_timeout_handler
1658  */
1659 /* ARGSUSED */
1660 void
1661 mld_timeout_handler(void *arg)
1662 {
1663 	ill_t	*ill;
1664 	int	elapsed;	/* Since last call */
1665 	uint_t  global_next = INFINITY;
1666 	uint_t  next;
1667 	ill_walk_context_t ctx;
1668 	boolean_t success;
1669 
1670 	mutex_enter(&mld_timer_lock);
1671 	ASSERT(mld_timeout_id != 0);
1672 	mld_timer_fired_last = ddi_get_lbolt();
1673 	elapsed = mld_time_to_next;
1674 	mld_time_to_next = 0;
1675 	mutex_exit(&mld_timer_lock);
1676 
1677 	rw_enter(&ill_g_lock, RW_READER);
1678 	ill = ILL_START_WALK_V6(&ctx);
1679 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1680 		ASSERT(ill->ill_isv6);
1681 		/*
1682 		 * We may not be able to refhold the ill if the ill/ipif
1683 		 * is changing. But we need to make sure that the ill will
1684 		 * not vanish. So we just bump up the ill_waiter count.
1685 		 */
1686 		if (!ill_waiter_inc(ill))
1687 			continue;
1688 		rw_exit(&ill_g_lock);
1689 		success = ipsq_enter(ill, B_TRUE);
1690 		if (success) {
1691 			next = mld_timeout_handler_per_ill(ill, elapsed);
1692 			if (next < global_next)
1693 				global_next = next;
1694 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
1695 			    B_FALSE);
1696 		}
1697 		rw_enter(&ill_g_lock, RW_READER);
1698 		ill_waiter_dcr(ill);
1699 	}
1700 	rw_exit(&ill_g_lock);
1701 
1702 	mutex_enter(&mld_timer_lock);
1703 	ASSERT(mld_timeout_id != 0);
1704 	mld_timeout_id = 0;
1705 	mutex_exit(&mld_timer_lock);
1706 
1707 	if (global_next != INFINITY)
1708 		mld_start_timers(global_next);
1709 }
1710 
/*
 * OVQP(ill): the Older Version Querier Present timeout for the given ill,
 * expressed as a count of slowtimo intervals.  It is computed as
 * (ill_mcast_rv * ill_mcast_qi + MCAST_QUERY_RESP_INTERVAL), scaled by
 * 1000 and divided by MCAST_SLOWTIMO_INTERVAL.
 */
#define	OVQP(ill) \
	(((((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) + \
	MCAST_QUERY_RESP_INTERVAL) * 1000) / MCAST_SLOWTIMO_INTERVAL)
1718 
1719 /*
1720  * igmp_slowtimo:
1721  * - Resets to new router if we didnt we hear from the router
1722  *   in IGMP_AGE_THRESHOLD seconds.
1723  * - Resets slowtimeout.
1724  */
1725 /* ARGSUSED */
1726 void
1727 igmp_slowtimo(void *arg)
1728 {
1729 	ill_t	*ill;
1730 	ill_if_t *ifp;
1731 	avl_tree_t *avl_tree;
1732 
1733 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1734 	rw_enter(&ill_g_lock, RW_READER);
1735 
1736 	/*
1737 	 * The ill_if_t list is circular, hence the odd loop parameters.
1738 	 *
1739 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1740 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1741 	 * structure (allowing us to skip if none of the instances have timers
1742 	 * running).
1743 	 */
1744 	for (ifp = IP_V4_ILL_G_LIST; ifp != (ill_if_t *)&IP_V4_ILL_G_LIST;
1745 	    ifp = ifp->illif_next) {
1746 		/*
1747 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1748 		 * a V1 or V2 query now and we miss seeing the count now,
1749 		 * we will see it the next time igmp_slowtimo is called.
1750 		 */
1751 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1752 			continue;
1753 
1754 		avl_tree = &ifp->illif_avl_by_ppa;
1755 		for (ill = avl_first(avl_tree); ill != NULL;
1756 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1757 			mutex_enter(&ill->ill_lock);
1758 			if (ill->ill_mcast_v1_tset == 1)
1759 				ill->ill_mcast_v1_time++;
1760 			if (ill->ill_mcast_v2_tset == 1)
1761 				ill->ill_mcast_v2_time++;
1762 			if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1763 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1764 					if (ill->ill_mcast_v2_tset > 0) {
1765 						ip1dbg(("V1 query timer "
1766 						    "expired on %s; switching "
1767 						    "mode to IGMP_V2\n",
1768 						    ill->ill_name));
1769 						ill->ill_mcast_type =
1770 						    IGMP_V2_ROUTER;
1771 					} else {
1772 						ip1dbg(("V1 query timer "
1773 						    "expired on %s; switching "
1774 						    "mode to IGMP_V3\n",
1775 						    ill->ill_name));
1776 						ill->ill_mcast_type =
1777 						    IGMP_V3_ROUTER;
1778 					}
1779 					ill->ill_mcast_v1_time = 0;
1780 					ill->ill_mcast_v1_tset = 0;
1781 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1782 				}
1783 			}
1784 			if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1785 				if (ill->ill_mcast_v2_time >= OVQP(ill)) {
1786 					ip1dbg(("V2 query timer expired on "
1787 					    "%s; switching mode to IGMP_V3\n",
1788 					    ill->ill_name));
1789 					ill->ill_mcast_type = IGMP_V3_ROUTER;
1790 					ill->ill_mcast_v2_time = 0;
1791 					ill->ill_mcast_v2_tset = 0;
1792 					atomic_add_16(&ifp->illif_mcast_v2, -1);
1793 				}
1794 			}
1795 			mutex_exit(&ill->ill_lock);
1796 		}
1797 
1798 	}
1799 	rw_exit(&ill_g_lock);
1800 	mutex_enter(&igmp_slowtimeout_lock);
1801 	igmp_slowtimeout_id = timeout(igmp_slowtimo, NULL,
1802 		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1803 	mutex_exit(&igmp_slowtimeout_lock);
1804 }
1805 
1806 /*
1807  * mld_slowtimo:
1808  * - Resets to newer version if we didn't hear from the older version router
1809  *   in MLD_AGE_THRESHOLD seconds.
1810  * - Restarts slowtimeout.
1811  */
1812 /* ARGSUSED */
1813 void
1814 mld_slowtimo(void *arg)
1815 {
1816 	ill_t *ill;
1817 	ill_if_t *ifp;
1818 	avl_tree_t *avl_tree;
1819 
1820 	/* See comments in igmp_slowtimo() above... */
1821 	rw_enter(&ill_g_lock, RW_READER);
1822 	for (ifp = IP_V6_ILL_G_LIST; ifp != (ill_if_t *)&IP_V6_ILL_G_LIST;
1823 	    ifp = ifp->illif_next) {
1824 
1825 		if (ifp->illif_mcast_v1 == 0)
1826 			continue;
1827 
1828 		avl_tree = &ifp->illif_avl_by_ppa;
1829 		for (ill = avl_first(avl_tree); ill != NULL;
1830 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1831 			mutex_enter(&ill->ill_lock);
1832 			if (ill->ill_mcast_v1_tset == 1)
1833 				ill->ill_mcast_v1_time++;
1834 			if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1835 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1836 					ip1dbg(("MLD query timer expired on"
1837 					    " %s; switching mode to MLD_V2\n",
1838 					    ill->ill_name));
1839 					ill->ill_mcast_type = MLD_V2_ROUTER;
1840 					ill->ill_mcast_v1_time = 0;
1841 					ill->ill_mcast_v1_tset = 0;
1842 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1843 				}
1844 			}
1845 			mutex_exit(&ill->ill_lock);
1846 		}
1847 	}
1848 	rw_exit(&ill_g_lock);
1849 	mutex_enter(&mld_slowtimeout_lock);
1850 	mld_slowtimeout_id = timeout(mld_slowtimo, NULL,
1851 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1852 	mutex_exit(&mld_slowtimeout_lock);
1853 }
1854 
1855 /*
1856  * igmp_sendpkt:
1857  * This will send to ip_wput like icmp_inbound.
1858  * Note that the lower ill (on which the membership is kept) is used
1859  * as an upper ill to pass in the multicast parameters.
1860  */
1861 static void
1862 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1863 {
1864 	mblk_t	*mp;
1865 	igmpa_t	*igmpa;
1866 	uint8_t *rtralert;
1867 	ipha_t	*ipha;
1868 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1869 	size_t	size  = hdrlen + sizeof (igmpa_t);
1870 	ipif_t 	*ipif = ilm->ilm_ipif;
1871 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1872 	mblk_t	*first_mp;
1873 	ipsec_out_t *io;
1874 
1875 	/*
1876 	 * We need to make sure this packet goes out on an ipif. If
1877 	 * there is some global policy match in ip_wput_ire, we need
1878 	 * to get to the right interface after IPSEC processing.
1879 	 * To make sure this multicast packet goes out on the right
1880 	 * interface, we attach an ipsec_out and initialize ill_index
1881 	 * like we did in ip_wput. To make sure that this packet does
1882 	 * not get forwarded on other interfaces or looped back, we
1883 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1884 	 * to B_FALSE.
1885 	 *
1886 	 * We also need to make sure that this does not get load balanced
1887 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1888 	 * here. If it gets load balanced, switches supporting igmp snooping
1889 	 * will send the packet that it receives for this multicast group
1890 	 * to the interface that we are sending on. As we have joined the
1891 	 * multicast group on this ill, by sending the packet out on this
1892 	 * ill, we receive all the packets back on this ill.
1893 	 */
1894 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1895 	if (first_mp == NULL)
1896 		return;
1897 
1898 	first_mp->b_datap->db_type = M_CTL;
1899 	first_mp->b_wptr += sizeof (ipsec_info_t);
1900 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1901 	/* ipsec_out_secure is B_FALSE now */
1902 	io = (ipsec_out_t *)first_mp->b_rptr;
1903 	io->ipsec_out_type = IPSEC_OUT;
1904 	io->ipsec_out_len = sizeof (ipsec_out_t);
1905 	io->ipsec_out_use_global_policy = B_TRUE;
1906 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1907 	io->ipsec_out_attach_if = B_TRUE;
1908 	io->ipsec_out_multicast_loop = B_FALSE;
1909 	io->ipsec_out_dontroute = B_TRUE;
1910 	io->ipsec_out_zoneid = ilm->ilm_zoneid;
1911 
1912 	mp = allocb(size, BPRI_HI);
1913 	if (mp == NULL) {
1914 		freemsg(first_mp);
1915 		return;
1916 	}
1917 	mp->b_wptr = mp->b_rptr + size;
1918 	first_mp->b_cont = mp;
1919 
1920 	ipha = (ipha_t *)mp->b_rptr;
1921 	rtralert = (uint8_t *)&(ipha[1]);
1922 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1923 	igmpa->igmpa_type   = type;
1924 	igmpa->igmpa_code   = 0;
1925 	igmpa->igmpa_group  = ilm->ilm_addr;
1926 	igmpa->igmpa_cksum  = 0;
1927 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1928 	if (igmpa->igmpa_cksum == 0)
1929 		igmpa->igmpa_cksum = 0xffff;
1930 
1931 	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
1932 	rtralert[1] = RTRALERT_LEN;
1933 	rtralert[2] = 0;
1934 	rtralert[3] = 0;
1935 
1936 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1937 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1938 	ipha->ipha_type_of_service 	= 0;
1939 	ipha->ipha_length = htons(size);
1940 	ipha->ipha_ident = 0;
1941 	ipha->ipha_fragment_offset_and_flags = 0;
1942 	ipha->ipha_ttl 		= IGMP_TTL;
1943 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1944 	ipha->ipha_hdr_checksum 	= 0;
1945 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1946 	ipha->ipha_src 		= ipif->ipif_src_addr;
1947 	/*
1948 	 * Request loopback of the report if we are acting as a multicast
1949 	 * router, so that the process-level routing demon can hear it.
1950 	 */
1951 	/*
1952 	 * This will run multiple times for the same group if there are members
1953 	 * on the same group for multiple ipif's on the same ill. The
1954 	 * igmp_input code will suppress this due to the loopback thus we
1955 	 * always loopback membership report.
1956 	 */
1957 	ASSERT(ill->ill_rq != NULL);
1958 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1959 
1960 	ip_wput_multicast(ill->ill_wq, first_mp, ipif);
1961 
1962 	++igmpstat.igps_snd_reports;
1963 }
1964 
1965 /*
1966  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1967  * with the passed-in ipif.  The report will contain one group record
1968  * for each element of reclist.  If this causes packet length to
1969  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1970  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1971  * and those buffers are freed here.
1972  */
1973 static void
1974 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1975 {
1976 	ipsec_out_t *io;
1977 	igmp3ra_t *igmp3ra;
1978 	grphdra_t *grphdr;
1979 	mblk_t *first_mp, *mp;
1980 	ipha_t *ipha;
1981 	uint8_t *rtralert;
1982 	ipaddr_t *src_array;
1983 	int i, j, numrec, more_src_cnt;
1984 	size_t hdrsize, size, rsize;
1985 	ill_t *ill = ipif->ipif_ill;
1986 	mrec_t *rp, *cur_reclist;
1987 	mrec_t *next_reclist = reclist;
1988 	boolean_t morepkts;
1989 
1990 	/* if there aren't any records, there's nothing to send */
1991 	if (reclist == NULL)
1992 		return;
1993 
1994 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1995 nextpkt:
1996 	size = hdrsize + sizeof (igmp3ra_t);
1997 	morepkts = B_FALSE;
1998 	more_src_cnt = 0;
1999 	cur_reclist = next_reclist;
2000 	numrec = 0;
2001 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2002 		rsize = sizeof (grphdra_t) +
2003 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
2004 		if (size + rsize > ill->ill_max_frag) {
2005 			if (rp == cur_reclist) {
2006 				/*
2007 				 * If the first mrec we looked at is too big
2008 				 * to fit in a single packet (i.e the source
2009 				 * list is too big), we must either truncate
2010 				 * the list (if TO_EX or IS_EX), or send
2011 				 * multiple reports for the same group (all
2012 				 * other types).
2013 				 */
2014 				int srcspace, srcsperpkt;
2015 				srcspace = ill->ill_max_frag - (size +
2016 				    sizeof (grphdra_t));
2017 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2018 				/*
2019 				 * Increment size and numrec, because we will
2020 				 * be sending a record for the mrec we're
2021 				 * looking at now.
2022 				 */
2023 				size += sizeof (grphdra_t) +
2024 				    (srcsperpkt * sizeof (ipaddr_t));
2025 				numrec++;
2026 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2027 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2028 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2029 					if (rp->mrec_next == NULL) {
2030 						/* no more packets to send */
2031 						break;
2032 					} else {
2033 						/*
2034 						 * more packets, but we're
2035 						 * done with this mrec.
2036 						 */
2037 						next_reclist = rp->mrec_next;
2038 					}
2039 				} else {
2040 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2041 					    - srcsperpkt;
2042 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2043 					/*
2044 					 * We'll fix up this mrec (remove the
2045 					 * srcs we've already sent) before
2046 					 * returning to nextpkt above.
2047 					 */
2048 					next_reclist = rp;
2049 				}
2050 			} else {
2051 				next_reclist = rp;
2052 			}
2053 			morepkts = B_TRUE;
2054 			break;
2055 		}
2056 		size += rsize;
2057 		numrec++;
2058 	}
2059 
2060 	/*
2061 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2062 	 * load balancing requirements.
2063 	 */
2064 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2065 	if (first_mp == NULL)
2066 		goto free_reclist;
2067 
2068 	first_mp->b_datap->db_type = M_CTL;
2069 	first_mp->b_wptr += sizeof (ipsec_info_t);
2070 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2071 	/* ipsec_out_secure is B_FALSE now */
2072 	io = (ipsec_out_t *)first_mp->b_rptr;
2073 	io->ipsec_out_type = IPSEC_OUT;
2074 	io->ipsec_out_len = sizeof (ipsec_out_t);
2075 	io->ipsec_out_use_global_policy = B_TRUE;
2076 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2077 	io->ipsec_out_attach_if = B_TRUE;
2078 	io->ipsec_out_multicast_loop = B_FALSE;
2079 	io->ipsec_out_dontroute = B_TRUE;
2080 	io->ipsec_out_zoneid = ipif->ipif_zoneid;
2081 
2082 	mp = allocb(size, BPRI_HI);
2083 	if (mp == NULL) {
2084 		freemsg(first_mp);
2085 		goto free_reclist;
2086 	}
2087 	bzero((char *)mp->b_rptr, size);
2088 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2089 	first_mp->b_cont = mp;
2090 
2091 	ipha = (ipha_t *)mp->b_rptr;
2092 	rtralert = (uint8_t *)&(ipha[1]);
2093 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2094 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2095 
2096 	rp = cur_reclist;
2097 	for (i = 0; i < numrec; i++) {
2098 		grphdr->grphdra_type = rp->mrec_type;
2099 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2100 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2101 		src_array = (ipaddr_t *)&(grphdr[1]);
2102 
2103 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2104 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2105 
2106 		grphdr = (grphdra_t *)&(src_array[j]);
2107 		rp = rp->mrec_next;
2108 	}
2109 
2110 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2111 	igmp3ra->igmp3ra_numrec = htons(numrec);
2112 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2113 
2114 	rtralert[0] = IPOPT_COPY & IPOPT_RTRALERT;
2115 	rtralert[1] = RTRALERT_LEN;
2116 	rtralert[2] = 0;
2117 	rtralert[3] = 0;
2118 
2119 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2120 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2121 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2122 	ipha->ipha_length = htons(size);
2123 	ipha->ipha_ttl = IGMP_TTL;
2124 	ipha->ipha_protocol = IPPROTO_IGMP;
2125 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2126 	ipha->ipha_src = ipif->ipif_src_addr;
2127 
2128 	/*
2129 	 * Request loopback of the report if we are acting as a multicast
2130 	 * router, so that the process-level routing daemon can hear it.
2131 	 *
2132 	 * This will run multiple times for the same group if there are
2133 	 * members on the same group for multiple ipifs on the same ill.
2134 	 * The igmp_input code will suppress this due to the loopback;
2135 	 * thus we always loopback membership report.
2136 	 */
2137 	ASSERT(ill->ill_rq != NULL);
2138 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2139 
2140 	ip_wput_multicast(ill->ill_wq, first_mp, ipif);
2141 
2142 	++igmpstat.igps_snd_reports;
2143 
2144 	if (morepkts) {
2145 		if (more_src_cnt > 0) {
2146 			int index, mvsize;
2147 			slist_t *sl = &next_reclist->mrec_srcs;
2148 			index = sl->sl_numsrc;
2149 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2150 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2151 			    mvsize);
2152 			sl->sl_numsrc = more_src_cnt;
2153 		}
2154 		goto nextpkt;
2155 	}
2156 
2157 free_reclist:
2158 	while (reclist != NULL) {
2159 		rp = reclist->mrec_next;
2160 		mi_free(reclist);
2161 		reclist = rp;
2162 	}
2163 }
2164 
2165 /*
2166  * mld_input:
2167  */
2168 /* ARGSUSED */
2169 void
2170 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2171 {
2172 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2173 	mld_hdr_t	*mldh;
2174 	ilm_t		*ilm;
2175 	ipif_t		*ipif;
2176 	uint16_t	hdr_length, exthdr_length;
2177 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2178 	uint_t		next;
2179 	int		mldlen;
2180 
2181 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2182 
2183 	/* Make sure the src address of the packet is link-local */
2184 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2185 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2186 		freemsg(mp);
2187 		return;
2188 	}
2189 
2190 	if (ip6h->ip6_hlim != 1) {
2191 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2192 		freemsg(mp);
2193 		return;
2194 	}
2195 
2196 	/* Get to the icmp header part */
2197 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2198 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2199 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2200 	} else {
2201 		hdr_length = IPV6_HDR_LEN;
2202 		exthdr_length = 0;
2203 	}
2204 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2205 
2206 	/* An MLD packet must at least be 24 octets to be valid */
2207 	if (mldlen < MLD_MINLEN) {
2208 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2209 		freemsg(mp);
2210 		return;
2211 	}
2212 
2213 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2214 
2215 	switch (mldh->mld_type) {
2216 	case MLD_LISTENER_QUERY:
2217 		/*
2218 		 * packet length differentiates between v1 and v2.  v1
2219 		 * query should be exactly 24 octets long; v2 is >= 28.
2220 		 */
2221 		if (mldlen == MLD_MINLEN) {
2222 			next = mld_query_in(mldh, ill);
2223 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2224 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2225 		} else {
2226 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2227 			freemsg(mp);
2228 			return;
2229 		}
2230 		if (next == 0) {
2231 			freemsg(mp);
2232 			return;
2233 		}
2234 
2235 		if (next != INFINITY)
2236 			mld_start_timers(next);
2237 		break;
2238 
2239 	case MLD_LISTENER_REPORT: {
2240 
2241 		ASSERT(ill->ill_ipif != NULL);
2242 		/*
2243 		 * For fast leave to work, we have to know that we are the
2244 		 * last person to send a report for this group.  Reports
2245 		 * generated by us are looped back since we could potentially
2246 		 * be a multicast router, so discard reports sourced by me.
2247 		 */
2248 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2249 		mutex_enter(&ill->ill_lock);
2250 		for (ipif = ill->ill_ipif; ipif != NULL;
2251 		    ipif = ipif->ipif_next) {
2252 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2253 			    lcladdr_ptr)) {
2254 				if (ip_debug > 1) {
2255 					char    buf1[INET6_ADDRSTRLEN];
2256 					char	buf2[INET6_ADDRSTRLEN];
2257 
2258 					(void) mi_strlog(ill->ill_rq,
2259 					    1,
2260 					    SL_TRACE,
2261 					    "mld_input: we are only "
2262 					    "member src %s ipif_local %s",
2263 					    inet_ntop(AF_INET6, lcladdr_ptr,
2264 					    buf1, sizeof (buf1)),
2265 					    inet_ntop(AF_INET6,
2266 					    &ipif->ipif_v6lcl_addr,
2267 					    buf2, sizeof (buf2)));
2268 				}
2269 				mutex_exit(&ill->ill_lock);
2270 				freemsg(mp);
2271 				return;
2272 			}
2273 		}
2274 		mutex_exit(&ill->ill_lock);
2275 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2276 
2277 		v6group_ptr = &mldh->mld_addr;
2278 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2279 			BUMP_MIB(ill->ill_icmp6_mib,
2280 			    ipv6IfIcmpInGroupMembBadReports);
2281 			freemsg(mp);
2282 			return;
2283 		}
2284 
2285 
2286 		/*
2287 		 * If we belong to the group being reported, and we are a
2288 		 * 'Delaying member' per the RFC terminology, stop our timer
2289 		 * for that group and 'clear flag' i.e. mark ilm_state as
2290 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2291 		 * membership entries for the same group address (one per zone)
2292 		 * so we need to walk the ill_ilm list.
2293 		 */
2294 		mutex_enter(&ill->ill_lock);
2295 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2296 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2297 			    continue;
2298 			BUMP_MIB(ill->ill_icmp6_mib,
2299 			    ipv6IfIcmpInGroupMembOurReports);
2300 
2301 			ilm->ilm_timer = INFINITY;
2302 			ilm->ilm_state = IGMP_OTHERMEMBER;
2303 		}
2304 		mutex_exit(&ill->ill_lock);
2305 		break;
2306 	}
2307 	case MLD_LISTENER_REDUCTION:
2308 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2309 		break;
2310 	}
2311 	/*
2312 	 * All MLD packets have already been passed up to any
2313 	 * process(es) listening on a ICMP6 raw socket. This
2314 	 * has been accomplished in ip_deliver_local_v6 prior to
2315 	 * this function call. It is assumed that the multicast daemon
2316 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2317 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2318 	 * Thus we can free the MLD message block here
2319 	 */
2320 	freemsg(mp);
2321 }
2322 
2323 /*
2324  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2325  * (non-zero, unsigned) timer value to be set on success.
2326  */
2327 static uint_t
2328 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2329 {
2330 	ilm_t	*ilm;
2331 	int	timer;
2332 	uint_t	next;
2333 	in6_addr_t *v6group;
2334 
2335 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2336 
2337 	/*
2338 	 * In the MLD specification, there are 3 states and a flag.
2339 	 *
2340 	 * In Non-Listener state, we simply don't have a membership record.
2341 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2342 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2343 	 * INFINITY)
2344 	 *
2345 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2346 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2347 	 * if I sent the last report.
2348 	 */
2349 	v6group = &mldh->mld_addr;
2350 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2351 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2352 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2353 		return (0);
2354 	}
2355 
2356 	/* Need to do compatibility mode checking */
2357 	mutex_enter(&ill->ill_lock);
2358 	ill->ill_mcast_v1_time = 0;
2359 	ill->ill_mcast_v1_tset = 1;
2360 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2361 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2362 		    "MLD_V1_ROUTER\n", ill->ill_name));
2363 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2364 		ill->ill_mcast_type = MLD_V1_ROUTER;
2365 	}
2366 	mutex_exit(&ill->ill_lock);
2367 
2368 	timer = (int)ntohs(mldh->mld_maxdelay);
2369 	if (ip_debug > 1) {
2370 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2371 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2372 		    timer, (int)mldh->mld_type);
2373 	}
2374 
2375 	/*
2376 	 * -Start the timers in all of our membership records for
2377 	 * the physical interface on which the query arrived,
2378 	 * excl:
2379 	 *	1.  those that belong to the "all hosts" group,
2380 	 *	2.  those with 0 scope, or 1 node-local scope.
2381 	 *
2382 	 * -Restart any timer that is already running but has a value
2383 	 * longer that the requested timeout.
2384 	 * -Use the value specified in the query message as the
2385 	 * maximum timeout.
2386 	 */
2387 	next = INFINITY;
2388 	mutex_enter(&ill->ill_lock);
2389 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2390 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2391 
2392 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2393 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2394 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2395 			continue;
2396 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2397 		    &ipv6_all_hosts_mcast)) &&
2398 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2399 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2400 			if (timer == 0) {
2401 				/* Respond immediately */
2402 				ilm->ilm_timer = INFINITY;
2403 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2404 				mutex_exit(&ill->ill_lock);
2405 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2406 				mutex_enter(&ill->ill_lock);
2407 				break;
2408 			}
2409 			if (ilm->ilm_timer > timer) {
2410 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2411 				if (ilm->ilm_timer < next)
2412 					next = ilm->ilm_timer;
2413 			}
2414 			break;
2415 		}
2416 	}
2417 	mutex_exit(&ill->ill_lock);
2418 
2419 	return (next);
2420 }
2421 
2422 /*
2423  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2424  * returns the appropriate (non-zero, unsigned) timer value (which may
2425  * be INFINITY) to be set.
2426  */
2427 static uint_t
2428 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2429 {
2430 	ilm_t	*ilm;
2431 	in6_addr_t *v6group, *src_array;
2432 	uint_t	next, numsrc, i, mrd, delay, qqi;
2433 	uint8_t	qrv;
2434 
2435 	v6group = &mld2q->mld2q_addr;
2436 	numsrc = ntohs(mld2q->mld2q_numsrc);
2437 
2438 	/* make sure numsrc matches packet size */
2439 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2440 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2441 		return (0);
2442 	}
2443 	src_array = (in6_addr_t *)&mld2q[1];
2444 
2445 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2446 
2447 	/* extract Maximum Response Delay from code in header */
2448 	mrd = ntohs(mld2q->mld2q_mxrc);
2449 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2450 		uint_t hdrval, mant, exp;
2451 		hdrval = mrd;
2452 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2453 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2454 		mrd = (mant | 0x1000) << (exp + 3);
2455 	}
2456 	MCAST_RANDOM_DELAY(delay, mrd);
2457 	next = (unsigned)INFINITY;
2458 
2459 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2460 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2461 	else
2462 		ill->ill_mcast_rv = qrv;
2463 
2464 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2465 		uint_t mant, exp;
2466 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2467 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2468 		qqi = (mant | 0x10) << (exp + 3);
2469 	}
2470 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2471 
2472 	/*
2473 	 * If we have a pending general query response that's scheduled
2474 	 * sooner than the delay we calculated for this response, then
2475 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2476 	 */
2477 	mutex_enter(&ill->ill_lock);
2478 	if (ill->ill_global_timer < delay) {
2479 		mutex_exit(&ill->ill_lock);
2480 		return (next);
2481 	}
2482 	mutex_exit(&ill->ill_lock);
2483 
2484 	/*
2485 	 * Now take action depending on query type: general,
2486 	 * group specific, or group/source specific.
2487 	 */
2488 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2489 		/*
2490 		 * general query
2491 		 * We know global timer is either not running or is
2492 		 * greater than our calculated delay, so reset it to
2493 		 * our delay (random value in range [0, response time])
2494 		 */
2495 		mutex_enter(&ill->ill_lock);
2496 		ill->ill_global_timer = delay;
2497 		next = ill->ill_global_timer;
2498 		mutex_exit(&ill->ill_lock);
2499 
2500 	} else {
2501 		/* group or group/source specific query */
2502 		mutex_enter(&ill->ill_lock);
2503 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2504 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2505 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2506 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2507 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2508 				continue;
2509 
2510 			/*
2511 			 * If the query is group specific or we have a
2512 			 * pending group specific query, the response is
2513 			 * group specific (pending sources list should be
2514 			 * empty).  Otherwise, need to update the pending
2515 			 * sources list for the group and source specific
2516 			 * response.
2517 			 */
2518 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2519 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2520 group_query:
2521 				FREE_SLIST(ilm->ilm_pendsrcs);
2522 				ilm->ilm_pendsrcs = NULL;
2523 			} else {
2524 				boolean_t overflow;
2525 				slist_t *pktl;
2526 				if (numsrc > MAX_FILTER_SIZE ||
2527 				    (ilm->ilm_pendsrcs == NULL &&
2528 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2529 					/*
2530 					 * We've been sent more sources than
2531 					 * we can deal with; or we can't deal
2532 					 * with a source list at all. Revert
2533 					 * to a group specific query.
2534 					 */
2535 					goto group_query;
2536 				}
2537 				if ((pktl = l_alloc()) == NULL)
2538 					goto group_query;
2539 				pktl->sl_numsrc = numsrc;
2540 				for (i = 0; i < numsrc; i++)
2541 					pktl->sl_addr[i] = src_array[i];
2542 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2543 				    &overflow);
2544 				l_free(pktl);
2545 				if (overflow)
2546 					goto group_query;
2547 			}
2548 			/* set timer to soonest value */
2549 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2550 			if (ilm->ilm_timer < next)
2551 				next = ilm->ilm_timer;
2552 			break;
2553 		}
2554 		mutex_exit(&ill->ill_lock);
2555 	}
2556 
2557 	return (next);
2558 }
2559 
2560 /*
2561  * Send MLDv1 response packet with hoplimit 1
2562  */
2563 static void
2564 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2565 {
2566 	mblk_t		*mp;
2567 	mld_hdr_t	*mldh;
2568 	ip6_t 		*ip6h;
2569 	ip6_hbh_t	*ip6hbh;
2570 	struct ip6_opt_router	*ip6router;
2571 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2572 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2573 	ipif_t		*ipif;
2574 	ip6i_t		*ip6i;
2575 
2576 	/*
2577 	 * We need to place a router alert option in this packet.  The length
2578 	 * of the options must be a multiple of 8.  The hbh option header is 2
2579 	 * bytes followed by the 4 byte router alert option.  That leaves
2580 	 * 2 bytes of pad for a total of 8 bytes.
2581 	 */
2582 	const int	router_alert_length = 8;
2583 
2584 	ASSERT(ill->ill_isv6);
2585 
2586 	/*
2587 	 * We need to make sure that this packet does not get load balanced.
2588 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2589 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2590 	 * If it gets load balanced, switches supporting MLD snooping
2591 	 * (in the future) will send the packet that it receives for this
2592 	 * multicast group to the interface that we are sending on. As we have
2593 	 * joined the multicast group on this ill, by sending the packet out
2594 	 * on this ill, we receive all the packets back on this ill.
2595 	 */
2596 	size += sizeof (ip6i_t) + router_alert_length;
2597 	mp = allocb(size, BPRI_HI);
2598 	if (mp == NULL)
2599 		return;
2600 	bzero(mp->b_rptr, size);
2601 	mp->b_wptr = mp->b_rptr + size;
2602 
2603 	ip6i = (ip6i_t *)mp->b_rptr;
2604 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2605 	ip6i->ip6i_nxt = IPPROTO_RAW;
2606 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2607 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2608 
2609 	ip6h = (ip6_t *)&ip6i[1];
2610 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2611 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2612 	/*
2613 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2614 	 * above will pad between ip6router and mld.
2615 	 */
2616 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2617 
2618 	mldh->mld_type = type;
2619 	mldh->mld_addr = ilm->ilm_v6addr;
2620 
2621 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2622 	ip6router->ip6or_len = 2;
2623 	ip6router->ip6or_value[0] = 0;
2624 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2625 
2626 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2627 	ip6hbh->ip6h_len = 0;
2628 
2629 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2630 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2631 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2632 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2633 	if (v6addr == NULL)
2634 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2635 	else
2636 		ip6h->ip6_dst = *v6addr;
2637 
2638 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2639 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2640 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2641 		ipif_refrele(ipif);
2642 	} else {
2643 		/* Otherwise, use IPv6 default address selection. */
2644 		ip6h->ip6_src = ipv6_all_zeros;
2645 	}
2646 
2647 	/*
2648 	 * Prepare for checksum by putting icmp length in the icmp
2649 	 * checksum field. The checksum is calculated in ip_wput_v6.
2650 	 */
2651 	mldh->mld_cksum = htons(sizeof (*mldh));
2652 
2653 	/*
2654 	 * ip_wput will automatically loopback the multicast packet to
2655 	 * the conn if multicast loopback is enabled.
2656 	 * The MIB stats corresponding to this outgoing MLD packet
2657 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2658 	 * ->icmp_update_out_mib_v6 function call.
2659 	 */
2660 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2661 }
2662 
2663 /*
2664  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2665  * report will contain one multicast address record for each element of
2666  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2667  * multiple reports are sent.  reclist is assumed to be made up of
2668  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2669  */
2670 static void
2671 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2672 {
2673 	mblk_t		*mp;
2674 	mld2r_t		*mld2r;
2675 	mld2mar_t	*mld2mar;
2676 	in6_addr_t	*srcarray;
2677 	ip6_t		*ip6h;
2678 	ip6_hbh_t	*ip6hbh;
2679 	ip6i_t		*ip6i;
2680 	struct ip6_opt_router	*ip6router;
2681 	size_t		size, optlen, padlen, icmpsize, rsize;
2682 	ipif_t		*ipif;
2683 	int		i, numrec, more_src_cnt;
2684 	mrec_t		*rp, *cur_reclist;
2685 	mrec_t		*next_reclist = reclist;
2686 	boolean_t	morepkts;
2687 
2688 	/* If there aren't any records, there's nothing to send */
2689 	if (reclist == NULL)
2690 		return;
2691 
2692 	ASSERT(ill->ill_isv6);
2693 
2694 	/*
2695 	 * Total option length (optlen + padlen) must be a multiple of
2696 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2697 	 * length will be 8.  Assert this in case anything ever changes.
2698 	 */
2699 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2700 	ASSERT(optlen <= 8);
2701 	padlen = 8 - optlen;
2702 nextpkt:
2703 	icmpsize = sizeof (mld2r_t);
2704 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2705 	morepkts = B_FALSE;
2706 	more_src_cnt = 0;
2707 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2708 	    rp = rp->mrec_next, numrec++) {
2709 		rsize = sizeof (mld2mar_t) +
2710 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2711 		if (size + rsize > ill->ill_max_frag) {
2712 			if (rp == cur_reclist) {
2713 				/*
2714 				 * If the first mrec we looked at is too big
2715 				 * to fit in a single packet (i.e the source
2716 				 * list is too big), we must either truncate
2717 				 * the list (if TO_EX or IS_EX), or send
2718 				 * multiple reports for the same group (all
2719 				 * other types).
2720 				 */
2721 				int srcspace, srcsperpkt;
2722 				srcspace = ill->ill_max_frag -
2723 				    (size + sizeof (mld2mar_t));
2724 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2725 				/*
2726 				 * Increment icmpsize and size, because we will
2727 				 * be sending a record for the mrec we're
2728 				 * looking at now.
2729 				 */
2730 				rsize = sizeof (mld2mar_t) +
2731 				    (srcsperpkt * sizeof (in6_addr_t));
2732 				icmpsize += rsize;
2733 				size += rsize;
2734 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2735 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2736 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2737 					if (rp->mrec_next == NULL) {
2738 						/* no more packets to send */
2739 						break;
2740 					} else {
2741 						/*
2742 						 * more packets, but we're
2743 						 * done with this mrec.
2744 						 */
2745 						next_reclist = rp->mrec_next;
2746 					}
2747 				} else {
2748 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2749 					    - srcsperpkt;
2750 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2751 					/*
2752 					 * We'll fix up this mrec (remove the
2753 					 * srcs we've already sent) before
2754 					 * returning to nextpkt above.
2755 					 */
2756 					next_reclist = rp;
2757 				}
2758 			} else {
2759 				next_reclist = rp;
2760 			}
2761 			morepkts = B_TRUE;
2762 			break;
2763 		}
2764 		icmpsize += rsize;
2765 		size += rsize;
2766 	}
2767 
2768 	/*
2769 	 * We need to make sure that this packet does not get load balanced.
2770 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2771 	 * ip_newroute_ipif_v6 know how to handle such packets.
2772 	 * If it gets load balanced, switches supporting MLD snooping
2773 	 * (in the future) will send the packet that it receives for this
2774 	 * multicast group to the interface that we are sending on. As we have
2775 	 * joined the multicast group on this ill, by sending the packet out
2776 	 * on this ill, we receive all the packets back on this ill.
2777 	 */
2778 	size += sizeof (ip6i_t);
2779 	mp = allocb(size, BPRI_HI);
2780 	if (mp == NULL)
2781 		goto free_reclist;
2782 	bzero(mp->b_rptr, size);
2783 	mp->b_wptr = mp->b_rptr + size;
2784 
2785 	ip6i = (ip6i_t *)mp->b_rptr;
2786 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2787 	ip6i->ip6i_nxt = IPPROTO_RAW;
2788 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2789 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2790 
2791 	ip6h = (ip6_t *)&(ip6i[1]);
2792 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2793 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2794 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2795 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2796 
2797 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2798 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2799 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2800 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2801 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2802 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2803 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2804 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2805 		ipif_refrele(ipif);
2806 	} else {
2807 		/* otherwise, use IPv6 default address selection. */
2808 		ip6h->ip6_src = ipv6_all_zeros;
2809 	}
2810 
2811 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2812 	/*
2813 	 * ip6h_len is the number of 8-byte words, not including the first
2814 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2815 	 */
2816 	ip6hbh->ip6h_len = 0;
2817 
2818 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2819 	ip6router->ip6or_len = 2;
2820 	ip6router->ip6or_value[0] = 0;
2821 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2822 
2823 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2824 	mld2r->mld2r_nummar = htons(numrec);
2825 	/*
2826 	 * Prepare for the checksum by putting icmp length in the icmp
2827 	 * checksum field. The checksum is calculated in ip_wput_v6.
2828 	 */
2829 	mld2r->mld2r_cksum = htons(icmpsize);
2830 
2831 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2832 		mld2mar->mld2mar_type = rp->mrec_type;
2833 		mld2mar->mld2mar_auxlen = 0;
2834 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2835 		mld2mar->mld2mar_group = rp->mrec_group;
2836 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2837 
2838 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2839 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2840 
2841 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2842 	}
2843 
2844 	/*
2845 	 * ip_wput will automatically loopback the multicast packet to
2846 	 * the conn if multicast loopback is enabled.
2847 	 * The MIB stats corresponding to this outgoing MLD packet
2848 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2849 	 * ->icmp_update_out_mib_v6 function call.
2850 	 */
2851 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2852 
2853 	if (morepkts) {
2854 		if (more_src_cnt > 0) {
2855 			int index, mvsize;
2856 			slist_t *sl = &next_reclist->mrec_srcs;
2857 			index = sl->sl_numsrc;
2858 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2859 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2860 			    mvsize);
2861 			sl->sl_numsrc = more_src_cnt;
2862 		}
2863 		goto nextpkt;
2864 	}
2865 
2866 free_reclist:
2867 	while (reclist != NULL) {
2868 		rp = reclist->mrec_next;
2869 		mi_free(reclist);
2870 		reclist = rp;
2871 	}
2872 }
2873 
2874 static mrec_t *
2875 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2876     mrec_t *next)
2877 {
2878 	mrec_t *rp;
2879 	int i;
2880 
2881 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2882 	    SLIST_IS_EMPTY(srclist))
2883 		return (next);
2884 
2885 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2886 	if (rp == NULL)
2887 		return (next);
2888 
2889 	rp->mrec_next = next;
2890 	rp->mrec_type = type;
2891 	rp->mrec_auxlen = 0;
2892 	rp->mrec_group = *grp;
2893 	if (srclist == NULL) {
2894 		rp->mrec_srcs.sl_numsrc = 0;
2895 	} else {
2896 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2897 		for (i = 0; i < srclist->sl_numsrc; i++)
2898 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2899 	}
2900 
2901 	return (rp);
2902 }
2903 
2904 /*
2905  * Set up initial retransmit state.  If memory cannot be allocated for
2906  * the source lists, simply create as much state as is possible; memory
2907  * allocation failures are considered one type of transient error that
2908  * the retransmissions are designed to overcome (and if they aren't
2909  * transient, there are bigger problems than failing to notify the
2910  * router about multicast group membership state changes).
2911  */
2912 static void
2913 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2914     slist_t *flist)
2915 {
2916 	/*
2917 	 * There are only three possibilities for rtype:
2918 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2919 	 *	  => rtype is ALLOW_NEW_SOURCES
2920 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2921 	 *	  => rtype is CHANGE_TO_EXCLUDE
2922 	 *	State change that involves a filter mode change
2923 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2924 	 */
2925 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2926 	    rtype == ALLOW_NEW_SOURCES);
2927 
2928 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2929 
2930 	switch (rtype) {
2931 	case CHANGE_TO_EXCLUDE:
2932 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2933 		CLEAR_SLIST(rtxp->rtx_allow);
2934 		COPY_SLIST(flist, rtxp->rtx_block);
2935 		break;
2936 	case ALLOW_NEW_SOURCES:
2937 	case CHANGE_TO_INCLUDE:
2938 		rtxp->rtx_fmode_cnt =
2939 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2940 		CLEAR_SLIST(rtxp->rtx_block);
2941 		COPY_SLIST(flist, rtxp->rtx_allow);
2942 		break;
2943 	}
2944 }
2945 
2946 /*
2947  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2948  * RFC 3376 section 5.1, covers three cases:
2949  *	* The current state change is a filter mode change
2950  *		Set filter mode retransmit counter; set retransmit allow or
2951  *		block list to new source list as appropriate, and clear the
2952  *		retransmit list that was not set; send TO_IN or TO_EX with
2953  *		new source list.
2954  *	* The current state change is a source list change, but the filter
2955  *	  mode retransmit counter is > 0
2956  *		Decrement filter mode retransmit counter; set retransmit
2957  *		allow or block list to  new source list as appropriate,
2958  *		and clear the retransmit list that was not set; send TO_IN
2959  *		or TO_EX with new source list.
2960  *	* The current state change is a source list change, and the filter
2961  *	  mode retransmit counter is 0.
2962  *		Merge existing rtx allow and block lists with new state:
2963  *		  rtx_allow = (new allow + rtx_allow) - new block
2964  *		  rtx_block = (new block + rtx_block) - new allow
2965  *		Send ALLOW and BLOCK records for new retransmit lists;
2966  *		decrement retransmit counter.
2967  *
2968  * As is the case for mcast_init_rtx(), memory allocation failures are
2969  * acceptable; we just create as much state as we can.
2970  */
2971 static mrec_t *
2972 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2973 {
2974 	ill_t *ill;
2975 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2976 	mcast_record_t txtype;
2977 	mrec_t *rp, *rpnext, *rtnmrec;
2978 	boolean_t ovf;
2979 
2980 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
2981 
2982 	if (mreclist == NULL)
2983 		return (mreclist);
2984 
2985 	/*
2986 	 * A filter mode change is indicated by a single mrec, which is
2987 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2988 	 * retransmit state as if this were an initial join.  There is
2989 	 * no change to the mrec list.
2990 	 */
2991 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2992 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2993 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2994 		    &mreclist->mrec_srcs);
2995 		return (mreclist);
2996 	}
2997 
2998 	/*
2999 	 * Only the source list has changed
3000 	 */
3001 	rtxp->rtx_cnt = ill->ill_mcast_rv;
3002 	if (rtxp->rtx_fmode_cnt > 0) {
3003 		/* but we're still sending filter mode change reports */
3004 		rtxp->rtx_fmode_cnt--;
3005 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3006 			CLEAR_SLIST(rtxp->rtx_block);
3007 			COPY_SLIST(flist, rtxp->rtx_allow);
3008 			txtype = CHANGE_TO_INCLUDE;
3009 		} else {
3010 			CLEAR_SLIST(rtxp->rtx_allow);
3011 			COPY_SLIST(flist, rtxp->rtx_block);
3012 			txtype = CHANGE_TO_EXCLUDE;
3013 		}
3014 		/* overwrite first mrec with new info */
3015 		mreclist->mrec_type = txtype;
3016 		l_copy(flist, &mreclist->mrec_srcs);
3017 		/* then free any remaining mrecs */
3018 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3019 			rpnext = rp->mrec_next;
3020 			mi_free(rp);
3021 		}
3022 		mreclist->mrec_next = NULL;
3023 		rtnmrec = mreclist;
3024 	} else {
3025 		mrec_t *allow_mrec, *block_mrec;
3026 		/*
3027 		 * Just send the source change reports; but we need to
3028 		 * recalculate the ALLOW and BLOCK lists based on previous
3029 		 * state and new changes.
3030 		 */
3031 		rtnmrec = mreclist;
3032 		allow_mrec = block_mrec = NULL;
3033 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3034 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3035 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3036 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3037 				allow_mrec = rp;
3038 			else
3039 				block_mrec = rp;
3040 		}
3041 		/*
3042 		 * Perform calculations:
3043 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3044 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3045 		 *
3046 		 * Each calc requires two steps, for example:
3047 		 *   rtx_allow = rtx_allow - mrec_block;
3048 		 *   new_allow = mrec_allow + rtx_allow;
3049 		 *
3050 		 * Store results in mrec lists, and then copy into rtx lists.
3051 		 * We do it in this order in case the rtx list hasn't been
3052 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3053 		 * Overflows are also okay.
3054 		 */
3055 		if (block_mrec != NULL) {
3056 			l_difference_in_a(rtxp->rtx_allow,
3057 			    &block_mrec->mrec_srcs);
3058 		}
3059 		if (allow_mrec != NULL) {
3060 			l_difference_in_a(rtxp->rtx_block,
3061 			    &allow_mrec->mrec_srcs);
3062 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3063 			    &ovf);
3064 		}
3065 		if (block_mrec != NULL) {
3066 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3067 			    &ovf);
3068 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3069 		} else {
3070 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3071 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3072 		}
3073 		if (allow_mrec != NULL) {
3074 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3075 		} else {
3076 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3077 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3078 		}
3079 	}
3080 
3081 	return (rtnmrec);
3082 }
3083