xref: /titanic_41/usr/src/uts/common/inet/ip/igmp.c (revision 672986541be54a7a471bb088e60780c37e371d7e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Internet Group Management Protocol (IGMP) routines.
31  * Multicast Listener Discovery Protocol (MLD) routines.
32  *
33  * Written by Steve Deering, Stanford, May 1988.
34  * Modified by Rosen Sharma, Stanford, Aug 1994.
35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
36  *
37  * MULTICAST 3.5.1.1
38  */
39 
40 #include <sys/types.h>
41 #include <sys/stream.h>
42 #include <sys/stropts.h>
43 #include <sys/strlog.h>
44 #include <sys/strsun.h>
45 #include <sys/systm.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/atomic.h>
50 #include <sys/zone.h>
51 
52 #include <sys/param.h>
53 #include <sys/socket.h>
54 #include <inet/ipclassifier.h>
55 #include <net/if.h>
56 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 
62 #include <inet/common.h>
63 #include <inet/mi.h>
64 #include <inet/nd.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_if.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 
75 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
76 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
77 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
78 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
79 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
80 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
81 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
82 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
83 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
84 		    slist_t *srclist, mrec_t *next);
85 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
86 		    mcast_record_t rtype, slist_t *flist);
87 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
88 
89 
90 /*
91  * Macros used to do timer len conversions.  Timer values are always
92  * stored and passed to the timer functions as milliseconds; but the
93  * default values and values from the wire may not be.
94  *
95  * And yes, it's obscure, but decisecond is easier to abbreviate than
96  * "tenths of a second".
97  */
98 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
99 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
100 
/*
 * igmp_start_timers:
 * Arrange for the IGMP report timer to fire no later than 'next'
 * milliseconds from now.  The first multicast join will trigger the
 * igmp timers / mld timers.  The unit for next is milliseconds.
 */
void
igmp_start_timers(unsigned next, ip_stack_t *ipst)
{
	int	time_left;
	int	ret;

	/* Callers must request a real, finite delay. */
	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&ipst->ips_igmp_timer_lock);

	if (ipst->ips_igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		ipst->ips_igmp_time_to_next =
		    MIN(ipst->ips_igmp_time_to_next, next);
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	} else {
		ipst->ips_igmp_timer_setter_active = B_TRUE;
	}
	if (ipst->ips_igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		ipst->ips_igmp_time_to_next = next;
		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
		ipst->ips_igmp_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = ipst->ips_igmp_timer_fired_last +
	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer already fires soon enough; nothing to do. */
		ipst->ips_igmp_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(): the timeout handler may be
	 * running right now and blocked on this lock, so holding it here
	 * would deadlock.
	 */
	mutex_exit(&ipst->ips_igmp_timer_lock);
	ret = untimeout(ipst->ips_igmp_timeout_id);
	mutex_enter(&ipst->ips_igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* The handler ran to completion and cleared the id itself. */
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		/* We cancelled the pending timeout; clear the stale id. */
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	if (ipst->ips_igmp_time_to_next != 0) {
		ipst->ips_igmp_time_to_next =
		    MIN(ipst->ips_igmp_time_to_next, next);
		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
	}
	ipst->ips_igmp_timer_setter_active = B_FALSE;
	mutex_exit(&ipst->ips_igmp_timer_lock);
}
183 
/*
 * mld_start_timers:
 * Arrange for the MLD report timer to fire no later than 'next'
 * milliseconds from now.  Mirrors igmp_start_timers() for IPv6.
 * The unit for next is milliseconds.
 */
void
mld_start_timers(unsigned next, ip_stack_t *ipst)
{
	int	time_left;
	int	ret;

	/* Callers must request a real, finite delay. */
	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&ipst->ips_mld_timer_lock);
	if (ipst->ips_mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		ipst->ips_mld_time_to_next =
		    MIN(ipst->ips_mld_time_to_next, next);
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	} else {
		ipst->ips_mld_timer_setter_active = B_TRUE;
	}
	if (ipst->ips_mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		ipst->ips_mld_time_to_next = next;
		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
		ipst->ips_mld_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = ipst->ips_mld_timer_fired_last +
	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* Existing timer already fires soon enough; nothing to do. */
		ipst->ips_mld_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	}

	/*
	 * Drop the lock across untimeout(): the timeout handler may be
	 * running right now and blocked on this lock, so holding it here
	 * would deadlock.
	 */
	mutex_exit(&ipst->ips_mld_timer_lock);
	ret = untimeout(ipst->ips_mld_timeout_id);
	mutex_enter(&ipst->ips_mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/* The handler ran to completion and cleared the id itself. */
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		/* We cancelled the pending timeout; clear the stale id. */
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	if (ipst->ips_mld_time_to_next != 0) {
		ipst->ips_mld_time_to_next =
		    MIN(ipst->ips_mld_time_to_next, next);
		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
	}
	ipst->ips_mld_timer_setter_active = B_FALSE;
	mutex_exit(&ipst->ips_mld_timer_lock);
}
265 
/*
 * igmp_input:
 * Validate and process one received IGMP packet on 'ill'.
 * Return NULL for a bad packet that is discarded here.
 * Return mp if the message is OK and should be handed to "raw" receivers.
 * Callers of igmp_input() may need to reinitialize variables that were copied
 * from the mblk as this calls pullupmsg().
 */
/* ARGSUSED */
mblk_t *
igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	igmpa_t 	*igmpa;
	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
	int		iphlen, igmplen, mblklen;
	ilm_t 		*ilm;
	uint32_t	src, dst;
	uint32_t 	group;
	uint_t		next;
	ipif_t 		*ipif;
	ip_stack_t	 *ipst;

	ASSERT(ill != NULL);
	ASSERT(!ill->ill_isv6);
	ipst = ill->ill_ipst;
	++ipst->ips_igmpstat.igps_rcv_total;

	/* The first mblk must hold at least the full IP header. */
	mblklen = MBLKL(mp);
	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
		++ipst->ips_igmpstat.igps_rcv_tooshort;
		goto bad_pkt;
	}
	igmplen = ntohs(ipha->ipha_length) - iphlen;
	/*
	 * Since msg sizes are more variable with v3, just pullup the
	 * whole thing now.
	 */
	if (MBLKL(mp) < (igmplen + iphlen)) {
		mblk_t *mp1;
		if ((mp1 = msgpullup(mp, -1)) == NULL) {
			++ipst->ips_igmpstat.igps_rcv_tooshort;
			goto bad_pkt;
		}
		freemsg(mp);
		mp = mp1;
		/* mp was replaced; recompute the header pointer. */
		ipha = (ipha_t *)(mp->b_rptr);
	}

	/*
	 * Validate lengths
	 */
	if (igmplen < IGMP_MINLEN) {
		++ipst->ips_igmpstat.igps_rcv_tooshort;
		goto bad_pkt;
	}
	/*
	 * Validate checksum
	 */
	if (IP_CSUM(mp, iphlen, 0)) {
		++ipst->ips_igmpstat.igps_rcv_badsum;
		goto bad_pkt;
	}

	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
	src = ipha->ipha_src;
	dst = ipha->ipha_dst;
	if (ip_debug > 1)
		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
		    (int)ntohl(src), (int)ntohl(dst),
		    ill->ill_name);

	switch (igmpa->igmpa_type) {
	case IGMP_MEMBERSHIP_QUERY:
		/*
		 * packet length differentiates between v1/v2 and v3
		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
		 */
		if (igmplen == IGMP_MINLEN) {
			next = igmp_query_in(ipha, igmpa, ill);
		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
			    igmplen);
		} else {
			++ipst->ips_igmpstat.igps_rcv_tooshort;
			goto bad_pkt;
		}
		/* next == 0 means the query was invalid; drop it. */
		if (next == 0)
			goto bad_pkt;

		if (next != INFINITY)
			igmp_start_timers(next, ipst);

		break;

	case IGMP_V1_MEMBERSHIP_REPORT:
	case IGMP_V2_MEMBERSHIP_REPORT:
		/*
		 * For fast leave to work, we have to know that we are the
		 * last person to send a report for this group. Reports
		 * generated by us are looped back since we could potentially
		 * be a multicast router, so discard reports sourced by me.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_lcl_addr == src) {
				if (ip_debug > 1) {
					(void) mi_strlog(ill->ill_rq,
					    1,
					    SL_TRACE,
					    "igmp_input: we are only "
					    "member src 0x%x ipif_local 0x%x",
					    (int)ntohl(src),
					    (int)
					    ntohl(ipif->ipif_lcl_addr));
				}
				mutex_exit(&ill->ill_lock);
				return (mp);
			}
		}
		mutex_exit(&ill->ill_lock);

		++ipst->ips_igmpstat.igps_rcv_reports;
		group = igmpa->igmpa_group;
		/* Reported group must be a class D (multicast) address. */
		if (!CLASSD(group)) {
			++ipst->ips_igmpstat.igps_rcv_badreports;
			goto bad_pkt;
		}

		/*
		 * KLUDGE: if the IP source address of the report has an
		 * unspecified (i.e., zero) subnet number, as is allowed for
		 * a booting host, replace it with the correct subnet number
		 * so that a process-level multicast routing demon can
		 * determine which subnet it arrived from.  This is necessary
		 * to compensate for the lack of any way for a process to
		 * determine the arrival interface of an incoming packet.
		 *
		 * Requires that a copy of *this* message it passed up
		 * to the raw interface which is done by our caller.
		 */
		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
			/* Pick the first ipif on this ill */
			mutex_enter(&ill->ill_lock);
			src = ill->ill_ipif->ipif_subnet;
			mutex_exit(&ill->ill_lock);
			ip1dbg(("igmp_input: changed src to 0x%x\n",
			    (int)ntohl(src)));
			ipha->ipha_src = src;
		}

		/*
		 * If we belong to the group being reported, and
		 * we are a 'Delaying member' in the RFC terminology,
		 * stop our timer for that group and 'clear flag' i.e.
		 * mark as IGMP_OTHERMEMBER. Do this for all logical
		 * interfaces on the given physical interface.
		 */
		mutex_enter(&ill->ill_lock);
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			ilm = ilm_lookup_ipif(ipif, group);
			if (ilm != NULL) {
				++ipst->ips_igmpstat.igps_rcv_ourreports;
				/* INFINITY == timer not running */
				ilm->ilm_timer = INFINITY;
				ilm->ilm_state = IGMP_OTHERMEMBER;
			}
		} /* for */
		mutex_exit(&ill->ill_lock);
		break;

	case IGMP_V3_MEMBERSHIP_REPORT:
		/*
		 * Currently nothing to do here; IGMP router is not
		 * implemented in ip, and v3 hosts don't pay attention
		 * to membership reports.
		 */
		break;
	}
	/*
	 * Pass all valid IGMP packets up to any process(es) listening
	 * on a raw IGMP socket. Do not free the packet.
	 */
	return (mp);

bad_pkt:
	freemsg(mp);
	return (NULL);
}
455 
456 static uint_t
457 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
458 {
459 	ilm_t	*ilm;
460 	int	timer;
461 	uint_t	next;
462 	ip_stack_t	 *ipst;
463 
464 	ipst = ill->ill_ipst;
465 	++ipst->ips_igmpstat.igps_rcv_queries;
466 
467 	/*
468 	 * In the IGMPv2 specification, there are 3 states and a flag.
469 	 *
470 	 * In Non-Member state, we simply don't have a membership record.
471 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
472 	 * < INFINITY).  In Idle Member state, our timer is not running
473 	 * (ilm->ilm_timer == INFINITY).
474 	 *
475 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
476 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
477 	 * if I sent the last report.
478 	 */
479 	if (igmpa->igmpa_code == 0) {
480 		/*
481 		 * Query from an old router.
482 		 * Remember that the querier on this interface is old,
483 		 * and set the timer to the value in RFC 1112.
484 		 */
485 
486 
487 		mutex_enter(&ill->ill_lock);
488 		ill->ill_mcast_v1_time = 0;
489 		ill->ill_mcast_v1_tset = 1;
490 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
491 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
492 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
493 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
494 			ill->ill_mcast_type = IGMP_V1_ROUTER;
495 		}
496 		mutex_exit(&ill->ill_lock);
497 
498 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
499 
500 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
501 		    igmpa->igmpa_group != 0) {
502 			++ipst->ips_igmpstat.igps_rcv_badqueries;
503 			return (0);
504 		}
505 
506 	} else {
507 		in_addr_t group;
508 
509 		/*
510 		 * Query from a new router
511 		 * Simply do a validity check
512 		 */
513 		group = igmpa->igmpa_group;
514 		if (group != 0 && (!CLASSD(group))) {
515 			++ipst->ips_igmpstat.igps_rcv_badqueries;
516 			return (0);
517 		}
518 
519 		/*
520 		 * Switch interface state to v2 on receipt of a v2 query
521 		 * ONLY IF current state is v3.  Let things be if current
522 		 * state if v1 but do reset the v2-querier-present timer.
523 		 */
524 		mutex_enter(&ill->ill_lock);
525 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
526 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
527 			    "to IGMP_V2_ROUTER", ill->ill_name));
528 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
529 			ill->ill_mcast_type = IGMP_V2_ROUTER;
530 		}
531 		ill->ill_mcast_v2_time = 0;
532 		ill->ill_mcast_v2_tset = 1;
533 		mutex_exit(&ill->ill_lock);
534 
535 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
536 	}
537 
538 	if (ip_debug > 1) {
539 		mutex_enter(&ill->ill_lock);
540 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
541 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
542 		    (int)ntohs(igmpa->igmpa_code),
543 		    (int)ntohs(igmpa->igmpa_type));
544 		mutex_exit(&ill->ill_lock);
545 	}
546 
547 	/*
548 	 * -Start the timers in all of our membership records
549 	 *  for the physical interface on which the query
550 	 *  arrived, excluding those that belong to the "all
551 	 *  hosts" group (224.0.0.1).
552 	 *
553 	 * -Restart any timer that is already running but has
554 	 *  a value longer than the requested timeout.
555 	 *
556 	 * -Use the value specified in the query message as
557 	 *  the maximum timeout.
558 	 */
559 	next = (unsigned)INFINITY;
560 	mutex_enter(&ill->ill_lock);
561 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
562 
563 		/*
564 		 * A multicast router joins INADDR_ANY address
565 		 * to enable promiscuous reception of all
566 		 * mcasts from the interface. This INADDR_ANY
567 		 * is stored in the ilm_v6addr as V6 unspec addr
568 		 */
569 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
570 			continue;
571 		if (ilm->ilm_addr == htonl(INADDR_ANY))
572 			continue;
573 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
574 		    (igmpa->igmpa_group == 0) ||
575 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
576 			if (ilm->ilm_timer > timer) {
577 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
578 				if (ilm->ilm_timer < next)
579 					next = ilm->ilm_timer;
580 			}
581 		}
582 	}
583 	mutex_exit(&ill->ill_lock);
584 
585 	return (next);
586 }
587 
/*
 * igmpv3_query_in:
 * Process a received IGMPv3 membership query on 'ill'.  Decodes the
 * max-response-code, robustness-variable and QQIC fields, then schedules
 * responses per RFC 3376 section 5.2 (general vs. group/source-specific).
 *
 * Returns the soonest scheduled response time (in ms), INFINITY if no
 * timer needs (re)starting, or 0 if the packet was too short.
 */
static uint_t
igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
{
	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
	ilm_t		*ilm;
	ipaddr_t	*src_array;
	uint8_t		qrv;
	ip_stack_t	 *ipst;

	ipst = ill->ill_ipst;
	/* make sure numsrc matches packet size */
	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
		++ipst->ips_igmpstat.igps_rcv_tooshort;
		return (0);
	}
	/* Source addresses immediately follow the fixed query header. */
	src_array = (ipaddr_t *)&igmp3qa[1];

	++ipst->ips_igmpstat.igps_rcv_queries;

	/*
	 * Max response code: values >= IGMP_V3_MAXRT_FPMIN use the
	 * mantissa/exponent floating-point encoding of RFC 3376 4.1.1.
	 */
	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
		mrd = (mant | 0x10) << (exp + 3);
	}
	if (mrd == 0)
		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
	/* mrd is in tenths of a second; pick a random delay within it. */
	timer = DSEC_TO_MSEC(mrd);
	MCAST_RANDOM_DELAY(delay, timer);
	next = (unsigned)INFINITY;

	/* Querier's robustness variable; 0 means "use the default". */
	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/*
	 * Querier's query interval code, same floating-point encoding
	 * as the max response code above.
	 */
	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (RFC3376 section 5.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending upon query type:
	 * general, group specific, or group/source specific.
	 */
	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time]).
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/* Skip v6, wildcard, all-hosts, and non-matching. */
			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
				continue;
			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
group_query:
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all.  Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				pktl->sl_numsrc = numsrc;
				/* Merge the query's sources into pendsrcs. */
				for (i = 0; i < numsrc; i++)
					IN6_IPADDR_TO_V4MAPPED(src_array[i],
					    &(pktl->sl_addr[i]));
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* choose soonest timer */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
721 
/*
 * igmp_joingroup:
 * Called when we join a new IPv4 multicast group (ilm just created).
 * Send the initial unsolicited report appropriate to the querier version
 * on the interface, and set up the ilm's report/retransmit timer.  The
 * actual timer start is deferred to ipsq_exit() to avoid deadlock with
 * untimeout() (see comment below).
 */
void
igmp_joingroup(ilm_t *ilm)
{
	ill_t	*ill;
	ip_stack_t	*ipst = ilm->ilm_ipst;

	ill = ilm->ilm_ipif->ipif_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/* v4 ilms hang off an ipif, not the ill itself. */
	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);

	mutex_enter(&ill->ill_lock);
	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
		/* Never report membership in the all-hosts group. */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		ip1dbg(("Querier mode %d, sending report, group %x\n",
		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
		/*
		 * ill_lock is dropped around each send since the send
		 * path must not be entered with it held.
		 */
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *   Old State	New State	Report
			 *
			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough.
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ilm->ilm_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v3 and older versions.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call igmp_start_timers from
		 * here. igmp_start_timers needs to call untimeout, and we
		 * can't hold the ipsq across untimeout since
		 * igmp_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit.
		 */
		mutex_enter(&ipst->ips_igmp_timer_lock);
		ipst->ips_igmp_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    ipst->ips_igmp_deferred_next);
		mutex_exit(&ipst->ips_igmp_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
		    "igmp_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
804 
/*
 * mld_joingroup:
 * IPv6 counterpart of igmp_joingroup(): send the initial unsolicited
 * listener report for a newly-joined group and set up the ilm's
 * report/retransmit timer.  Timer start is deferred to ipsq_exit()
 * (see comment below).
 */
void
mld_joingroup(ilm_t *ilm)
{
	ill_t	*ill;
	ip_stack_t	*ipst = ilm->ilm_ipst;

	ill = ilm->ilm_ill;

	ASSERT(IAM_WRITER_ILL(ill));
	/* v6 ilms hang off the ill itself, not an ipif. */
	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);

	mutex_enter(&ill->ill_lock);
	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
		/* Never report membership in the all-hosts group. */
		ilm->ilm_rtx.rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_OTHERMEMBER;
		mutex_exit(&ill->ill_lock);
	} else {
		/*
		 * ill_lock is dropped around each send since the send
		 * path must not be entered with it held.
		 */
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			mrec_t *rp;
			mcast_record_t rtype;
			/*
			 * The possible state changes we need to handle here:
			 *	Old State   New State	Report
			 *
			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
			 *
			 * No need to send the BLOCK(0) report; ALLOW(X)
			 * is enough
			 */
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, NULL);
			mutex_exit(&ill->ill_lock);
			mldv2_sendrpt(ill, rp);
			mutex_enter(&ill->ill_lock);
			/*
			 * Set up retransmission state.  Timer is set below,
			 * for both v2 and v1.
			 */
			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
			    ilm->ilm_filter);
		}

		/* Set the ilm timer value */
		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
		    ilm->ilm_rtx.rtx_cnt > 0);
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		mutex_exit(&ill->ill_lock);

		/*
		 * To avoid deadlock, we don't call mld_start_timers from
		 * here. mld_start_timers needs to call untimeout, and we
		 * can't hold the ipsq (i.e. the lock) across untimeout
		 * since mld_timeout_handler could be blocking trying to
		 * acquire the ipsq. Instead we start the timer after we get
		 * out of the ipsq in ipsq_exit
		 */
		mutex_enter(&ipst->ips_mld_timer_lock);
		ipst->ips_mld_deferred_next = MIN(ilm->ilm_rtx.rtx_timer,
		    ipst->ips_mld_deferred_next);
		mutex_exit(&ipst->ips_mld_timer_lock);
	}

	if (ip_debug > 1) {
		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
		    "mld_joingroup: multicast_type %d timer %d",
		    (ilm->ilm_ill->ill_mcast_type),
		    (int)ntohl(ilm->ilm_rtx.rtx_timer));
	}
}
883 
884 void
885 igmp_leavegroup(ilm_t *ilm)
886 {
887 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
888 
889 	ASSERT(ilm->ilm_ill == NULL);
890 	ASSERT(!ill->ill_isv6);
891 
892 	mutex_enter(&ill->ill_lock);
893 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
894 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
895 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
896 		mutex_exit(&ill->ill_lock);
897 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
898 		    (htonl(INADDR_ALLRTRS_GROUP)));
899 		return;
900 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
901 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
902 		mrec_t *rp;
903 		/*
904 		 * The possible state changes we need to handle here:
905 		 *	Old State	New State	Report
906 		 *
907 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
908 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
909 		 *
910 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
911 		 */
912 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
913 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
914 			    ilm->ilm_filter, NULL);
915 		} else {
916 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
917 			    NULL, NULL);
918 		}
919 		mutex_exit(&ill->ill_lock);
920 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
921 		return;
922 	}
923 	mutex_exit(&ill->ill_lock);
924 }
925 
926 void
927 mld_leavegroup(ilm_t *ilm)
928 {
929 	ill_t *ill = ilm->ilm_ill;
930 
931 	ASSERT(ilm->ilm_ipif == NULL);
932 	ASSERT(ill->ill_isv6);
933 
934 	mutex_enter(&ill->ill_lock);
935 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
936 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
937 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
938 		mutex_exit(&ill->ill_lock);
939 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
940 		return;
941 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
942 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
943 		mrec_t *rp;
944 		/*
945 		 * The possible state changes we need to handle here:
946 		 *	Old State	New State	Report
947 		 *
948 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
949 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
950 		 *
951 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
952 		 */
953 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
954 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
955 			    ilm->ilm_filter, NULL);
956 		} else {
957 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
958 			    NULL, NULL);
959 		}
960 		mutex_exit(&ill->ill_lock);
961 		mldv2_sendrpt(ill, rp);
962 		return;
963 	}
964 	mutex_exit(&ill->ill_lock);
965 }
966 
967 void
968 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
969 {
970 	ill_t *ill;
971 	mrec_t *rp;
972 	ip_stack_t	*ipst = ilm->ilm_ipst;
973 
974 	ASSERT(ilm != NULL);
975 
976 	/* state change reports should only be sent if the router is v3 */
977 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
978 		return;
979 
980 	if (ilm->ilm_ill == NULL) {
981 		ASSERT(ilm->ilm_ipif != NULL);
982 		ill = ilm->ilm_ipif->ipif_ill;
983 	} else {
984 		ill = ilm->ilm_ill;
985 	}
986 
987 	mutex_enter(&ill->ill_lock);
988 
989 	/*
990 	 * Compare existing(old) state with the new state and prepare
991 	 * State Change Report, according to the rules in RFC 3376:
992 	 *
993 	 *	Old State	New State	State Change Report
994 	 *
995 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
996 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
997 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
998 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
999 	 */
1000 
1001 	if (ilm->ilm_fmode == fmode) {
1002 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1003 		slist_t *allow, *block;
1004 		if (((a_minus_b = l_alloc()) == NULL) ||
1005 		    ((b_minus_a = l_alloc()) == NULL)) {
1006 			l_free(a_minus_b);
1007 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1008 				goto send_to_ex;
1009 			else
1010 				goto send_to_in;
1011 		}
1012 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1013 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1014 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1015 			allow = b_minus_a;
1016 			block = a_minus_b;
1017 		} else {
1018 			allow = a_minus_b;
1019 			block = b_minus_a;
1020 		}
1021 		rp = NULL;
1022 		if (!SLIST_IS_EMPTY(allow))
1023 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1024 			    allow, rp);
1025 		if (!SLIST_IS_EMPTY(block))
1026 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1027 			    block, rp);
1028 		l_free(a_minus_b);
1029 		l_free(b_minus_a);
1030 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1031 send_to_ex:
1032 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1033 		    NULL);
1034 	} else {
1035 send_to_in:
1036 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1037 		    NULL);
1038 	}
1039 
1040 	/*
1041 	 * Need to set up retransmission state; merge the new info with the
1042 	 * current state (which may be null).  If the timer is not currently
1043 	 * running, start it (need to do a delayed start of the timer as
1044 	 * we're currently in the sq).
1045 	 */
1046 	rp = mcast_merge_rtx(ilm, rp, flist);
1047 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1048 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1049 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1050 		mutex_enter(&ipst->ips_igmp_timer_lock);
1051 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1052 		    ilm->ilm_rtx.rtx_timer);
1053 		mutex_exit(&ipst->ips_igmp_timer_lock);
1054 	}
1055 
1056 	mutex_exit(&ill->ill_lock);
1057 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1058 }
1059 
/*
 * Send an MLDv2 State Change Report for the group described by `ilm',
 * covering the transition from the ilm's current filter state to the
 * new (fmode, flist) state, and set up the retransmission state called
 * for by RFC 3810.  No-op unless an MLDv2-capable router is present.
 */
void
mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
{
	ill_t *ill;
	mrec_t *rp = NULL;
	ip_stack_t	*ipst = ilm->ilm_ipst;

	ASSERT(ilm != NULL);

	ill = ilm->ilm_ill;

	/* only need to send if we have an mldv2-capable router */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	/*
	 * Compare existing (old) state with the new state passed in
	 * and send appropriate MLDv2 State Change Report.
	 *
	 *	Old State	New State	State Change Report
	 *
	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
	 */
	if (ilm->ilm_fmode == fmode) {
		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
		slist_t *allow, *block;
		/*
		 * Filter mode is unchanged, so report the source-list
		 * deltas.  If the scratch lists can't be allocated,
		 * fall back to a filter-mode change record, which is
		 * larger but still correct.
		 */
		if (((a_minus_b = l_alloc()) == NULL) ||
		    ((b_minus_a = l_alloc()) == NULL)) {
			l_free(a_minus_b);
			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
				goto send_to_ex;
			else
				goto send_to_in;
		}
		l_difference(ilm->ilm_filter, flist, a_minus_b);
		l_difference(flist, ilm->ilm_filter, b_minus_a);
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			allow = b_minus_a;
			block = a_minus_b;
		} else {
			allow = a_minus_b;
			block = b_minus_a;
		}
		if (!SLIST_IS_EMPTY(allow))
			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
			    allow, rp);
		if (!SLIST_IS_EMPTY(block))
			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
			    block, rp);
		l_free(a_minus_b);
		l_free(b_minus_a);
	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
send_to_ex:
		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	} else {
send_to_in:
		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
		    NULL);
	}

	/*
	 * Need to set up retransmission state; merge the new info with the
	 * current state (which may be null).  If the timer is not currently
	 * running, start it (need to do a deferred start of the timer as
	 * we're currently in the sq).
	 */
	rp = mcast_merge_rtx(ilm, rp, flist);
	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		mutex_enter(&ipst->ips_mld_timer_lock);
		ipst->ips_mld_deferred_next =
		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
		mutex_exit(&ipst->ips_mld_timer_lock);
	}

	mutex_exit(&ill->ill_lock);
	mldv2_sendrpt(ill, rp);
}
1147 
/*
 * Process any pending IGMP timers on `ill': the per-ill general-query
 * response timer, the per-ilm group/source timers, and the per-ilm
 * retransmit timers.  `elapsed' is the number of ticks since the last
 * run.  Returns the number of ticks until the earliest remaining timer
 * on this ill, or INFINITY if none are pending.  Must be called as the
 * exclusive (ipsq writer) thread for the ill.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	uint_t	next = INFINITY;
	ilm_t	*ilm;
	ipif_t	*ipif;
	mrec_t	*rp = NULL;
	mrec_t	*rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			/* drop ill_lock across the send */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	/* Next, handle the group-specific (and source-specific) timers. */
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			/* not yet due; just age it and track the minimum */
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d elap %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			/* drop ill_lock across the send */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			rp = NULL;
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions to come; reschedule */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		/* drop ill_lock across the send */
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1359 
1360 /*
1361  * igmp_timeout_handler:
 * Called when there are timeout events, every next * TIMEOUT_INTERVAL (tick).
1363  * Returns number of ticks to next event (or 0 if none).
1364  *
1365  * As part of multicast join and leave igmp we may need to send out an
1366  * igmp request. The igmp related state variables in the ilm are protected
1367  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1368  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1369  * starts the igmp timer if needed. It serializes multiple threads trying to
1370  * simultaneously start the timer using the igmp_timer_setter_active flag.
1371  *
1372  * igmp_input() receives igmp queries and responds to the queries
1373  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
 * Later the igmp_timer fires, the timeout handler igmp_timeout_handler()
1375  * performs the action exclusively after entering each ill's ipsq as writer.
1376  * The actual igmp timeout handler needs to run in the ipsq since it has to
1377  * access the ilm's and we don't want another exclusive operation like
1378  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1379  * another.
1380  *
1381  * The igmp_slowtimeo() function is called thru another timer.
1382  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1383  */
void
igmp_timeout_handler(void *arg)
{
	ill_t	*ill;
	int	elapsed;	/* Since last call */
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;
	ip_stack_t *ipst = (ip_stack_t *)arg;

	ASSERT(arg != NULL);
	/* Consume the accumulated elapsed time under the timer lock. */
	mutex_enter(&ipst->ips_igmp_timer_lock);
	ASSERT(ipst->ips_igmp_timeout_id != 0);
	ipst->ips_igmp_timer_fired_last = ddi_get_lbolt();
	elapsed = ipst->ips_igmp_time_to_next;
	ipst->ips_igmp_time_to_next = 0;
	mutex_exit(&ipst->ips_igmp_timer_lock);

	/* Walk every v4 ill, handling its timers as the ipsq writer. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(!ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		rw_exit(&ipst->ips_ill_g_lock);
		success = ipsq_enter(ill, B_TRUE);
		if (success) {
			next = igmp_timeout_handler_per_ill(ill, elapsed);
			if (next < global_next)
				global_next = next;
			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
			    B_TRUE);
		}
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* This firing has been fully handled; clear the timeout id. */
	mutex_enter(&ipst->ips_igmp_timer_lock);
	ASSERT(ipst->ips_igmp_timeout_id != 0);
	ipst->ips_igmp_timeout_id = 0;
	mutex_exit(&ipst->ips_igmp_timer_lock);

	/* Re-arm for the earliest pending per-ill event, if any. */
	if (global_next != INFINITY)
		igmp_start_timers(global_next, ipst);
}
1436 
1437 /*
1438  * mld_timeout_handler:
1439  * Called when there are timeout events, every next (tick).
1440  * Returns number of ticks to next event (or 0 if none).
1441  */
/* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill, int elapsed)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= elapsed) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/* drop ill_lock across the send */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		ill->ill_global_timer -= elapsed;
		if (ill->ill_global_timer < next)
			next = ill->ill_global_timer;
	}

per_ilm_timer:
	/* Next, handle the group-specific (and source-specific) timers. */
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > elapsed) {
			/* not yet due; just age it and track the minimum */
			ilm->ilm_timer -= elapsed;
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d elap %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer), elapsed,
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

		if (ip_debug > 1) {
			(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
			    "igmp_timo_hlr 1: ilm_timr %d elap %d "
			    "typ %d nxt %d",
			    (int)ntohl(ilm->ilm_timer), elapsed,
			    (ill->ill_mcast_type), next);
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > elapsed) {
			rtxp->rtx_timer -= elapsed;
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/* more retransmissions to come; reschedule */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
		} else {
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		/* send the accumulated reports with ill_lock dropped */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1630 
1631 /*
1632  * mld_timeout_handler:
 * Called when there are timeout events, every next * TIMEOUT_INTERVAL (tick).
1634  * Returns number of ticks to next event (or 0 if none).
1635  * MT issues are same as igmp_timeout_handler
1636  */
void
mld_timeout_handler(void *arg)
{
	ill_t	*ill;
	int	elapsed;	/* Since last call */
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;
	ip_stack_t *ipst = (ip_stack_t *)arg;

	ASSERT(arg != NULL);
	/* Consume the accumulated elapsed time under the timer lock. */
	mutex_enter(&ipst->ips_mld_timer_lock);
	ASSERT(ipst->ips_mld_timeout_id != 0);
	ipst->ips_mld_timer_fired_last = ddi_get_lbolt();
	elapsed = ipst->ips_mld_time_to_next;
	ipst->ips_mld_time_to_next = 0;
	mutex_exit(&ipst->ips_mld_timer_lock);

	/* Walk every v6 ill, handling its timers as the ipsq writer. */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		rw_exit(&ipst->ips_ill_g_lock);
		success = ipsq_enter(ill, B_TRUE);
		if (success) {
			next = mld_timeout_handler_per_ill(ill, elapsed);
			if (next < global_next)
				global_next = next;
			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
			    B_FALSE);
		}
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* This firing has been fully handled; clear the timeout id. */
	mutex_enter(&ipst->ips_mld_timer_lock);
	ASSERT(ipst->ips_mld_timeout_id != 0);
	ipst->ips_mld_timeout_id = 0;
	mutex_exit(&ipst->ips_mld_timer_lock);

	/* Re-arm for the earliest pending per-ill event, if any. */
	if (global_next != INFINITY)
		mld_start_timers(global_next, ipst);
}
1689 
1690 /*
1691  * Calculate the Older Version Querier Present timeout value, in number
1692  * of slowtimo intervals, for the given ill.
1693  */
/*
 * NOTE(review): MCAST_SLOWTIMO_INTERVAL is in milliseconds (see the
 * MSEC_TO_TICK() calls in the slowtimo functions below); this formula
 * therefore assumes ill_mcast_rv * ill_mcast_qi and
 * MCAST_QUERY_RESP_INTERVAL are both in seconds before the 1000x
 * scaling -- confirm units against their definitions.
 */
#define	OVQP(ill) \
	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1697 
1698 /*
1699  * igmp_slowtimo:
 * - Resets to a newer version if we didn't hear from the older-version
 *   router in IGMP_AGE_THRESHOLD seconds.
1702  * - Resets slowtimeout.
1703  */
1704 void
1705 igmp_slowtimo(void *arg)
1706 {
1707 	ill_t	*ill;
1708 	ill_if_t *ifp;
1709 	avl_tree_t *avl_tree;
1710 	ip_stack_t *ipst = (ip_stack_t *)arg;
1711 
1712 	ASSERT(arg != NULL);
1713 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1714 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1715 
1716 	/*
1717 	 * The ill_if_t list is circular, hence the odd loop parameters.
1718 	 *
1719 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1720 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1721 	 * structure (allowing us to skip if none of the instances have timers
1722 	 * running).
1723 	 */
1724 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1725 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1726 	    ifp = ifp->illif_next) {
1727 		/*
1728 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1729 		 * a V1 or V2 query now and we miss seeing the count now,
1730 		 * we will see it the next time igmp_slowtimo is called.
1731 		 */
1732 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1733 			continue;
1734 
1735 		avl_tree = &ifp->illif_avl_by_ppa;
1736 		for (ill = avl_first(avl_tree); ill != NULL;
1737 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1738 			mutex_enter(&ill->ill_lock);
1739 			if (ill->ill_mcast_v1_tset == 1)
1740 				ill->ill_mcast_v1_time++;
1741 			if (ill->ill_mcast_v2_tset == 1)
1742 				ill->ill_mcast_v2_time++;
1743 			if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1744 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1745 					if (ill->ill_mcast_v2_tset > 0) {
1746 						ip1dbg(("V1 query timer "
1747 						    "expired on %s; switching "
1748 						    "mode to IGMP_V2\n",
1749 						    ill->ill_name));
1750 						ill->ill_mcast_type =
1751 						    IGMP_V2_ROUTER;
1752 					} else {
1753 						ip1dbg(("V1 query timer "
1754 						    "expired on %s; switching "
1755 						    "mode to IGMP_V3\n",
1756 						    ill->ill_name));
1757 						ill->ill_mcast_type =
1758 						    IGMP_V3_ROUTER;
1759 					}
1760 					ill->ill_mcast_v1_time = 0;
1761 					ill->ill_mcast_v1_tset = 0;
1762 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1763 				}
1764 			}
1765 			if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1766 				if (ill->ill_mcast_v2_time >= OVQP(ill)) {
1767 					ip1dbg(("V2 query timer expired on "
1768 					    "%s; switching mode to IGMP_V3\n",
1769 					    ill->ill_name));
1770 					ill->ill_mcast_type = IGMP_V3_ROUTER;
1771 					ill->ill_mcast_v2_time = 0;
1772 					ill->ill_mcast_v2_tset = 0;
1773 					atomic_add_16(&ifp->illif_mcast_v2, -1);
1774 				}
1775 			}
1776 			mutex_exit(&ill->ill_lock);
1777 		}
1778 
1779 	}
1780 	rw_exit(&ipst->ips_ill_g_lock);
1781 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1782 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1783 		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1784 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1785 }
1786 
1787 /*
1788  * mld_slowtimo:
1789  * - Resets to newer version if we didn't hear from the older version router
1790  *   in MLD_AGE_THRESHOLD seconds.
1791  * - Restarts slowtimeout.
1792  */
1793 /* ARGSUSED */
1794 void
1795 mld_slowtimo(void *arg)
1796 {
1797 	ill_t *ill;
1798 	ill_if_t *ifp;
1799 	avl_tree_t *avl_tree;
1800 	ip_stack_t *ipst = (ip_stack_t *)arg;
1801 
1802 	ASSERT(arg != NULL);
1803 	/* See comments in igmp_slowtimo() above... */
1804 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1805 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1806 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1807 	    ifp = ifp->illif_next) {
1808 		if (ifp->illif_mcast_v1 == 0)
1809 			continue;
1810 
1811 		avl_tree = &ifp->illif_avl_by_ppa;
1812 		for (ill = avl_first(avl_tree); ill != NULL;
1813 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1814 			mutex_enter(&ill->ill_lock);
1815 			if (ill->ill_mcast_v1_tset == 1)
1816 				ill->ill_mcast_v1_time++;
1817 			if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1818 				if (ill->ill_mcast_v1_time >= OVQP(ill)) {
1819 					ip1dbg(("MLD query timer expired on"
1820 					    " %s; switching mode to MLD_V2\n",
1821 					    ill->ill_name));
1822 					ill->ill_mcast_type = MLD_V2_ROUTER;
1823 					ill->ill_mcast_v1_time = 0;
1824 					ill->ill_mcast_v1_tset = 0;
1825 					atomic_add_16(&ifp->illif_mcast_v1, -1);
1826 				}
1827 			}
1828 			mutex_exit(&ill->ill_lock);
1829 		}
1830 	}
1831 	rw_exit(&ipst->ips_ill_g_lock);
1832 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1833 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1834 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1835 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1836 }
1837 
1838 /*
1839  * igmp_sendpkt:
1840  * This will send to ip_wput like icmp_inbound.
1841  * Note that the lower ill (on which the membership is kept) is used
1842  * as an upper ill to pass in the multicast parameters.
1843  */
1844 static void
1845 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1846 {
1847 	mblk_t	*mp;
1848 	igmpa_t	*igmpa;
1849 	uint8_t *rtralert;
1850 	ipha_t	*ipha;
1851 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1852 	size_t	size  = hdrlen + sizeof (igmpa_t);
1853 	ipif_t 	*ipif = ilm->ilm_ipif;
1854 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1855 	mblk_t	*first_mp;
1856 	ipsec_out_t *io;
1857 	zoneid_t zoneid;
1858 	ip_stack_t *ipst = ill->ill_ipst;
1859 
1860 	/*
1861 	 * We need to make sure this packet goes out on an ipif. If
1862 	 * there is some global policy match in ip_wput_ire, we need
1863 	 * to get to the right interface after IPSEC processing.
1864 	 * To make sure this multicast packet goes out on the right
1865 	 * interface, we attach an ipsec_out and initialize ill_index
1866 	 * like we did in ip_wput. To make sure that this packet does
1867 	 * not get forwarded on other interfaces or looped back, we
1868 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1869 	 * to B_FALSE.
1870 	 *
1871 	 * We also need to make sure that this does not get load balanced
1872 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1873 	 * here. If it gets load balanced, switches supporting igmp snooping
1874 	 * will send the packet that it receives for this multicast group
1875 	 * to the interface that we are sending on. As we have joined the
1876 	 * multicast group on this ill, by sending the packet out on this
1877 	 * ill, we receive all the packets back on this ill.
1878 	 */
1879 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1880 	if (first_mp == NULL)
1881 		return;
1882 
1883 	first_mp->b_datap->db_type = M_CTL;
1884 	first_mp->b_wptr += sizeof (ipsec_info_t);
1885 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1886 	/* ipsec_out_secure is B_FALSE now */
1887 	io = (ipsec_out_t *)first_mp->b_rptr;
1888 	io->ipsec_out_type = IPSEC_OUT;
1889 	io->ipsec_out_len = sizeof (ipsec_out_t);
1890 	io->ipsec_out_use_global_policy = B_TRUE;
1891 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1892 	io->ipsec_out_attach_if = B_TRUE;
1893 	io->ipsec_out_multicast_loop = B_FALSE;
1894 	io->ipsec_out_dontroute = B_TRUE;
1895 	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
1896 		zoneid = GLOBAL_ZONEID;
1897 	io->ipsec_out_zoneid = zoneid;
1898 	io->ipsec_out_ns = ipst->ips_netstack;	/* No netstack_hold */
1899 
1900 	mp = allocb(size, BPRI_HI);
1901 	if (mp == NULL) {
1902 		freemsg(first_mp);
1903 		return;
1904 	}
1905 	mp->b_wptr = mp->b_rptr + size;
1906 	first_mp->b_cont = mp;
1907 
1908 	ipha = (ipha_t *)mp->b_rptr;
1909 	rtralert = (uint8_t *)&(ipha[1]);
1910 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1911 	igmpa->igmpa_type   = type;
1912 	igmpa->igmpa_code   = 0;
1913 	igmpa->igmpa_group  = ilm->ilm_addr;
1914 	igmpa->igmpa_cksum  = 0;
1915 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1916 
1917 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1918 	rtralert[1] = RTRALERT_LEN;
1919 	rtralert[2] = 0;
1920 	rtralert[3] = 0;
1921 
1922 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1923 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1924 	ipha->ipha_type_of_service 	= 0;
1925 	ipha->ipha_length = htons(size);
1926 	ipha->ipha_ident = 0;
1927 	ipha->ipha_fragment_offset_and_flags = 0;
1928 	ipha->ipha_ttl 		= IGMP_TTL;
1929 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1930 	ipha->ipha_hdr_checksum 	= 0;
1931 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1932 	ipha->ipha_src 		= ipif->ipif_src_addr;
1933 	/*
1934 	 * Request loopback of the report if we are acting as a multicast
1935 	 * router, so that the process-level routing demon can hear it.
1936 	 */
1937 	/*
1938 	 * This will run multiple times for the same group if there are members
1939 	 * on the same group for multiple ipif's on the same ill. The
1940 	 * igmp_input code will suppress this due to the loopback thus we
1941 	 * always loopback membership report.
1942 	 */
1943 	ASSERT(ill->ill_rq != NULL);
1944 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1945 
1946 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
1947 
1948 	++ipst->ips_igmpstat.igps_snd_reports;
1949 }
1950 
1951 /*
1952  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1953  * with the passed-in ipif.  The report will contain one group record
1954  * for each element of reclist.  If this causes packet length to
1955  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1956  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1957  * and those buffers are freed here.
1958  */
static void
igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
{
	ipsec_out_t *io;
	igmp3ra_t *igmp3ra;
	grphdra_t *grphdr;
	mblk_t *first_mp, *mp;
	ipha_t *ipha;
	uint8_t *rtralert;
	ipaddr_t *src_array;
	int i, j, numrec, more_src_cnt;
	size_t hdrsize, size, rsize;
	ill_t *ill = ipif->ipif_ill;
	mrec_t *rp, *cur_reclist;
	mrec_t *next_reclist = reclist;
	boolean_t morepkts;
	zoneid_t zoneid;
	ip_stack_t	 *ipst = ill->ill_ipst;

	/* if there aren't any records, there's nothing to send */
	if (reclist == NULL)
		return;

	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
nextpkt:
	/*
	 * Size this packet: walk the record list accumulating per-record
	 * sizes until we run out of records or adding another would
	 * exceed the interface MTU (ill_max_frag).  In the latter case
	 * morepkts is set and next_reclist marks where the next packet's
	 * records begin.
	 */
	size = hdrsize + sizeof (igmp3ra_t);
	morepkts = B_FALSE;
	more_src_cnt = 0;
	cur_reclist = next_reclist;
	numrec = 0;
	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
		rsize = sizeof (grphdra_t) +
		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
		if (size + rsize > ill->ill_max_frag) {
			if (rp == cur_reclist) {
				/*
				 * If the first mrec we looked at is too big
				 * to fit in a single packet (i.e the source
				 * list is too big), we must either truncate
				 * the list (if TO_EX or IS_EX), or send
				 * multiple reports for the same group (all
				 * other types).
				 */
				int srcspace, srcsperpkt;
				srcspace = ill->ill_max_frag - (size +
				    sizeof (grphdra_t));
				srcsperpkt = srcspace / sizeof (ipaddr_t);
				/*
				 * Increment size and numrec, because we will
				 * be sending a record for the mrec we're
				 * looking at now.
				 */
				size += sizeof (grphdra_t) +
				    (srcsperpkt * sizeof (ipaddr_t));
				numrec++;
				if (rp->mrec_type == MODE_IS_EXCLUDE ||
				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
					rp->mrec_srcs.sl_numsrc = srcsperpkt;
					if (rp->mrec_next == NULL) {
						/* no more packets to send */
						break;
					} else {
						/*
						 * more packets, but we're
						 * done with this mrec.
						 */
						next_reclist = rp->mrec_next;
					}
				} else {
					more_src_cnt = rp->mrec_srcs.sl_numsrc
					    - srcsperpkt;
					rp->mrec_srcs.sl_numsrc = srcsperpkt;
					/*
					 * We'll fix up this mrec (remove the
					 * srcs we've already sent) before
					 * returning to nextpkt above.
					 */
					next_reclist = rp;
				}
			} else {
				next_reclist = rp;
			}
			morepkts = B_TRUE;
			break;
		}
		size += rsize;
		numrec++;
	}

	/*
	 * See comments in igmp_sendpkt() about initializing for ipsec and
	 * load balancing requirements.
	 */
	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
	if (first_mp == NULL)
		goto free_reclist;

	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	/* ipsec_out_secure is B_FALSE now */
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	io->ipsec_out_use_global_policy = B_TRUE;
	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_attach_if = B_TRUE;
	io->ipsec_out_multicast_loop = B_FALSE;
	io->ipsec_out_dontroute = B_TRUE;
	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;
	io->ipsec_out_zoneid = zoneid;

	/* Allocate and zero the data mblk; link it behind the M_CTL. */
	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		freemsg(first_mp);
		goto free_reclist;
	}
	bzero((char *)mp->b_rptr, size);
	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
	first_mp->b_cont = mp;

	/*
	 * Packet layout: IPv4 header, 4-byte router alert option,
	 * IGMPv3 report header, then the group records.
	 */
	ipha = (ipha_t *)mp->b_rptr;
	rtralert = (uint8_t *)&(ipha[1]);
	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
	grphdr = (grphdra_t *)&(igmp3ra[1]);

	/* Emit one group record per mrec, each followed by its sources. */
	rp = cur_reclist;
	for (i = 0; i < numrec; i++) {
		grphdr->grphdra_type = rp->mrec_type;
		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
		src_array = (ipaddr_t *)&(grphdr[1]);

		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);

		grphdr = (grphdra_t *)&(src_array[j]);
		rp = rp->mrec_next;
	}

	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
	igmp3ra->igmp3ra_numrec = htons(numrec);
	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);

	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
	rtralert[1] = RTRALERT_LEN;
	rtralert[2] = 0;
	rtralert[3] = 0;

	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
	ipha->ipha_length = htons(size);
	ipha->ipha_ttl = IGMP_TTL;
	ipha->ipha_protocol = IPPROTO_IGMP;
	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
	ipha->ipha_src = ipif->ipif_src_addr;

	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing daemon can hear it.
	 *
	 * This will run multiple times for the same group if there are
	 * members on the same group for multiple ipifs on the same ill.
	 * The igmp_input code will suppress this due to the loopback;
	 * thus we always loopback membership report.
	 */
	ASSERT(ill->ill_rq != NULL);
	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);

	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);

	++ipst->ips_igmpstat.igps_snd_reports;

	if (morepkts) {
		/*
		 * If the record at the head of the next packet was split,
		 * shift its not-yet-sent sources to the front of its
		 * source list before building the next report.
		 */
		if (more_src_cnt > 0) {
			int index, mvsize;
			slist_t *sl = &next_reclist->mrec_srcs;
			index = sl->sl_numsrc;
			mvsize = more_src_cnt * sizeof (in6_addr_t);
			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
			    mvsize);
			sl->sl_numsrc = more_src_cnt;
		}
		goto nextpkt;
	}

free_reclist:
	/* Records were allocated by mcast_bldmrec(); free them all here. */
	while (reclist != NULL) {
		rp = reclist->mrec_next;
		mi_free(reclist);
		reclist = rp;
	}
}
2154 
2155 /*
2156  * mld_input:
2157  */
2158 /* ARGSUSED */
2159 void
2160 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2161 {
2162 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2163 	mld_hdr_t	*mldh;
2164 	ilm_t		*ilm;
2165 	ipif_t		*ipif;
2166 	uint16_t	hdr_length, exthdr_length;
2167 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2168 	uint_t		next;
2169 	int		mldlen;
2170 	ip_stack_t	*ipst = ill->ill_ipst;
2171 
2172 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2173 
2174 	/* Make sure the src address of the packet is link-local */
2175 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2176 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2177 		freemsg(mp);
2178 		return;
2179 	}
2180 
2181 	if (ip6h->ip6_hlim != 1) {
2182 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2183 		freemsg(mp);
2184 		return;
2185 	}
2186 
2187 	/* Get to the icmp header part */
2188 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2189 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2190 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2191 	} else {
2192 		hdr_length = IPV6_HDR_LEN;
2193 		exthdr_length = 0;
2194 	}
2195 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2196 
2197 	/* An MLD packet must at least be 24 octets to be valid */
2198 	if (mldlen < MLD_MINLEN) {
2199 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2200 		freemsg(mp);
2201 		return;
2202 	}
2203 
2204 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2205 
2206 	switch (mldh->mld_type) {
2207 	case MLD_LISTENER_QUERY:
2208 		/*
2209 		 * packet length differentiates between v1 and v2.  v1
2210 		 * query should be exactly 24 octets long; v2 is >= 28.
2211 		 */
2212 		if (mldlen == MLD_MINLEN) {
2213 			next = mld_query_in(mldh, ill);
2214 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2215 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2216 		} else {
2217 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2218 			freemsg(mp);
2219 			return;
2220 		}
2221 		if (next == 0) {
2222 			freemsg(mp);
2223 			return;
2224 		}
2225 
2226 		if (next != INFINITY)
2227 			mld_start_timers(next, ipst);
2228 		break;
2229 
2230 	case MLD_LISTENER_REPORT: {
2231 
2232 		ASSERT(ill->ill_ipif != NULL);
2233 		/*
2234 		 * For fast leave to work, we have to know that we are the
2235 		 * last person to send a report for this group.  Reports
2236 		 * generated by us are looped back since we could potentially
2237 		 * be a multicast router, so discard reports sourced by me.
2238 		 */
2239 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2240 		mutex_enter(&ill->ill_lock);
2241 		for (ipif = ill->ill_ipif; ipif != NULL;
2242 		    ipif = ipif->ipif_next) {
2243 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2244 			    lcladdr_ptr)) {
2245 				if (ip_debug > 1) {
2246 					char    buf1[INET6_ADDRSTRLEN];
2247 					char	buf2[INET6_ADDRSTRLEN];
2248 
2249 					(void) mi_strlog(ill->ill_rq,
2250 					    1,
2251 					    SL_TRACE,
2252 					    "mld_input: we are only "
2253 					    "member src %s ipif_local %s",
2254 					    inet_ntop(AF_INET6, lcladdr_ptr,
2255 					    buf1, sizeof (buf1)),
2256 					    inet_ntop(AF_INET6,
2257 					    &ipif->ipif_v6lcl_addr,
2258 					    buf2, sizeof (buf2)));
2259 				}
2260 				mutex_exit(&ill->ill_lock);
2261 				freemsg(mp);
2262 				return;
2263 			}
2264 		}
2265 		mutex_exit(&ill->ill_lock);
2266 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2267 
2268 		v6group_ptr = &mldh->mld_addr;
2269 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2270 			BUMP_MIB(ill->ill_icmp6_mib,
2271 			    ipv6IfIcmpInGroupMembBadReports);
2272 			freemsg(mp);
2273 			return;
2274 		}
2275 
2276 
2277 		/*
2278 		 * If we belong to the group being reported, and we are a
2279 		 * 'Delaying member' per the RFC terminology, stop our timer
2280 		 * for that group and 'clear flag' i.e. mark ilm_state as
2281 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2282 		 * membership entries for the same group address (one per zone)
2283 		 * so we need to walk the ill_ilm list.
2284 		 */
2285 		mutex_enter(&ill->ill_lock);
2286 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2287 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2288 			    continue;
2289 			BUMP_MIB(ill->ill_icmp6_mib,
2290 			    ipv6IfIcmpInGroupMembOurReports);
2291 
2292 			ilm->ilm_timer = INFINITY;
2293 			ilm->ilm_state = IGMP_OTHERMEMBER;
2294 		}
2295 		mutex_exit(&ill->ill_lock);
2296 		break;
2297 	}
2298 	case MLD_LISTENER_REDUCTION:
2299 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2300 		break;
2301 	}
2302 	/*
2303 	 * All MLD packets have already been passed up to any
2304 	 * process(es) listening on a ICMP6 raw socket. This
2305 	 * has been accomplished in ip_deliver_local_v6 prior to
2306 	 * this function call. It is assumed that the multicast daemon
2307 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2308 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2309 	 * Thus we can free the MLD message block here
2310 	 */
2311 	freemsg(mp);
2312 }
2313 
2314 /*
2315  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2316  * (non-zero, unsigned) timer value to be set on success.
2317  */
2318 static uint_t
2319 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2320 {
2321 	ilm_t	*ilm;
2322 	int	timer;
2323 	uint_t	next;
2324 	in6_addr_t *v6group;
2325 
2326 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2327 
2328 	/*
2329 	 * In the MLD specification, there are 3 states and a flag.
2330 	 *
2331 	 * In Non-Listener state, we simply don't have a membership record.
2332 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2333 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2334 	 * INFINITY)
2335 	 *
2336 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2337 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2338 	 * if I sent the last report.
2339 	 */
2340 	v6group = &mldh->mld_addr;
2341 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2342 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2343 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2344 		return (0);
2345 	}
2346 
2347 	/* Need to do compatibility mode checking */
2348 	mutex_enter(&ill->ill_lock);
2349 	ill->ill_mcast_v1_time = 0;
2350 	ill->ill_mcast_v1_tset = 1;
2351 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2352 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2353 		    "MLD_V1_ROUTER\n", ill->ill_name));
2354 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2355 		ill->ill_mcast_type = MLD_V1_ROUTER;
2356 	}
2357 	mutex_exit(&ill->ill_lock);
2358 
2359 	timer = (int)ntohs(mldh->mld_maxdelay);
2360 	if (ip_debug > 1) {
2361 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2362 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2363 		    timer, (int)mldh->mld_type);
2364 	}
2365 
2366 	/*
2367 	 * -Start the timers in all of our membership records for
2368 	 * the physical interface on which the query arrived,
2369 	 * excl:
2370 	 *	1.  those that belong to the "all hosts" group,
2371 	 *	2.  those with 0 scope, or 1 node-local scope.
2372 	 *
2373 	 * -Restart any timer that is already running but has a value
2374 	 * longer that the requested timeout.
2375 	 * -Use the value specified in the query message as the
2376 	 * maximum timeout.
2377 	 */
2378 	next = INFINITY;
2379 	mutex_enter(&ill->ill_lock);
2380 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2381 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2382 
2383 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2384 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2385 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2386 			continue;
2387 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2388 		    &ipv6_all_hosts_mcast)) &&
2389 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2390 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2391 			if (timer == 0) {
2392 				/* Respond immediately */
2393 				ilm->ilm_timer = INFINITY;
2394 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2395 				mutex_exit(&ill->ill_lock);
2396 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2397 				mutex_enter(&ill->ill_lock);
2398 				break;
2399 			}
2400 			if (ilm->ilm_timer > timer) {
2401 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2402 				if (ilm->ilm_timer < next)
2403 					next = ilm->ilm_timer;
2404 			}
2405 			break;
2406 		}
2407 	}
2408 	mutex_exit(&ill->ill_lock);
2409 
2410 	return (next);
2411 }
2412 
2413 /*
2414  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2415  * returns the appropriate (non-zero, unsigned) timer value (which may
2416  * be INFINITY) to be set.
2417  */
static uint_t
mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
{
	ilm_t	*ilm;
	in6_addr_t *v6group, *src_array;
	uint_t	next, numsrc, i, mrd, delay, qqi;
	uint8_t	qrv;

	v6group = &mld2q->mld2q_addr;
	numsrc = ntohs(mld2q->mld2q_numsrc);

	/* make sure numsrc matches packet size */
	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
		return (0);
	}
	/* the source addresses immediately follow the fixed query header */
	src_array = (in6_addr_t *)&mld2q[1];

	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);

	/* extract Maximum Response Delay from code in header */
	mrd = ntohs(mld2q->mld2q_mxrc);
	if (mrd >= MLD_V2_MAXRT_FPMIN) {
		/* value is floating-point encoded: mantissa and exponent */
		uint_t hdrval, mant, exp;
		hdrval = mrd;
		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
		mrd = (mant | 0x1000) << (exp + 3);
	}
	/* our response will be delayed a random amount in [0, mrd] */
	MCAST_RANDOM_DELAY(delay, mrd);
	next = (unsigned)INFINITY;

	/* Querier's Robustness Variable; zero means use the default */
	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/* Querier's Query Interval Code; may also be fp-encoded */
	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
		uint_t mant, exp;
		mant = qqi & MLD_V2_QQI_MANT_MASK;
		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (MLDv2 draft section 6.2 rule 1)
	 */
	mutex_enter(&ill->ill_lock);
	if (ill->ill_global_timer < delay) {
		mutex_exit(&ill->ill_lock);
		return (next);
	}
	mutex_exit(&ill->ill_lock);

	/*
	 * Now take action depending on query type: general,
	 * group specific, or group/source specific.
	 */
	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time])
		 */
		mutex_enter(&ill->ill_lock);
		ill->ill_global_timer = delay;
		next = ill->ill_global_timer;
		mutex_exit(&ill->ill_lock);

	} else {
		/* group or group/source specific query */
		mutex_enter(&ill->ill_lock);
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
				continue;

			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
group_query:
				/* empty pendsrcs == group specific response */
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all. Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				pktl->sl_numsrc = numsrc;
				for (i = 0; i < numsrc; i++)
					pktl->sl_addr[i] = src_array[i];
				/* merge the query's sources into pendsrcs */
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}
			/* set timer to soonest value */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
			break;
		}
		mutex_exit(&ill->ill_lock);
	}

	return (next);
}
2550 
2551 /*
2552  * Send MLDv1 response packet with hoplimit 1
2553  */
2554 static void
2555 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2556 {
2557 	mblk_t		*mp;
2558 	mld_hdr_t	*mldh;
2559 	ip6_t 		*ip6h;
2560 	ip6_hbh_t	*ip6hbh;
2561 	struct ip6_opt_router	*ip6router;
2562 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2563 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2564 	ipif_t		*ipif;
2565 	ip6i_t		*ip6i;
2566 
2567 	/*
2568 	 * We need to place a router alert option in this packet.  The length
2569 	 * of the options must be a multiple of 8.  The hbh option header is 2
2570 	 * bytes followed by the 4 byte router alert option.  That leaves
2571 	 * 2 bytes of pad for a total of 8 bytes.
2572 	 */
2573 	const int	router_alert_length = 8;
2574 
2575 	ASSERT(ill->ill_isv6);
2576 
2577 	/*
2578 	 * We need to make sure that this packet does not get load balanced.
2579 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2580 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2581 	 * If it gets load balanced, switches supporting MLD snooping
2582 	 * (in the future) will send the packet that it receives for this
2583 	 * multicast group to the interface that we are sending on. As we have
2584 	 * joined the multicast group on this ill, by sending the packet out
2585 	 * on this ill, we receive all the packets back on this ill.
2586 	 */
2587 	size += sizeof (ip6i_t) + router_alert_length;
2588 	mp = allocb(size, BPRI_HI);
2589 	if (mp == NULL)
2590 		return;
2591 	bzero(mp->b_rptr, size);
2592 	mp->b_wptr = mp->b_rptr + size;
2593 
2594 	ip6i = (ip6i_t *)mp->b_rptr;
2595 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2596 	ip6i->ip6i_nxt = IPPROTO_RAW;
2597 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2598 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2599 
2600 	ip6h = (ip6_t *)&ip6i[1];
2601 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2602 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2603 	/*
2604 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2605 	 * above will pad between ip6router and mld.
2606 	 */
2607 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2608 
2609 	mldh->mld_type = type;
2610 	mldh->mld_addr = ilm->ilm_v6addr;
2611 
2612 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2613 	ip6router->ip6or_len = 2;
2614 	ip6router->ip6or_value[0] = 0;
2615 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2616 
2617 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2618 	ip6hbh->ip6h_len = 0;
2619 
2620 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2621 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2622 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2623 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2624 	if (v6addr == NULL)
2625 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2626 	else
2627 		ip6h->ip6_dst = *v6addr;
2628 
2629 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2630 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2631 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2632 		ipif_refrele(ipif);
2633 	} else {
2634 		/* Otherwise, use IPv6 default address selection. */
2635 		ip6h->ip6_src = ipv6_all_zeros;
2636 	}
2637 
2638 	/*
2639 	 * Prepare for checksum by putting icmp length in the icmp
2640 	 * checksum field. The checksum is calculated in ip_wput_v6.
2641 	 */
2642 	mldh->mld_cksum = htons(sizeof (*mldh));
2643 
2644 	/*
2645 	 * ip_wput will automatically loopback the multicast packet to
2646 	 * the conn if multicast loopback is enabled.
2647 	 * The MIB stats corresponding to this outgoing MLD packet
2648 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2649 	 * ->icmp_update_out_mib_v6 function call.
2650 	 */
2651 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2652 }
2653 
2654 /*
2655  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2656  * report will contain one multicast address record for each element of
2657  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2658  * multiple reports are sent.  reclist is assumed to be made up of
2659  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2660  */
2661 static void
2662 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2663 {
2664 	mblk_t		*mp;
2665 	mld2r_t		*mld2r;
2666 	mld2mar_t	*mld2mar;
2667 	in6_addr_t	*srcarray;
2668 	ip6_t		*ip6h;
2669 	ip6_hbh_t	*ip6hbh;
2670 	ip6i_t		*ip6i;
2671 	struct ip6_opt_router	*ip6router;
2672 	size_t		size, optlen, padlen, icmpsize, rsize;
2673 	ipif_t		*ipif;
2674 	int		i, numrec, more_src_cnt;
2675 	mrec_t		*rp, *cur_reclist;
2676 	mrec_t		*next_reclist = reclist;
2677 	boolean_t	morepkts;
2678 
2679 	/* If there aren't any records, there's nothing to send */
2680 	if (reclist == NULL)
2681 		return;
2682 
2683 	ASSERT(ill->ill_isv6);
2684 
2685 	/*
2686 	 * Total option length (optlen + padlen) must be a multiple of
2687 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2688 	 * length will be 8.  Assert this in case anything ever changes.
2689 	 */
2690 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2691 	ASSERT(optlen <= 8);
2692 	padlen = 8 - optlen;
2693 nextpkt:
2694 	icmpsize = sizeof (mld2r_t);
2695 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2696 	morepkts = B_FALSE;
2697 	more_src_cnt = 0;
2698 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2699 	    rp = rp->mrec_next, numrec++) {
2700 		rsize = sizeof (mld2mar_t) +
2701 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2702 		if (size + rsize > ill->ill_max_frag) {
2703 			if (rp == cur_reclist) {
2704 				/*
2705 				 * If the first mrec we looked at is too big
2706 				 * to fit in a single packet (i.e the source
2707 				 * list is too big), we must either truncate
2708 				 * the list (if TO_EX or IS_EX), or send
2709 				 * multiple reports for the same group (all
2710 				 * other types).
2711 				 */
2712 				int srcspace, srcsperpkt;
2713 				srcspace = ill->ill_max_frag -
2714 				    (size + sizeof (mld2mar_t));
2715 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2716 				/*
2717 				 * Increment icmpsize and size, because we will
2718 				 * be sending a record for the mrec we're
2719 				 * looking at now.
2720 				 */
2721 				rsize = sizeof (mld2mar_t) +
2722 				    (srcsperpkt * sizeof (in6_addr_t));
2723 				icmpsize += rsize;
2724 				size += rsize;
2725 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2726 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2727 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2728 					if (rp->mrec_next == NULL) {
2729 						/* no more packets to send */
2730 						break;
2731 					} else {
2732 						/*
2733 						 * more packets, but we're
2734 						 * done with this mrec.
2735 						 */
2736 						next_reclist = rp->mrec_next;
2737 					}
2738 				} else {
2739 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2740 					    - srcsperpkt;
2741 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2742 					/*
2743 					 * We'll fix up this mrec (remove the
2744 					 * srcs we've already sent) before
2745 					 * returning to nextpkt above.
2746 					 */
2747 					next_reclist = rp;
2748 				}
2749 			} else {
2750 				next_reclist = rp;
2751 			}
2752 			morepkts = B_TRUE;
2753 			break;
2754 		}
2755 		icmpsize += rsize;
2756 		size += rsize;
2757 	}
2758 
2759 	/*
2760 	 * We need to make sure that this packet does not get load balanced.
2761 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2762 	 * ip_newroute_ipif_v6 know how to handle such packets.
2763 	 * If it gets load balanced, switches supporting MLD snooping
2764 	 * (in the future) will send the packet that it receives for this
2765 	 * multicast group to the interface that we are sending on. As we have
2766 	 * joined the multicast group on this ill, by sending the packet out
2767 	 * on this ill, we receive all the packets back on this ill.
2768 	 */
2769 	size += sizeof (ip6i_t);
2770 	mp = allocb(size, BPRI_HI);
2771 	if (mp == NULL)
2772 		goto free_reclist;
2773 	bzero(mp->b_rptr, size);
2774 	mp->b_wptr = mp->b_rptr + size;
2775 
2776 	ip6i = (ip6i_t *)mp->b_rptr;
2777 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2778 	ip6i->ip6i_nxt = IPPROTO_RAW;
2779 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2780 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2781 
2782 	ip6h = (ip6_t *)&(ip6i[1]);
2783 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2784 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2785 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2786 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2787 
2788 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2789 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2790 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2791 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2792 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2793 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2794 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2795 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2796 		ipif_refrele(ipif);
2797 	} else {
2798 		/* otherwise, use IPv6 default address selection. */
2799 		ip6h->ip6_src = ipv6_all_zeros;
2800 	}
2801 
2802 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2803 	/*
2804 	 * ip6h_len is the number of 8-byte words, not including the first
2805 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2806 	 */
2807 	ip6hbh->ip6h_len = 0;
2808 
2809 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2810 	ip6router->ip6or_len = 2;
2811 	ip6router->ip6or_value[0] = 0;
2812 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2813 
2814 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2815 	mld2r->mld2r_nummar = htons(numrec);
2816 	/*
2817 	 * Prepare for the checksum by putting icmp length in the icmp
2818 	 * checksum field. The checksum is calculated in ip_wput_v6.
2819 	 */
2820 	mld2r->mld2r_cksum = htons(icmpsize);
2821 
2822 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2823 		mld2mar->mld2mar_type = rp->mrec_type;
2824 		mld2mar->mld2mar_auxlen = 0;
2825 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2826 		mld2mar->mld2mar_group = rp->mrec_group;
2827 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2828 
2829 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2830 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2831 
2832 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2833 	}
2834 
2835 	/*
2836 	 * ip_wput will automatically loopback the multicast packet to
2837 	 * the conn if multicast loopback is enabled.
2838 	 * The MIB stats corresponding to this outgoing MLD packet
2839 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2840 	 * ->icmp_update_out_mib_v6 function call.
2841 	 */
2842 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2843 
2844 	if (morepkts) {
2845 		if (more_src_cnt > 0) {
2846 			int index, mvsize;
2847 			slist_t *sl = &next_reclist->mrec_srcs;
2848 			index = sl->sl_numsrc;
2849 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2850 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2851 			    mvsize);
2852 			sl->sl_numsrc = more_src_cnt;
2853 		}
2854 		goto nextpkt;
2855 	}
2856 
2857 free_reclist:
2858 	while (reclist != NULL) {
2859 		rp = reclist->mrec_next;
2860 		mi_free(reclist);
2861 		reclist = rp;
2862 	}
2863 }
2864 
2865 static mrec_t *
2866 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2867     mrec_t *next)
2868 {
2869 	mrec_t *rp;
2870 	int i;
2871 
2872 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2873 	    SLIST_IS_EMPTY(srclist))
2874 		return (next);
2875 
2876 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2877 	if (rp == NULL)
2878 		return (next);
2879 
2880 	rp->mrec_next = next;
2881 	rp->mrec_type = type;
2882 	rp->mrec_auxlen = 0;
2883 	rp->mrec_group = *grp;
2884 	if (srclist == NULL) {
2885 		rp->mrec_srcs.sl_numsrc = 0;
2886 	} else {
2887 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2888 		for (i = 0; i < srclist->sl_numsrc; i++)
2889 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2890 	}
2891 
2892 	return (rp);
2893 }
2894 
2895 /*
2896  * Set up initial retransmit state.  If memory cannot be allocated for
2897  * the source lists, simply create as much state as is possible; memory
2898  * allocation failures are considered one type of transient error that
2899  * the retransmissions are designed to overcome (and if they aren't
2900  * transient, there are bigger problems than failing to notify the
2901  * router about multicast group membership state changes).
2902  */
2903 static void
2904 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2905     slist_t *flist)
2906 {
2907 	/*
2908 	 * There are only three possibilities for rtype:
2909 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2910 	 *	  => rtype is ALLOW_NEW_SOURCES
2911 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2912 	 *	  => rtype is CHANGE_TO_EXCLUDE
2913 	 *	State change that involves a filter mode change
2914 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2915 	 */
2916 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2917 	    rtype == ALLOW_NEW_SOURCES);
2918 
2919 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2920 
2921 	switch (rtype) {
2922 	case CHANGE_TO_EXCLUDE:
2923 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2924 		CLEAR_SLIST(rtxp->rtx_allow);
2925 		COPY_SLIST(flist, rtxp->rtx_block);
2926 		break;
2927 	case ALLOW_NEW_SOURCES:
2928 	case CHANGE_TO_INCLUDE:
2929 		rtxp->rtx_fmode_cnt =
2930 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2931 		CLEAR_SLIST(rtxp->rtx_block);
2932 		COPY_SLIST(flist, rtxp->rtx_allow);
2933 		break;
2934 	}
2935 }
2936 
2937 /*
2938  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2939  * RFC 3376 section 5.1, covers three cases:
2940  *	* The current state change is a filter mode change
2941  *		Set filter mode retransmit counter; set retransmit allow or
2942  *		block list to new source list as appropriate, and clear the
2943  *		retransmit list that was not set; send TO_IN or TO_EX with
2944  *		new source list.
2945  *	* The current state change is a source list change, but the filter
2946  *	  mode retransmit counter is > 0
2947  *		Decrement filter mode retransmit counter; set retransmit
 *		allow or block list to new source list as appropriate,
2949  *		and clear the retransmit list that was not set; send TO_IN
2950  *		or TO_EX with new source list.
2951  *	* The current state change is a source list change, and the filter
2952  *	  mode retransmit counter is 0.
2953  *		Merge existing rtx allow and block lists with new state:
2954  *		  rtx_allow = (new allow + rtx_allow) - new block
2955  *		  rtx_block = (new block + rtx_block) - new allow
2956  *		Send ALLOW and BLOCK records for new retransmit lists;
2957  *		decrement retransmit counter.
2958  *
2959  * As is the case for mcast_init_rtx(), memory allocation failures are
2960  * acceptable; we just create as much state as we can.
2961  */
static mrec_t *
mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
{
	ill_t *ill;
	rtx_state_t *rtxp = &ilm->ilm_rtx;
	mcast_record_t txtype;
	mrec_t *rp, *rpnext, *rtnmrec;
	boolean_t ovf;

	/* The ilm is hung off either an ill (v6) or an ipif (v4-mapped). */
	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);

	if (mreclist == NULL)
		return (mreclist);

	/*
	 * A filter mode change is indicated by a single mrec, which is
	 * either TO_IN or TO_EX.  In this case, we just need to set new
	 * retransmit state as if this were an initial join.  There is
	 * no change to the mrec list.
	 */
	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
		    &mreclist->mrec_srcs);
		return (mreclist);
	}

	/*
	 * Only the source list has changed
	 */
	rtxp->rtx_cnt = ill->ill_mcast_rv;
	if (rtxp->rtx_fmode_cnt > 0) {
		/* but we're still sending filter mode change reports */
		rtxp->rtx_fmode_cnt--;
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			CLEAR_SLIST(rtxp->rtx_block);
			COPY_SLIST(flist, rtxp->rtx_allow);
			txtype = CHANGE_TO_INCLUDE;
		} else {
			CLEAR_SLIST(rtxp->rtx_allow);
			COPY_SLIST(flist, rtxp->rtx_block);
			txtype = CHANGE_TO_EXCLUDE;
		}
		/* overwrite first mrec with new info */
		mreclist->mrec_type = txtype;
		l_copy(flist, &mreclist->mrec_srcs);
		/* then free any remaining mrecs */
		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
			rpnext = rp->mrec_next;
			mi_free(rp);
		}
		mreclist->mrec_next = NULL;
		rtnmrec = mreclist;
	} else {
		mrec_t *allow_mrec, *block_mrec;
		/*
		 * Just send the source change reports; but we need to
		 * recalculate the ALLOW and BLOCK lists based on previous
		 * state and new changes.
		 */
		rtnmrec = mreclist;
		allow_mrec = block_mrec = NULL;
		/*
		 * At most one ALLOW and one BLOCK record are expected
		 * here; remember a pointer to each for the merge below.
		 */
		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
			    rp->mrec_type == BLOCK_OLD_SOURCES);
			if (rp->mrec_type == ALLOW_NEW_SOURCES)
				allow_mrec = rp;
			else
				block_mrec = rp;
		}
		/*
		 * Perform calculations:
		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
		 *   new_block = mrec_block + (rtx_block - mrec_allow)
		 *
		 * Each calc requires two steps, for example:
		 *   rtx_allow = rtx_allow - mrec_block;
		 *   new_allow = mrec_allow + rtx_allow;
		 *
		 * Store results in mrec lists, and then copy into rtx lists.
		 * We do it in this order in case the rtx list hasn't been
		 * alloc'd yet; if it hasn't and our alloc fails, that's okay.
		 * Overflows are also okay.
		 */
		if (block_mrec != NULL) {
			l_difference_in_a(rtxp->rtx_allow,
			    &block_mrec->mrec_srcs);
		}
		if (allow_mrec != NULL) {
			l_difference_in_a(rtxp->rtx_block,
			    &allow_mrec->mrec_srcs);
			/* ovf (source list overflow) is deliberately ignored */
			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
			    &ovf);
		}
		if (block_mrec != NULL) {
			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
			    &ovf);
			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
		} else {
			/*
			 * No BLOCK mrec to reuse; build a fresh one from the
			 * merged rtx_block state (prepended ahead of
			 * allow_mrec, which may be NULL).
			 */
			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
		}
		if (allow_mrec != NULL) {
			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
		} else {
			/* likewise, synthesize the missing ALLOW record */
			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
		}
	}

	return (rtnmrec);
}
3074