xref: /titanic_50/usr/src/uts/common/inet/ip/igmp.c (revision 1e1ddd6cc98ab5af8293f7ebd132be62900730fd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Internet Group Management Protocol (IGMP) routines.
31  * Multicast Listener Discovery Protocol (MLD) routines.
32  *
33  * Written by Steve Deering, Stanford, May 1988.
34  * Modified by Rosen Sharma, Stanford, Aug 1994.
35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
36  *
37  * MULTICAST 3.5.1.1
38  */
39 
40 #include <sys/types.h>
41 #include <sys/stream.h>
42 #include <sys/stropts.h>
43 #include <sys/strlog.h>
44 #include <sys/strsun.h>
45 #include <sys/systm.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/atomic.h>
50 #include <sys/zone.h>
51 
52 #include <sys/param.h>
53 #include <sys/socket.h>
54 #include <inet/ipclassifier.h>
55 #include <net/if.h>
56 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 
62 #include <inet/common.h>
63 #include <inet/mi.h>
64 #include <inet/nd.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_if.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 
75 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
76 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
77 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
78 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
79 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
80 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
81 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
82 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
83 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
84 		    slist_t *srclist, mrec_t *next);
85 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
86 		    mcast_record_t rtype, slist_t *flist);
87 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
88 
89 
90 /*
91  * Macros used to do timer len conversions.  Timer values are always
92  * stored and passed to the timer functions as milliseconds; but the
93  * default values and values from the wire may not be.
94  *
95  * And yes, it's obscure, but decisecond is easier to abbreviate than
96  * "tenths of a second".
97  */
98 #define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
99 #define	SEC_TO_MSEC(sec)	((sec) * 1000)
100 
101 /*
102  * A running timer (scheduled thru timeout) can be cancelled if another
103  * timer with a shorter timeout value is scheduled before it has timed
104  * out.  When the shorter timer expires, the original timer is updated
105  * to account for the time elapsed while the shorter timer ran; but this
106  * does not take into account the amount of time already spent in timeout
107  * state before being preempted by the shorter timer, that is the time
108  * interval between time scheduled to time cancelled.  This can cause
109  * delays in sending out multicast membership reports.  To resolve this
110  * problem, wallclock time (absolute time) is used instead of deltas
111  * (relative time) to track timers.
112  *
113  * The MACRO below gets the lbolt value, used for proper timer scheduling
114  * and firing. Therefore multicast membership reports are sent on time.
115  * The timer does not fire exactly at the time it was scheduled to fire;
116  * there is a difference of a few milliseconds observed. An offset is used
117  * to take care of the difference.
118  */
119 
120 #define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
121 #define	CURRENT_OFFSET	(999)
122 
123 /*
124  * The first multicast join will trigger the igmp timers / mld timers
125  * The unit for next is milliseconds.
126  */
127 void
128 igmp_start_timers(unsigned next, ip_stack_t *ipst)
129 {
130 	int	time_left;
131 	int	ret;
132 
133 	ASSERT(next != 0 && next != INFINITY);
134 
135 	mutex_enter(&ipst->ips_igmp_timer_lock);
136 
137 	if (ipst->ips_igmp_timer_setter_active) {
138 		/*
139 		 * Serialize timer setters, one at a time. If the
140 		 * timer is currently being set by someone,
141 		 * just record the next time when it has to be
142 		 * invoked and return. The current setter will
143 		 * take care.
144 		 */
145 		ipst->ips_igmp_time_to_next =
146 		    MIN(ipst->ips_igmp_time_to_next, next);
147 		mutex_exit(&ipst->ips_igmp_timer_lock);
148 		return;
149 	} else {
150 		ipst->ips_igmp_timer_setter_active = B_TRUE;
151 	}
152 	if (ipst->ips_igmp_timeout_id == 0) {
153 		/*
154 		 * The timer is inactive. We need to start a timer
155 		 */
156 		ipst->ips_igmp_time_to_next = next;
157 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
158 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
159 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
160 		ipst->ips_igmp_timer_setter_active = B_FALSE;
161 		mutex_exit(&ipst->ips_igmp_timer_lock);
162 		return;
163 	}
164 
165 	/*
166 	 * The timer was scheduled sometime back for firing in
167 	 * 'igmp_time_to_next' ms and is active. We need to
168 	 * reschedule the timeout if the new 'next' will happen
169 	 * earlier than the currently scheduled timeout
170 	 */
171 	time_left = ipst->ips_igmp_timer_scheduled_last +
172 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
173 	if (time_left < MSEC_TO_TICK(next)) {
174 		ipst->ips_igmp_timer_setter_active = B_FALSE;
175 		mutex_exit(&ipst->ips_igmp_timer_lock);
176 		return;
177 	}
178 
179 	mutex_exit(&ipst->ips_igmp_timer_lock);
180 	ret = untimeout(ipst->ips_igmp_timeout_id);
181 	mutex_enter(&ipst->ips_igmp_timer_lock);
182 	/*
183 	 * The timeout was cancelled, or the timeout handler
184 	 * completed, while we were blocked in the untimeout.
185 	 * No other thread could have set the timer meanwhile
186 	 * since we serialized all the timer setters. Thus
187 	 * no timer is currently active nor executing nor will
188 	 * any timer fire in the future. We start the timer now
189 	 * if needed.
190 	 */
191 	if (ret == -1) {
192 		ASSERT(ipst->ips_igmp_timeout_id == 0);
193 	} else {
194 		ASSERT(ipst->ips_igmp_timeout_id != 0);
195 		ipst->ips_igmp_timeout_id = 0;
196 	}
197 	if (ipst->ips_igmp_time_to_next != 0) {
198 		ipst->ips_igmp_time_to_next =
199 		    MIN(ipst->ips_igmp_time_to_next, next);
200 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
201 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
202 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
203 	}
204 	ipst->ips_igmp_timer_setter_active = B_FALSE;
205 	mutex_exit(&ipst->ips_igmp_timer_lock);
206 }
207 
208 /*
209  * mld_start_timers:
210  * The unit for next is milliseconds.
211  */
212 void
213 mld_start_timers(unsigned next, ip_stack_t *ipst)
214 {
215 	int	time_left;
216 	int	ret;
217 
218 	ASSERT(next != 0 && next != INFINITY);
219 
220 	mutex_enter(&ipst->ips_mld_timer_lock);
221 	if (ipst->ips_mld_timer_setter_active) {
222 		/*
223 		 * Serialize timer setters, one at a time. If the
224 		 * timer is currently being set by someone,
225 		 * just record the next time when it has to be
226 		 * invoked and return. The current setter will
227 		 * take care.
228 		 */
229 		ipst->ips_mld_time_to_next =
230 		    MIN(ipst->ips_mld_time_to_next, next);
231 		mutex_exit(&ipst->ips_mld_timer_lock);
232 		return;
233 	} else {
234 		ipst->ips_mld_timer_setter_active = B_TRUE;
235 	}
236 	if (ipst->ips_mld_timeout_id == 0) {
237 		/*
238 		 * The timer is inactive. We need to start a timer
239 		 */
240 		ipst->ips_mld_time_to_next = next;
241 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
242 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
243 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
244 		ipst->ips_mld_timer_setter_active = B_FALSE;
245 		mutex_exit(&ipst->ips_mld_timer_lock);
246 		return;
247 	}
248 
249 	/*
250 	 * The timer was scheduled sometime back for firing in
251 	 * 'igmp_time_to_next' ms and is active. We need to
252 	 * reschedule the timeout if the new 'next' will happen
253 	 * earlier than the currently scheduled timeout
254 	 */
255 	time_left = ipst->ips_mld_timer_scheduled_last +
256 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
257 	if (time_left < MSEC_TO_TICK(next)) {
258 		ipst->ips_mld_timer_setter_active = B_FALSE;
259 		mutex_exit(&ipst->ips_mld_timer_lock);
260 		return;
261 	}
262 
263 	mutex_exit(&ipst->ips_mld_timer_lock);
264 	ret = untimeout(ipst->ips_mld_timeout_id);
265 	mutex_enter(&ipst->ips_mld_timer_lock);
266 	/*
267 	 * The timeout was cancelled, or the timeout handler
268 	 * completed, while we were blocked in the untimeout.
269 	 * No other thread could have set the timer meanwhile
270 	 * since we serialized all the timer setters. Thus
271 	 * no timer is currently active nor executing nor will
272 	 * any timer fire in the future. We start the timer now
273 	 * if needed.
274 	 */
275 	if (ret == -1) {
276 		ASSERT(ipst->ips_mld_timeout_id == 0);
277 	} else {
278 		ASSERT(ipst->ips_mld_timeout_id != 0);
279 		ipst->ips_mld_timeout_id = 0;
280 	}
281 	if (ipst->ips_mld_time_to_next != 0) {
282 		ipst->ips_mld_time_to_next =
283 		    MIN(ipst->ips_mld_time_to_next, next);
284 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
285 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
286 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
287 	}
288 	ipst->ips_mld_timer_setter_active = B_FALSE;
289 	mutex_exit(&ipst->ips_mld_timer_lock);
290 }
291 
292 /*
293  * igmp_input:
294  * Return NULL for a bad packet that is discarded here.
295  * Return mp if the message is OK and should be handed to "raw" receivers.
296  * Callers of igmp_input() may need to reinitialize variables that were copied
297  * from the mblk as this calls pullupmsg().
298  */
299 /* ARGSUSED */
300 mblk_t *
301 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
302 {
303 	igmpa_t 	*igmpa;
304 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
305 	int		iphlen, igmplen, mblklen;
306 	ilm_t 		*ilm;
307 	uint32_t	src, dst;
308 	uint32_t 	group;
309 	uint_t		next;
310 	ipif_t 		*ipif;
311 	ip_stack_t	 *ipst;
312 
313 	ASSERT(ill != NULL);
314 	ASSERT(!ill->ill_isv6);
315 	ipst = ill->ill_ipst;
316 	++ipst->ips_igmpstat.igps_rcv_total;
317 
318 	mblklen = MBLKL(mp);
319 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
320 		++ipst->ips_igmpstat.igps_rcv_tooshort;
321 		goto bad_pkt;
322 	}
323 	igmplen = ntohs(ipha->ipha_length) - iphlen;
324 	/*
325 	 * Since msg sizes are more variable with v3, just pullup the
326 	 * whole thing now.
327 	 */
328 	if (MBLKL(mp) < (igmplen + iphlen)) {
329 		mblk_t *mp1;
330 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
331 			++ipst->ips_igmpstat.igps_rcv_tooshort;
332 			goto bad_pkt;
333 		}
334 		freemsg(mp);
335 		mp = mp1;
336 		ipha = (ipha_t *)(mp->b_rptr);
337 	}
338 
339 	/*
340 	 * Validate lengths
341 	 */
342 	if (igmplen < IGMP_MINLEN) {
343 		++ipst->ips_igmpstat.igps_rcv_tooshort;
344 		goto bad_pkt;
345 	}
346 	/*
347 	 * Validate checksum
348 	 */
349 	if (IP_CSUM(mp, iphlen, 0)) {
350 		++ipst->ips_igmpstat.igps_rcv_badsum;
351 		goto bad_pkt;
352 	}
353 
354 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
355 	src = ipha->ipha_src;
356 	dst = ipha->ipha_dst;
357 	if (ip_debug > 1)
358 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
359 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
360 		    (int)ntohl(src), (int)ntohl(dst),
361 		    ill->ill_name);
362 
363 	switch (igmpa->igmpa_type) {
364 	case IGMP_MEMBERSHIP_QUERY:
365 		/*
366 		 * packet length differentiates between v1/v2 and v3
367 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
368 		 */
369 		if ((igmplen == IGMP_MINLEN) ||
370 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
371 			next = igmp_query_in(ipha, igmpa, ill);
372 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
373 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
374 			    igmplen);
375 		} else {
376 			++ipst->ips_igmpstat.igps_rcv_tooshort;
377 			goto bad_pkt;
378 		}
379 		if (next == 0)
380 			goto bad_pkt;
381 
382 		if (next != INFINITY)
383 			igmp_start_timers(next, ipst);
384 
385 		break;
386 
387 	case IGMP_V1_MEMBERSHIP_REPORT:
388 	case IGMP_V2_MEMBERSHIP_REPORT:
389 		/*
390 		 * For fast leave to work, we have to know that we are the
391 		 * last person to send a report for this group. Reports
392 		 * generated by us are looped back since we could potentially
393 		 * be a multicast router, so discard reports sourced by me.
394 		 */
395 		mutex_enter(&ill->ill_lock);
396 		for (ipif = ill->ill_ipif; ipif != NULL;
397 		    ipif = ipif->ipif_next) {
398 			if (ipif->ipif_lcl_addr == src) {
399 				if (ip_debug > 1) {
400 					(void) mi_strlog(ill->ill_rq,
401 					    1,
402 					    SL_TRACE,
403 					    "igmp_input: we are only "
404 					    "member src 0x%x ipif_local 0x%x",
405 					    (int)ntohl(src),
406 					    (int)
407 					    ntohl(ipif->ipif_lcl_addr));
408 				}
409 				mutex_exit(&ill->ill_lock);
410 				return (mp);
411 			}
412 		}
413 		mutex_exit(&ill->ill_lock);
414 
415 		++ipst->ips_igmpstat.igps_rcv_reports;
416 		group = igmpa->igmpa_group;
417 		if (!CLASSD(group)) {
418 			++ipst->ips_igmpstat.igps_rcv_badreports;
419 			goto bad_pkt;
420 		}
421 
422 		/*
423 		 * KLUDGE: if the IP source address of the report has an
424 		 * unspecified (i.e., zero) subnet number, as is allowed for
425 		 * a booting host, replace it with the correct subnet number
426 		 * so that a process-level multicast routing demon can
427 		 * determine which subnet it arrived from.  This is necessary
428 		 * to compensate for the lack of any way for a process to
429 		 * determine the arrival interface of an incoming packet.
430 		 *
431 		 * Requires that a copy of *this* message it passed up
432 		 * to the raw interface which is done by our caller.
433 		 */
434 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
435 			/* Pick the first ipif on this ill */
436 			mutex_enter(&ill->ill_lock);
437 			src = ill->ill_ipif->ipif_subnet;
438 			mutex_exit(&ill->ill_lock);
439 			ip1dbg(("igmp_input: changed src to 0x%x\n",
440 			    (int)ntohl(src)));
441 			ipha->ipha_src = src;
442 		}
443 
444 		/*
445 		 * If we belong to the group being reported, and
446 		 * we are a 'Delaying member' in the RFC terminology,
447 		 * stop our timer for that group and 'clear flag' i.e.
448 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
449 		 * interfaces on the given physical interface.
450 		 */
451 		mutex_enter(&ill->ill_lock);
452 		for (ipif = ill->ill_ipif; ipif != NULL;
453 		    ipif = ipif->ipif_next) {
454 			ilm = ilm_lookup_ipif(ipif, group);
455 			if (ilm != NULL) {
456 				++ipst->ips_igmpstat.igps_rcv_ourreports;
457 				ilm->ilm_timer = INFINITY;
458 				ilm->ilm_state = IGMP_OTHERMEMBER;
459 			}
460 		} /* for */
461 		mutex_exit(&ill->ill_lock);
462 		break;
463 
464 	case IGMP_V3_MEMBERSHIP_REPORT:
465 		/*
466 		 * Currently nothing to do here; IGMP router is not
467 		 * implemented in ip, and v3 hosts don't pay attention
468 		 * to membership reports.
469 		 */
470 		break;
471 	}
472 	/*
473 	 * Pass all valid IGMP packets up to any process(es) listening
474 	 * on a raw IGMP socket. Do not free the packet.
475 	 */
476 	return (mp);
477 
478 bad_pkt:
479 	freemsg(mp);
480 	return (NULL);
481 }
482 
483 static uint_t
484 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
485 {
486 	ilm_t	*ilm;
487 	int	timer;
488 	uint_t	next, current;
489 	ip_stack_t	 *ipst;
490 
491 	ipst = ill->ill_ipst;
492 	++ipst->ips_igmpstat.igps_rcv_queries;
493 
494 	/*
495 	 * In the IGMPv2 specification, there are 3 states and a flag.
496 	 *
497 	 * In Non-Member state, we simply don't have a membership record.
498 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
499 	 * < INFINITY).  In Idle Member state, our timer is not running
500 	 * (ilm->ilm_timer == INFINITY).
501 	 *
502 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
503 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
504 	 * if I sent the last report.
505 	 */
506 	if ((igmpa->igmpa_code == 0) ||
507 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
508 		/*
509 		 * Query from an old router.
510 		 * Remember that the querier on this interface is old,
511 		 * and set the timer to the value in RFC 1112.
512 		 */
513 
514 
515 		mutex_enter(&ill->ill_lock);
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 		mutex_exit(&ill->ill_lock);
525 
526 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
527 
528 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
529 		    igmpa->igmpa_group != 0) {
530 			++ipst->ips_igmpstat.igps_rcv_badqueries;
531 			return (0);
532 		}
533 
534 	} else {
535 		in_addr_t group;
536 
537 		/*
538 		 * Query from a new router
539 		 * Simply do a validity check
540 		 */
541 		group = igmpa->igmpa_group;
542 		if (group != 0 && (!CLASSD(group))) {
543 			++ipst->ips_igmpstat.igps_rcv_badqueries;
544 			return (0);
545 		}
546 
547 		/*
548 		 * Switch interface state to v2 on receipt of a v2 query
549 		 * ONLY IF current state is v3.  Let things be if current
550 		 * state if v1 but do reset the v2-querier-present timer.
551 		 */
552 		mutex_enter(&ill->ill_lock);
553 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
554 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
555 			    "to IGMP_V2_ROUTER", ill->ill_name));
556 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
557 			ill->ill_mcast_type = IGMP_V2_ROUTER;
558 		}
559 		ill->ill_mcast_v2_time = 0;
560 		ill->ill_mcast_v2_tset = 1;
561 		mutex_exit(&ill->ill_lock);
562 
563 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
564 	}
565 
566 	if (ip_debug > 1) {
567 		mutex_enter(&ill->ill_lock);
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 		mutex_exit(&ill->ill_lock);
573 	}
574 
575 	/*
576 	 * -Start the timers in all of our membership records
577 	 *  for the physical interface on which the query
578 	 *  arrived, excluding those that belong to the "all
579 	 *  hosts" group (224.0.0.1).
580 	 *
581 	 * -Restart any timer that is already running but has
582 	 *  a value longer than the requested timeout.
583 	 *
584 	 * -Use the value specified in the query message as
585 	 *  the maximum timeout.
586 	 */
587 	next = (unsigned)INFINITY;
588 	mutex_enter(&ill->ill_lock);
589 
590 	current = CURRENT_MSTIME;
591 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
592 
593 		/*
594 		 * A multicast router joins INADDR_ANY address
595 		 * to enable promiscuous reception of all
596 		 * mcasts from the interface. This INADDR_ANY
597 		 * is stored in the ilm_v6addr as V6 unspec addr
598 		 */
599 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
600 			continue;
601 		if (ilm->ilm_addr == htonl(INADDR_ANY))
602 			continue;
603 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
604 		    (igmpa->igmpa_group == 0) ||
605 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
606 			if (ilm->ilm_timer > timer) {
607 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
608 				if (ilm->ilm_timer < next)
609 					next = ilm->ilm_timer;
610 				ilm->ilm_timer += current;
611 			}
612 		}
613 	}
614 	mutex_exit(&ill->ill_lock);
615 
616 	return (next);
617 }
618 
619 static uint_t
620 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
621 {
622 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
623 	uint_t		current;
624 	ilm_t		*ilm;
625 	ipaddr_t	*src_array;
626 	uint8_t		qrv;
627 	ip_stack_t	 *ipst;
628 
629 	ipst = ill->ill_ipst;
630 	/* make sure numsrc matches packet size */
631 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
632 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
633 		++ipst->ips_igmpstat.igps_rcv_tooshort;
634 		return (0);
635 	}
636 	src_array = (ipaddr_t *)&igmp3qa[1];
637 
638 	++ipst->ips_igmpstat.igps_rcv_queries;
639 
640 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
641 		uint_t hdrval, mant, exp;
642 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
643 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
644 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
645 		mrd = (mant | 0x10) << (exp + 3);
646 	}
647 	if (mrd == 0)
648 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
649 	timer = DSEC_TO_MSEC(mrd);
650 	MCAST_RANDOM_DELAY(delay, timer);
651 	next = (unsigned)INFINITY;
652 	current = CURRENT_MSTIME;
653 
654 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
655 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
656 	else
657 		ill->ill_mcast_rv = qrv;
658 
659 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
660 		uint_t hdrval, mant, exp;
661 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
662 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
663 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
664 		qqi = (mant | 0x10) << (exp + 3);
665 	}
666 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
667 
668 	/*
669 	 * If we have a pending general query response that's scheduled
670 	 * sooner than the delay we calculated for this response, then
671 	 * no action is required (RFC3376 section 5.2 rule 1)
672 	 */
673 	mutex_enter(&ill->ill_lock);
674 	if (ill->ill_global_timer < (current + delay)) {
675 		mutex_exit(&ill->ill_lock);
676 		return (next);
677 	}
678 	mutex_exit(&ill->ill_lock);
679 
680 	/*
681 	 * Now take action depending upon query type:
682 	 * general, group specific, or group/source specific.
683 	 */
684 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
685 		/*
686 		 * general query
687 		 * We know global timer is either not running or is
688 		 * greater than our calculated delay, so reset it to
689 		 * our delay (random value in range [0, response time]).
690 		 */
691 		mutex_enter(&ill->ill_lock);
692 		ill->ill_global_timer =  current + delay;
693 		mutex_exit(&ill->ill_lock);
694 		next = delay;
695 
696 	} else {
697 		/* group or group/source specific query */
698 		mutex_enter(&ill->ill_lock);
699 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
700 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
701 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
702 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
703 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
704 				continue;
705 			/*
706 			 * If the query is group specific or we have a
707 			 * pending group specific query, the response is
708 			 * group specific (pending sources list should be
709 			 * empty).  Otherwise, need to update the pending
710 			 * sources list for the group and source specific
711 			 * response.
712 			 */
713 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
714 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
715 group_query:
716 				FREE_SLIST(ilm->ilm_pendsrcs);
717 				ilm->ilm_pendsrcs = NULL;
718 			} else {
719 				boolean_t overflow;
720 				slist_t *pktl;
721 				if (numsrc > MAX_FILTER_SIZE ||
722 				    (ilm->ilm_pendsrcs == NULL &&
723 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
724 					/*
725 					 * We've been sent more sources than
726 					 * we can deal with; or we can't deal
727 					 * with a source list at all.  Revert
728 					 * to a group specific query.
729 					 */
730 					goto group_query;
731 				}
732 				if ((pktl = l_alloc()) == NULL)
733 					goto group_query;
734 				pktl->sl_numsrc = numsrc;
735 				for (i = 0; i < numsrc; i++)
736 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
737 					    &(pktl->sl_addr[i]));
738 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
739 				    &overflow);
740 				l_free(pktl);
741 				if (overflow)
742 					goto group_query;
743 			}
744 
745 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
746 			    INFINITY : (ilm->ilm_timer - current);
747 			/* choose soonest timer */
748 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
749 			if (ilm->ilm_timer < next)
750 				next = ilm->ilm_timer;
751 			ilm->ilm_timer += current;
752 		}
753 		mutex_exit(&ill->ill_lock);
754 	}
755 
756 	return (next);
757 }
758 
759 void
760 igmp_joingroup(ilm_t *ilm)
761 {
762 	uint_t	timer;
763 	ill_t	*ill;
764 	ip_stack_t	*ipst = ilm->ilm_ipst;
765 
766 	ill = ilm->ilm_ipif->ipif_ill;
767 
768 	ASSERT(IAM_WRITER_ILL(ill));
769 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
770 
771 	mutex_enter(&ill->ill_lock);
772 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
773 		ilm->ilm_rtx.rtx_timer = INFINITY;
774 		ilm->ilm_state = IGMP_OTHERMEMBER;
775 		mutex_exit(&ill->ill_lock);
776 	} else {
777 		ip1dbg(("Querier mode %d, sending report, group %x\n",
778 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
779 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
780 			mutex_exit(&ill->ill_lock);
781 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
782 			mutex_enter(&ill->ill_lock);
783 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
784 			mutex_exit(&ill->ill_lock);
785 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
786 			mutex_enter(&ill->ill_lock);
787 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
788 			mrec_t *rp;
789 			mcast_record_t rtype;
790 			/*
791 			 * The possible state changes we need to handle here:
792 			 *   Old State	New State	Report
793 			 *
794 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
795 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
796 			 *
797 			 * No need to send the BLOCK(0) report; ALLOW(X)
798 			 * is enough.
799 			 */
800 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
801 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
802 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
803 			    ilm->ilm_filter, NULL);
804 			mutex_exit(&ill->ill_lock);
805 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
806 			mutex_enter(&ill->ill_lock);
807 			/*
808 			 * Set up retransmission state.  Timer is set below,
809 			 * for both v3 and older versions.
810 			 */
811 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
812 			    ilm->ilm_filter);
813 		}
814 
815 		/* Set the ilm timer value */
816 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
817 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
818 		timer = ilm->ilm_rtx.rtx_timer;
819 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
820 		ilm->ilm_state = IGMP_IREPORTEDLAST;
821 		mutex_exit(&ill->ill_lock);
822 
823 		/*
824 		 * To avoid deadlock, we don't call igmp_start_timers from
825 		 * here. igmp_start_timers needs to call untimeout, and we
826 		 * can't hold the ipsq across untimeout since
827 		 * igmp_timeout_handler could be blocking trying to
828 		 * acquire the ipsq. Instead we start the timer after we get
829 		 * out of the ipsq in ipsq_exit.
830 		 */
831 		mutex_enter(&ipst->ips_igmp_timer_lock);
832 		ipst->ips_igmp_deferred_next = MIN(timer,
833 		    ipst->ips_igmp_deferred_next);
834 		mutex_exit(&ipst->ips_igmp_timer_lock);
835 	}
836 
837 	if (ip_debug > 1) {
838 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
839 		    "igmp_joingroup: multicast_type %d timer %d",
840 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
841 		    (int)ntohl(timer));
842 	}
843 }
844 
845 void
846 mld_joingroup(ilm_t *ilm)
847 {
848 	uint_t	timer;
849 	ill_t	*ill;
850 	ip_stack_t	*ipst = ilm->ilm_ipst;
851 
852 	ill = ilm->ilm_ill;
853 
854 	ASSERT(IAM_WRITER_ILL(ill));
855 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
856 
857 	mutex_enter(&ill->ill_lock);
858 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
859 		ilm->ilm_rtx.rtx_timer = INFINITY;
860 		ilm->ilm_state = IGMP_OTHERMEMBER;
861 		mutex_exit(&ill->ill_lock);
862 	} else {
863 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
864 			mutex_exit(&ill->ill_lock);
865 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
866 			mutex_enter(&ill->ill_lock);
867 		} else {
868 			mrec_t *rp;
869 			mcast_record_t rtype;
870 			/*
871 			 * The possible state changes we need to handle here:
872 			 *	Old State   New State	Report
873 			 *
874 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
875 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
876 			 *
877 			 * No need to send the BLOCK(0) report; ALLOW(X)
878 			 * is enough
879 			 */
880 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
881 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
882 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
883 			    ilm->ilm_filter, NULL);
884 			mutex_exit(&ill->ill_lock);
885 			mldv2_sendrpt(ill, rp);
886 			mutex_enter(&ill->ill_lock);
887 			/*
888 			 * Set up retransmission state.  Timer is set below,
889 			 * for both v2 and v1.
890 			 */
891 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
892 			    ilm->ilm_filter);
893 		}
894 
895 		/* Set the ilm timer value */
896 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
897 		    ilm->ilm_rtx.rtx_cnt > 0);
898 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
899 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
900 		timer = ilm->ilm_rtx.rtx_timer;
901 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
902 		ilm->ilm_state = IGMP_IREPORTEDLAST;
903 		mutex_exit(&ill->ill_lock);
904 
905 		/*
906 		 * To avoid deadlock, we don't call mld_start_timers from
907 		 * here. mld_start_timers needs to call untimeout, and we
908 		 * can't hold the ipsq (i.e. the lock) across untimeout
909 		 * since mld_timeout_handler could be blocking trying to
910 		 * acquire the ipsq. Instead we start the timer after we get
911 		 * out of the ipsq in ipsq_exit
912 		 */
913 		mutex_enter(&ipst->ips_mld_timer_lock);
914 		ipst->ips_mld_deferred_next = MIN(timer,
915 		    ipst->ips_mld_deferred_next);
916 		mutex_exit(&ipst->ips_mld_timer_lock);
917 	}
918 
919 	if (ip_debug > 1) {
920 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
921 		    "mld_joingroup: multicast_type %d timer %d",
922 		    (ilm->ilm_ill->ill_mcast_type),
923 		    (int)ntohl(timer));
924 	}
925 }
926 
927 void
928 igmp_leavegroup(ilm_t *ilm)
929 {
930 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
931 
932 	ASSERT(ilm->ilm_ill == NULL);
933 	ASSERT(!ill->ill_isv6);
934 
935 	mutex_enter(&ill->ill_lock);
936 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
937 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
938 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
939 		mutex_exit(&ill->ill_lock);
940 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
941 		    (htonl(INADDR_ALLRTRS_GROUP)));
942 		return;
943 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
944 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
945 		mrec_t *rp;
946 		/*
947 		 * The possible state changes we need to handle here:
948 		 *	Old State	New State	Report
949 		 *
950 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
951 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
952 		 *
953 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
954 		 */
955 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
956 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
957 			    ilm->ilm_filter, NULL);
958 		} else {
959 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
960 			    NULL, NULL);
961 		}
962 		mutex_exit(&ill->ill_lock);
963 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
964 		return;
965 	}
966 	mutex_exit(&ill->ill_lock);
967 }
968 
969 void
970 mld_leavegroup(ilm_t *ilm)
971 {
972 	ill_t *ill = ilm->ilm_ill;
973 
974 	ASSERT(ilm->ilm_ipif == NULL);
975 	ASSERT(ill->ill_isv6);
976 
977 	mutex_enter(&ill->ill_lock);
978 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
979 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
980 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
981 		mutex_exit(&ill->ill_lock);
982 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
983 		return;
984 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
985 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
986 		mrec_t *rp;
987 		/*
988 		 * The possible state changes we need to handle here:
989 		 *	Old State	New State	Report
990 		 *
991 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
992 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
993 		 *
994 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
995 		 */
996 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
997 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
998 			    ilm->ilm_filter, NULL);
999 		} else {
1000 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
1001 			    NULL, NULL);
1002 		}
1003 		mutex_exit(&ill->ill_lock);
1004 		mldv2_sendrpt(ill, rp);
1005 		return;
1006 	}
1007 	mutex_exit(&ill->ill_lock);
1008 }
1009 
1010 void
1011 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1012 {
1013 	ill_t *ill;
1014 	mrec_t *rp;
1015 	ip_stack_t	*ipst = ilm->ilm_ipst;
1016 
1017 	ASSERT(ilm != NULL);
1018 
1019 	/* state change reports should only be sent if the router is v3 */
1020 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
1021 		return;
1022 
1023 	if (ilm->ilm_ill == NULL) {
1024 		ASSERT(ilm->ilm_ipif != NULL);
1025 		ill = ilm->ilm_ipif->ipif_ill;
1026 	} else {
1027 		ill = ilm->ilm_ill;
1028 	}
1029 
1030 	mutex_enter(&ill->ill_lock);
1031 
1032 	/*
1033 	 * Compare existing(old) state with the new state and prepare
1034 	 * State Change Report, according to the rules in RFC 3376:
1035 	 *
1036 	 *	Old State	New State	State Change Report
1037 	 *
1038 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1039 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1040 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1041 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1042 	 */
1043 
1044 	if (ilm->ilm_fmode == fmode) {
1045 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1046 		slist_t *allow, *block;
1047 		if (((a_minus_b = l_alloc()) == NULL) ||
1048 		    ((b_minus_a = l_alloc()) == NULL)) {
1049 			l_free(a_minus_b);
1050 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1051 				goto send_to_ex;
1052 			else
1053 				goto send_to_in;
1054 		}
1055 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1056 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1057 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1058 			allow = b_minus_a;
1059 			block = a_minus_b;
1060 		} else {
1061 			allow = a_minus_b;
1062 			block = b_minus_a;
1063 		}
1064 		rp = NULL;
1065 		if (!SLIST_IS_EMPTY(allow))
1066 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1067 			    allow, rp);
1068 		if (!SLIST_IS_EMPTY(block))
1069 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1070 			    block, rp);
1071 		l_free(a_minus_b);
1072 		l_free(b_minus_a);
1073 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1074 send_to_ex:
1075 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1076 		    NULL);
1077 	} else {
1078 send_to_in:
1079 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1080 		    NULL);
1081 	}
1082 
1083 	/*
1084 	 * Need to set up retransmission state; merge the new info with the
1085 	 * current state (which may be null).  If the timer is not currently
1086 	 * running, start it (need to do a delayed start of the timer as
1087 	 * we're currently in the sq).
1088 	 */
1089 	rp = mcast_merge_rtx(ilm, rp, flist);
1090 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1091 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1092 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1093 		mutex_enter(&ipst->ips_igmp_timer_lock);
1094 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1095 		    ilm->ilm_rtx.rtx_timer);
1096 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1097 		mutex_exit(&ipst->ips_igmp_timer_lock);
1098 	}
1099 
1100 	mutex_exit(&ill->ill_lock);
1101 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1102 }
1103 
1104 void
1105 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1106 {
1107 	ill_t *ill;
1108 	mrec_t *rp = NULL;
1109 	ip_stack_t	*ipst = ilm->ilm_ipst;
1110 
1111 	ASSERT(ilm != NULL);
1112 
1113 	ill = ilm->ilm_ill;
1114 
1115 	/* only need to send if we have an mldv2-capable router */
1116 	mutex_enter(&ill->ill_lock);
1117 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1118 		mutex_exit(&ill->ill_lock);
1119 		return;
1120 	}
1121 
1122 	/*
1123 	 * Compare existing (old) state with the new state passed in
1124 	 * and send appropriate MLDv2 State Change Report.
1125 	 *
1126 	 *	Old State	New State	State Change Report
1127 	 *
1128 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1129 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1130 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1131 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1132 	 */
1133 	if (ilm->ilm_fmode == fmode) {
1134 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1135 		slist_t *allow, *block;
1136 		if (((a_minus_b = l_alloc()) == NULL) ||
1137 		    ((b_minus_a = l_alloc()) == NULL)) {
1138 			l_free(a_minus_b);
1139 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1140 				goto send_to_ex;
1141 			else
1142 				goto send_to_in;
1143 		}
1144 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1145 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1146 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1147 			allow = b_minus_a;
1148 			block = a_minus_b;
1149 		} else {
1150 			allow = a_minus_b;
1151 			block = b_minus_a;
1152 		}
1153 		if (!SLIST_IS_EMPTY(allow))
1154 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1155 			    allow, rp);
1156 		if (!SLIST_IS_EMPTY(block))
1157 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1158 			    block, rp);
1159 		l_free(a_minus_b);
1160 		l_free(b_minus_a);
1161 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1162 send_to_ex:
1163 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1164 		    NULL);
1165 	} else {
1166 send_to_in:
1167 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1168 		    NULL);
1169 	}
1170 
1171 	/*
1172 	 * Need to set up retransmission state; merge the new info with the
1173 	 * current state (which may be null).  If the timer is not currently
1174 	 * running, start it (need to do a deferred start of the timer as
1175 	 * we're currently in the sq).
1176 	 */
1177 	rp = mcast_merge_rtx(ilm, rp, flist);
1178 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1179 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1180 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1181 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1182 		mutex_enter(&ipst->ips_mld_timer_lock);
1183 		ipst->ips_mld_deferred_next =
1184 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1185 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1186 		mutex_exit(&ipst->ips_mld_timer_lock);
1187 	}
1188 
1189 	mutex_exit(&ill->ill_lock);
1190 	mldv2_sendrpt(ill, rp);
1191 }
1192 
1193 uint_t
1194 igmp_timeout_handler_per_ill(ill_t *ill)
1195 {
1196 	uint_t	next = INFINITY, current;
1197 	ilm_t	*ilm;
1198 	ipif_t	*ipif;
1199 	mrec_t	*rp = NULL;
1200 	mrec_t	*rtxrp = NULL;
1201 	rtx_state_t *rtxp;
1202 	mcast_record_t	rtype;
1203 
1204 	ASSERT(IAM_WRITER_ILL(ill));
1205 
1206 	mutex_enter(&ill->ill_lock);
1207 
1208 	current = CURRENT_MSTIME;
1209 	/* First check the global timer on this interface */
1210 	if (ill->ill_global_timer == INFINITY)
1211 		goto per_ilm_timer;
1212 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1213 		ill->ill_global_timer = INFINITY;
1214 		/*
1215 		 * Send report for each group on this interface.
1216 		 * Since we just set the global timer (received a v3 general
1217 		 * query), need to skip the all hosts addr (224.0.0.1), per
1218 		 * RFC 3376 section 5.
1219 		 */
1220 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1221 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1222 				continue;
1223 			ASSERT(ilm->ilm_ipif != NULL);
1224 			ilm->ilm_ipif->ipif_igmp_rpt =
1225 			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1226 			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
1227 			/*
1228 			 * Since we're sending a report on this group, okay
1229 			 * to delete pending group-specific timers.  Note
1230 			 * that group-specific retransmit timers still need
1231 			 * to be checked in the per_ilm_timer for-loop.
1232 			 */
1233 			ilm->ilm_timer = INFINITY;
1234 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1235 			FREE_SLIST(ilm->ilm_pendsrcs);
1236 			ilm->ilm_pendsrcs = NULL;
1237 		}
1238 		/*
1239 		 * We've built per-ipif mrec lists; walk the ill's ipif list
1240 		 * and send a report for each ipif that has an mrec list.
1241 		 */
1242 		for (ipif = ill->ill_ipif; ipif != NULL;
1243 		    ipif = ipif->ipif_next) {
1244 			if (ipif->ipif_igmp_rpt == NULL)
1245 				continue;
1246 			mutex_exit(&ill->ill_lock);
1247 			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
1248 			mutex_enter(&ill->ill_lock);
1249 			/* mrec list was freed by igmpv3_sendrpt() */
1250 			ipif->ipif_igmp_rpt = NULL;
1251 		}
1252 	} else {
1253 		if ((ill->ill_global_timer - current) < next)
1254 			next = ill->ill_global_timer - current;
1255 	}
1256 
1257 per_ilm_timer:
1258 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1259 		if (ilm->ilm_timer == INFINITY)
1260 			goto per_ilm_rtxtimer;
1261 
1262 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1263 			if ((ilm->ilm_timer - current) < next)
1264 				next = ilm->ilm_timer - current;
1265 
1266 			if (ip_debug > 1) {
1267 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1268 				    "igmp_timo_hlr 2: ilm_timr %d "
1269 				    "typ %d nxt %d",
1270 				    (int)ntohl(ilm->ilm_timer - current),
1271 				    (ill->ill_mcast_type), next);
1272 			}
1273 
1274 			goto per_ilm_rtxtimer;
1275 		}
1276 
1277 		/* the timer has expired, need to take action */
1278 		ilm->ilm_timer = INFINITY;
1279 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1280 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1281 			mutex_exit(&ill->ill_lock);
1282 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1283 			mutex_enter(&ill->ill_lock);
1284 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1285 			mutex_exit(&ill->ill_lock);
1286 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1287 			mutex_enter(&ill->ill_lock);
1288 		} else {
1289 			slist_t *rsp;
1290 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1291 			    (rsp = l_alloc()) != NULL) {
1292 				/*
1293 				 * Contents of reply depend on pending
1294 				 * requested source list.
1295 				 */
1296 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1297 					l_intersection(ilm->ilm_filter,
1298 					    ilm->ilm_pendsrcs, rsp);
1299 				} else {
1300 					l_difference(ilm->ilm_pendsrcs,
1301 					    ilm->ilm_filter, rsp);
1302 				}
1303 				FREE_SLIST(ilm->ilm_pendsrcs);
1304 				ilm->ilm_pendsrcs = NULL;
1305 				if (!SLIST_IS_EMPTY(rsp))
1306 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1307 					    &ilm->ilm_v6addr, rsp, rp);
1308 				FREE_SLIST(rsp);
1309 			} else {
1310 				/*
1311 				 * Either the pending request is just group-
1312 				 * specific, or we couldn't get the resources
1313 				 * (rsp) to build a source-specific reply.
1314 				 */
1315 				rp = mcast_bldmrec(ilm->ilm_fmode,
1316 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1317 			}
1318 			mutex_exit(&ill->ill_lock);
1319 			igmpv3_sendrpt(ill->ill_ipif, rp);
1320 			mutex_enter(&ill->ill_lock);
1321 			rp = NULL;
1322 		}
1323 
1324 per_ilm_rtxtimer:
1325 		rtxp = &ilm->ilm_rtx;
1326 
1327 		if (rtxp->rtx_timer == INFINITY)
1328 			continue;
1329 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1330 			if ((rtxp->rtx_timer - current) < next)
1331 				next = rtxp->rtx_timer - current;
1332 			continue;
1333 		}
1334 
1335 		rtxp->rtx_timer = INFINITY;
1336 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1337 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1338 			mutex_exit(&ill->ill_lock);
1339 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1340 			mutex_enter(&ill->ill_lock);
1341 			continue;
1342 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1343 			mutex_exit(&ill->ill_lock);
1344 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1345 			mutex_enter(&ill->ill_lock);
1346 			continue;
1347 		}
1348 
1349 		/*
1350 		 * The retransmit timer has popped, and our router is
1351 		 * IGMPv3.  We have to delve into the retransmit state
1352 		 * stored in the ilm.
1353 		 *
1354 		 * Decrement the retransmit count.  If the fmode rtx
1355 		 * count is active, decrement it, and send a filter
1356 		 * mode change report with the ilm's source list.
1357 		 * Otherwise, send a source list change report with
1358 		 * the current retransmit lists.
1359 		 */
1360 		ASSERT(rtxp->rtx_cnt > 0);
1361 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1362 		rtxp->rtx_cnt--;
1363 		if (rtxp->rtx_fmode_cnt > 0) {
1364 			rtxp->rtx_fmode_cnt--;
1365 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1366 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1367 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1368 			    ilm->ilm_filter, rtxrp);
1369 		} else {
1370 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1371 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1372 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1373 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1374 		}
1375 		if (rtxp->rtx_cnt > 0) {
1376 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1377 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1378 			if (rtxp->rtx_timer < next)
1379 				next = rtxp->rtx_timer;
1380 			rtxp->rtx_timer += current;
1381 		} else {
1382 			CLEAR_SLIST(rtxp->rtx_allow);
1383 			CLEAR_SLIST(rtxp->rtx_block);
1384 		}
1385 		mutex_exit(&ill->ill_lock);
1386 		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
1387 		mutex_enter(&ill->ill_lock);
1388 		rtxrp = NULL;
1389 	}
1390 
1391 	mutex_exit(&ill->ill_lock);
1392 
1393 	return (next);
1394 }
1395 
1396 /*
1397  * igmp_timeout_handler:
1398  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1399  * Returns number of ticks to next event (or 0 if none).
1400  *
1401  * As part of multicast join and leave igmp we may need to send out an
1402  * igmp request. The igmp related state variables in the ilm are protected
1403  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1404  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1405  * starts the igmp timer if needed. It serializes multiple threads trying to
1406  * simultaneously start the timer using the igmp_timer_setter_active flag.
1407  *
1408  * igmp_input() receives igmp queries and responds to the queries
1409  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1410  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1411  * performs the action exclusively after entering each ill's ipsq as writer.
1412  * The actual igmp timeout handler needs to run in the ipsq since it has to
1413  * access the ilm's and we don't want another exclusive operation like
1414  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1415  * another.
1416  *
1417  * The igmp_slowtimeo() function is called thru another timer.
1418  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1419  */
1420 void
1421 igmp_timeout_handler(void *arg)
1422 {
1423 	ill_t	*ill;
1424 	uint_t  global_next = INFINITY;
1425 	uint_t  next;
1426 	ill_walk_context_t ctx;
1427 	boolean_t success;
1428 	ip_stack_t *ipst = (ip_stack_t *)arg;
1429 
1430 	ASSERT(arg != NULL);
1431 	mutex_enter(&ipst->ips_igmp_timer_lock);
1432 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1433 	ipst->ips_igmp_timer_scheduled_last = 0;
1434 	ipst->ips_igmp_time_to_next = 0;
1435 	mutex_exit(&ipst->ips_igmp_timer_lock);
1436 
1437 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1438 	ill = ILL_START_WALK_V4(&ctx, ipst);
1439 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1440 		ASSERT(!ill->ill_isv6);
1441 		/*
1442 		 * We may not be able to refhold the ill if the ill/ipif
1443 		 * is changing. But we need to make sure that the ill will
1444 		 * not vanish. So we just bump up the ill_waiter count.
1445 		 */
1446 		if (!ill_waiter_inc(ill))
1447 			continue;
1448 		rw_exit(&ipst->ips_ill_g_lock);
1449 		success = ipsq_enter(ill, B_TRUE);
1450 		if (success) {
1451 			next = igmp_timeout_handler_per_ill(ill);
1452 			if (next < global_next)
1453 				global_next = next;
1454 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_FALSE,
1455 			    B_TRUE);
1456 		}
1457 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1458 		ill_waiter_dcr(ill);
1459 	}
1460 	rw_exit(&ipst->ips_ill_g_lock);
1461 
1462 	mutex_enter(&ipst->ips_igmp_timer_lock);
1463 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1464 	ipst->ips_igmp_timeout_id = 0;
1465 	mutex_exit(&ipst->ips_igmp_timer_lock);
1466 
1467 	if (global_next != INFINITY)
1468 		igmp_start_timers(global_next, ipst);
1469 }
1470 
1471 /*
1472  * mld_timeout_handler:
1473  * Called when there are timeout events, every next (tick).
1474  * Returns number of ticks to next event (or 0 if none).
1475  */
1476 /* ARGSUSED */
1477 uint_t
1478 mld_timeout_handler_per_ill(ill_t *ill)
1479 {
1480 	ilm_t 	*ilm;
1481 	uint_t	next = INFINITY, current;
1482 	mrec_t	*rp, *rtxrp;
1483 	rtx_state_t *rtxp;
1484 	mcast_record_t	rtype;
1485 
1486 	ASSERT(IAM_WRITER_ILL(ill));
1487 
1488 	mutex_enter(&ill->ill_lock);
1489 
1490 	current = CURRENT_MSTIME;
1491 	/*
1492 	 * First check the global timer on this interface; the global timer
1493 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1494 	 */
1495 	if (ill->ill_global_timer == INFINITY)
1496 		goto per_ilm_timer;
1497 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1498 		ill->ill_global_timer = INFINITY;
1499 		/*
1500 		 * Send report for each group on this interface.
1501 		 * Since we just set the global timer (received a v2 general
1502 		 * query), need to skip the all hosts addr (ff02::1), per
1503 		 * RFC 3810 section 6.
1504 		 */
1505 		rp = NULL;
1506 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1507 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1508 			    &ipv6_all_hosts_mcast))
1509 				continue;
1510 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1511 			    ilm->ilm_filter, rp);
1512 			/*
1513 			 * Since we're sending a report on this group, okay
1514 			 * to delete pending group-specific timers.  Note
1515 			 * that group-specific retransmit timers still need
1516 			 * to be checked in the per_ilm_timer for-loop.
1517 			 */
1518 			ilm->ilm_timer = INFINITY;
1519 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1520 			FREE_SLIST(ilm->ilm_pendsrcs);
1521 			ilm->ilm_pendsrcs = NULL;
1522 		}
1523 		mutex_exit(&ill->ill_lock);
1524 		mldv2_sendrpt(ill, rp);
1525 		mutex_enter(&ill->ill_lock);
1526 	} else {
1527 		if ((ill->ill_global_timer - current) < next)
1528 			next = ill->ill_global_timer - current;
1529 	}
1530 
1531 per_ilm_timer:
1532 	rp = rtxrp = NULL;
1533 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1534 		if (ilm->ilm_timer == INFINITY)
1535 			goto per_ilm_rtxtimer;
1536 
1537 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1538 			if ((ilm->ilm_timer - current) < next)
1539 				next = ilm->ilm_timer - current;
1540 
1541 			if (ip_debug > 1) {
1542 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1543 				    "igmp_timo_hlr 2: ilm_timr"
1544 				    " %d typ %d nxt %d",
1545 				    (int)ntohl(ilm->ilm_timer - current),
1546 				    (ill->ill_mcast_type), next);
1547 			}
1548 
1549 			goto per_ilm_rtxtimer;
1550 		}
1551 
1552 		/* the timer has expired, need to take action */
1553 		ilm->ilm_timer = INFINITY;
1554 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1555 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1556 			mutex_exit(&ill->ill_lock);
1557 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1558 			mutex_enter(&ill->ill_lock);
1559 		} else {
1560 			slist_t *rsp;
1561 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1562 			    (rsp = l_alloc()) != NULL) {
1563 				/*
1564 				 * Contents of reply depend on pending
1565 				 * requested source list.
1566 				 */
1567 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1568 					l_intersection(ilm->ilm_filter,
1569 					    ilm->ilm_pendsrcs, rsp);
1570 				} else {
1571 					l_difference(ilm->ilm_pendsrcs,
1572 					    ilm->ilm_filter, rsp);
1573 				}
1574 				FREE_SLIST(ilm->ilm_pendsrcs);
1575 				ilm->ilm_pendsrcs = NULL;
1576 				if (!SLIST_IS_EMPTY(rsp))
1577 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1578 					    &ilm->ilm_v6addr, rsp, rp);
1579 				FREE_SLIST(rsp);
1580 			} else {
1581 				rp = mcast_bldmrec(ilm->ilm_fmode,
1582 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1583 			}
1584 		}
1585 
1586 per_ilm_rtxtimer:
1587 		rtxp = &ilm->ilm_rtx;
1588 
1589 		if (rtxp->rtx_timer == INFINITY)
1590 			continue;
1591 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1592 			if ((rtxp->rtx_timer - current) < next)
1593 				next = rtxp->rtx_timer - current;
1594 			continue;
1595 		}
1596 
1597 		rtxp->rtx_timer = INFINITY;
1598 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1599 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1600 			mutex_exit(&ill->ill_lock);
1601 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1602 			mutex_enter(&ill->ill_lock);
1603 			continue;
1604 		}
1605 
1606 		/*
1607 		 * The retransmit timer has popped, and our router is
1608 		 * MLDv2.  We have to delve into the retransmit state
1609 		 * stored in the ilm.
1610 		 *
1611 		 * Decrement the retransmit count.  If the fmode rtx
1612 		 * count is active, decrement it, and send a filter
1613 		 * mode change report with the ilm's source list.
1614 		 * Otherwise, send a source list change report with
1615 		 * the current retransmit lists.
1616 		 */
1617 		ASSERT(rtxp->rtx_cnt > 0);
1618 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1619 		rtxp->rtx_cnt--;
1620 		if (rtxp->rtx_fmode_cnt > 0) {
1621 			rtxp->rtx_fmode_cnt--;
1622 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1623 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1624 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1625 			    ilm->ilm_filter, rtxrp);
1626 		} else {
1627 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1628 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1629 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1630 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1631 		}
1632 		if (rtxp->rtx_cnt > 0) {
1633 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1634 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1635 			if (rtxp->rtx_timer < next)
1636 				next = rtxp->rtx_timer;
1637 			rtxp->rtx_timer += current;
1638 		} else {
1639 			CLEAR_SLIST(rtxp->rtx_allow);
1640 			CLEAR_SLIST(rtxp->rtx_block);
1641 		}
1642 	}
1643 
1644 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1645 		mutex_exit(&ill->ill_lock);
1646 		mldv2_sendrpt(ill, rp);
1647 		mldv2_sendrpt(ill, rtxrp);
1648 		return (next);
1649 	}
1650 
1651 	mutex_exit(&ill->ill_lock);
1652 
1653 	return (next);
1654 }
1655 
1656 /*
1657  * mld_timeout_handler:
1658  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1659  * Returns number of ticks to next event (or 0 if none).
1660  * MT issues are same as igmp_timeout_handler
1661  */
1662 void
1663 mld_timeout_handler(void *arg)
1664 {
1665 	ill_t	*ill;
1666 	uint_t  global_next = INFINITY;
1667 	uint_t  next;
1668 	ill_walk_context_t ctx;
1669 	boolean_t success;
1670 	ip_stack_t *ipst = (ip_stack_t *)arg;
1671 
1672 	ASSERT(arg != NULL);
1673 	mutex_enter(&ipst->ips_mld_timer_lock);
1674 	ASSERT(ipst->ips_mld_timeout_id != 0);
1675 	ipst->ips_mld_timer_scheduled_last = 0;
1676 	ipst->ips_mld_time_to_next = 0;
1677 	mutex_exit(&ipst->ips_mld_timer_lock);
1678 
1679 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1680 	ill = ILL_START_WALK_V6(&ctx, ipst);
1681 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1682 		ASSERT(ill->ill_isv6);
1683 		/*
1684 		 * We may not be able to refhold the ill if the ill/ipif
1685 		 * is changing. But we need to make sure that the ill will
1686 		 * not vanish. So we just bump up the ill_waiter count.
1687 		 */
1688 		if (!ill_waiter_inc(ill))
1689 			continue;
1690 		rw_exit(&ipst->ips_ill_g_lock);
1691 		success = ipsq_enter(ill, B_TRUE);
1692 		if (success) {
1693 			next = mld_timeout_handler_per_ill(ill);
1694 			if (next < global_next)
1695 				global_next = next;
1696 			ipsq_exit(ill->ill_phyint->phyint_ipsq, B_TRUE,
1697 			    B_FALSE);
1698 		}
1699 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1700 		ill_waiter_dcr(ill);
1701 	}
1702 	rw_exit(&ipst->ips_ill_g_lock);
1703 
1704 	mutex_enter(&ipst->ips_mld_timer_lock);
1705 	ASSERT(ipst->ips_mld_timeout_id != 0);
1706 	ipst->ips_mld_timeout_id = 0;
1707 	mutex_exit(&ipst->ips_mld_timer_lock);
1708 
1709 	if (global_next != INFINITY)
1710 		mld_start_timers(global_next, ipst);
1711 }
1712 
/*
 * Older Version Querier Present timeout for the given ill, expressed as a
 * count of slowtimo intervals:
 *
 *	1000 * (ill_mcast_rv * ill_mcast_qi + MCAST_QUERY_RESP_INTERVAL)
 *	    / MCAST_SLOWTIMO_INTERVAL
 *
 * (The factor of 1000 presumably converts seconds to milliseconds to match
 * the units of MCAST_SLOWTIMO_INTERVAL -- confirm against its definition.)
 * The argument is evaluated twice, so it must be free of side effects.
 */
#define	OVQP(ill)							\
	((1000 *							\
	    ((ill)->ill_mcast_rv * (ill)->ill_mcast_qi +		\
	    MCAST_QUERY_RESP_INTERVAL)) /				\
	    MCAST_SLOWTIMO_INTERVAL)
1720 
1721 /*
1722  * igmp_slowtimo:
1723  * - Resets to new router if we didnt we hear from the router
1724  *   in IGMP_AGE_THRESHOLD seconds.
1725  * - Resets slowtimeout.
1726  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1727  * IGMP version than configured.
1728  */
1729 void
1730 igmp_slowtimo(void *arg)
1731 {
1732 	ill_t	*ill;
1733 	ill_if_t *ifp;
1734 	avl_tree_t *avl_tree;
1735 	ip_stack_t *ipst = (ip_stack_t *)arg;
1736 
1737 	ASSERT(arg != NULL);
1738 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1739 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1740 
1741 	/*
1742 	 * The ill_if_t list is circular, hence the odd loop parameters.
1743 	 *
1744 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1745 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1746 	 * structure (allowing us to skip if none of the instances have timers
1747 	 * running).
1748 	 */
1749 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1750 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1751 	    ifp = ifp->illif_next) {
1752 		/*
1753 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1754 		 * a V1 or V2 query now and we miss seeing the count now,
1755 		 * we will see it the next time igmp_slowtimo is called.
1756 		 */
1757 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1758 			continue;
1759 
1760 		avl_tree = &ifp->illif_avl_by_ppa;
1761 		for (ill = avl_first(avl_tree); ill != NULL;
1762 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1763 			mutex_enter(&ill->ill_lock);
1764 			if (ill->ill_mcast_v1_tset == 1)
1765 				ill->ill_mcast_v1_time++;
1766 			if (ill->ill_mcast_v2_tset == 1)
1767 				ill->ill_mcast_v2_time++;
1768 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1769 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1770 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1771 				if ((ill->ill_mcast_v2_tset > 0) ||
1772 				    (ipst->ips_igmp_max_version ==
1773 				    IGMP_V2_ROUTER)) {
1774 					ip1dbg(("V1 query timer "
1775 					    "expired on %s; switching "
1776 					    "mode to IGMP_V2\n",
1777 					    ill->ill_name));
1778 					ill->ill_mcast_type =
1779 					    IGMP_V2_ROUTER;
1780 				} else {
1781 					ip1dbg(("V1 query timer "
1782 					    "expired on %s; switching "
1783 					    "mode to IGMP_V3\n",
1784 					    ill->ill_name));
1785 					ill->ill_mcast_type =
1786 					    IGMP_V3_ROUTER;
1787 				}
1788 				ill->ill_mcast_v1_time = 0;
1789 				ill->ill_mcast_v1_tset = 0;
1790 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1791 			}
1792 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1793 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1794 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1795 				ip1dbg(("V2 query timer expired on "
1796 				    "%s; switching mode to IGMP_V3\n",
1797 				    ill->ill_name));
1798 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1799 				ill->ill_mcast_v2_time = 0;
1800 				ill->ill_mcast_v2_tset = 0;
1801 				atomic_add_16(&ifp->illif_mcast_v2, -1);
1802 			}
1803 			mutex_exit(&ill->ill_lock);
1804 		}
1805 	}
1806 	rw_exit(&ipst->ips_ill_g_lock);
1807 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1808 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1809 		MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1810 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1811 }
1812 
1813 /*
1814  * mld_slowtimo:
1815  * - Resets to newer version if we didn't hear from the older version router
1816  *   in MLD_AGE_THRESHOLD seconds.
1817  * - Restarts slowtimeout.
1818  * Check for ips_mld_max_version ensures that we don't revert to a higher
1819  * IGMP version than configured.
1820  */
1821 /* ARGSUSED */
1822 void
1823 mld_slowtimo(void *arg)
1824 {
1825 	ill_t *ill;
1826 	ill_if_t *ifp;
1827 	avl_tree_t *avl_tree;
1828 	ip_stack_t *ipst = (ip_stack_t *)arg;
1829 
1830 	ASSERT(arg != NULL);
1831 	/* See comments in igmp_slowtimo() above... */
1832 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1833 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1834 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1835 	    ifp = ifp->illif_next) {
1836 		if (ifp->illif_mcast_v1 == 0)
1837 			continue;
1838 
1839 		avl_tree = &ifp->illif_avl_by_ppa;
1840 		for (ill = avl_first(avl_tree); ill != NULL;
1841 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1842 			mutex_enter(&ill->ill_lock);
1843 			if (ill->ill_mcast_v1_tset == 1)
1844 				ill->ill_mcast_v1_time++;
1845 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1846 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1847 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1848 				ip1dbg(("MLD query timer expired on"
1849 				    " %s; switching mode to MLD_V2\n",
1850 				    ill->ill_name));
1851 				ill->ill_mcast_type = MLD_V2_ROUTER;
1852 				ill->ill_mcast_v1_time = 0;
1853 				ill->ill_mcast_v1_tset = 0;
1854 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1855 			}
1856 			mutex_exit(&ill->ill_lock);
1857 		}
1858 	}
1859 	rw_exit(&ipst->ips_ill_g_lock);
1860 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1861 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1862 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1863 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1864 }
1865 
1866 /*
1867  * igmp_sendpkt:
1868  * This will send to ip_wput like icmp_inbound.
1869  * Note that the lower ill (on which the membership is kept) is used
1870  * as an upper ill to pass in the multicast parameters.
1871  */
1872 static void
1873 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1874 {
1875 	mblk_t	*mp;
1876 	igmpa_t	*igmpa;
1877 	uint8_t *rtralert;
1878 	ipha_t	*ipha;
1879 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1880 	size_t	size  = hdrlen + sizeof (igmpa_t);
1881 	ipif_t 	*ipif = ilm->ilm_ipif;
1882 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1883 	mblk_t	*first_mp;
1884 	ipsec_out_t *io;
1885 	zoneid_t zoneid;
1886 	ip_stack_t *ipst = ill->ill_ipst;
1887 
1888 	/*
1889 	 * We need to make sure this packet goes out on an ipif. If
1890 	 * there is some global policy match in ip_wput_ire, we need
1891 	 * to get to the right interface after IPSEC processing.
1892 	 * To make sure this multicast packet goes out on the right
1893 	 * interface, we attach an ipsec_out and initialize ill_index
1894 	 * like we did in ip_wput. To make sure that this packet does
1895 	 * not get forwarded on other interfaces or looped back, we
1896 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1897 	 * to B_FALSE.
1898 	 *
1899 	 * We also need to make sure that this does not get load balanced
1900 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1901 	 * here. If it gets load balanced, switches supporting igmp snooping
1902 	 * will send the packet that it receives for this multicast group
1903 	 * to the interface that we are sending on. As we have joined the
1904 	 * multicast group on this ill, by sending the packet out on this
1905 	 * ill, we receive all the packets back on this ill.
1906 	 */
1907 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1908 	if (first_mp == NULL)
1909 		return;
1910 
1911 	first_mp->b_datap->db_type = M_CTL;
1912 	first_mp->b_wptr += sizeof (ipsec_info_t);
1913 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1914 	/* ipsec_out_secure is B_FALSE now */
1915 	io = (ipsec_out_t *)first_mp->b_rptr;
1916 	io->ipsec_out_type = IPSEC_OUT;
1917 	io->ipsec_out_len = sizeof (ipsec_out_t);
1918 	io->ipsec_out_use_global_policy = B_TRUE;
1919 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1920 	io->ipsec_out_attach_if = B_TRUE;
1921 	io->ipsec_out_multicast_loop = B_FALSE;
1922 	io->ipsec_out_dontroute = B_TRUE;
1923 	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
1924 		zoneid = GLOBAL_ZONEID;
1925 	io->ipsec_out_zoneid = zoneid;
1926 	io->ipsec_out_ns = ipst->ips_netstack;	/* No netstack_hold */
1927 
1928 	mp = allocb(size, BPRI_HI);
1929 	if (mp == NULL) {
1930 		freemsg(first_mp);
1931 		return;
1932 	}
1933 	mp->b_wptr = mp->b_rptr + size;
1934 	first_mp->b_cont = mp;
1935 
1936 	ipha = (ipha_t *)mp->b_rptr;
1937 	rtralert = (uint8_t *)&(ipha[1]);
1938 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1939 	igmpa->igmpa_type   = type;
1940 	igmpa->igmpa_code   = 0;
1941 	igmpa->igmpa_group  = ilm->ilm_addr;
1942 	igmpa->igmpa_cksum  = 0;
1943 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1944 
1945 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1946 	rtralert[1] = RTRALERT_LEN;
1947 	rtralert[2] = 0;
1948 	rtralert[3] = 0;
1949 
1950 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1951 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1952 	ipha->ipha_type_of_service 	= 0;
1953 	ipha->ipha_length = htons(size);
1954 	ipha->ipha_ident = 0;
1955 	ipha->ipha_fragment_offset_and_flags = 0;
1956 	ipha->ipha_ttl 		= IGMP_TTL;
1957 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1958 	ipha->ipha_hdr_checksum 	= 0;
1959 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1960 	ipha->ipha_src 		= ipif->ipif_src_addr;
1961 	/*
1962 	 * Request loopback of the report if we are acting as a multicast
1963 	 * router, so that the process-level routing demon can hear it.
1964 	 */
1965 	/*
1966 	 * This will run multiple times for the same group if there are members
1967 	 * on the same group for multiple ipif's on the same ill. The
1968 	 * igmp_input code will suppress this due to the loopback thus we
1969 	 * always loopback membership report.
1970 	 */
1971 	ASSERT(ill->ill_rq != NULL);
1972 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1973 
1974 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
1975 
1976 	++ipst->ips_igmpstat.igps_snd_reports;
1977 }
1978 
1979 /*
1980  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1981  * with the passed-in ipif.  The report will contain one group record
1982  * for each element of reclist.  If this causes packet length to
1983  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1984  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1985  * and those buffers are freed here.
1986  */
1987 static void
1988 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1989 {
1990 	ipsec_out_t *io;
1991 	igmp3ra_t *igmp3ra;
1992 	grphdra_t *grphdr;
1993 	mblk_t *first_mp, *mp;
1994 	ipha_t *ipha;
1995 	uint8_t *rtralert;
1996 	ipaddr_t *src_array;
1997 	int i, j, numrec, more_src_cnt;
1998 	size_t hdrsize, size, rsize;
1999 	ill_t *ill = ipif->ipif_ill;
2000 	mrec_t *rp, *cur_reclist;
2001 	mrec_t *next_reclist = reclist;
2002 	boolean_t morepkts;
2003 	zoneid_t zoneid;
2004 	ip_stack_t	 *ipst = ill->ill_ipst;
2005 
2006 	/* if there aren't any records, there's nothing to send */
2007 	if (reclist == NULL)
2008 		return;
2009 
2010 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
2011 nextpkt:
2012 	size = hdrsize + sizeof (igmp3ra_t);
2013 	morepkts = B_FALSE;
2014 	more_src_cnt = 0;
2015 	cur_reclist = next_reclist;
2016 	numrec = 0;
2017 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2018 		rsize = sizeof (grphdra_t) +
2019 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
2020 		if (size + rsize > ill->ill_max_frag) {
2021 			if (rp == cur_reclist) {
2022 				/*
2023 				 * If the first mrec we looked at is too big
2024 				 * to fit in a single packet (i.e the source
2025 				 * list is too big), we must either truncate
2026 				 * the list (if TO_EX or IS_EX), or send
2027 				 * multiple reports for the same group (all
2028 				 * other types).
2029 				 */
2030 				int srcspace, srcsperpkt;
2031 				srcspace = ill->ill_max_frag - (size +
2032 				    sizeof (grphdra_t));
2033 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2034 				/*
2035 				 * Increment size and numrec, because we will
2036 				 * be sending a record for the mrec we're
2037 				 * looking at now.
2038 				 */
2039 				size += sizeof (grphdra_t) +
2040 				    (srcsperpkt * sizeof (ipaddr_t));
2041 				numrec++;
2042 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2043 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2044 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2045 					if (rp->mrec_next == NULL) {
2046 						/* no more packets to send */
2047 						break;
2048 					} else {
2049 						/*
2050 						 * more packets, but we're
2051 						 * done with this mrec.
2052 						 */
2053 						next_reclist = rp->mrec_next;
2054 					}
2055 				} else {
2056 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2057 					    - srcsperpkt;
2058 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2059 					/*
2060 					 * We'll fix up this mrec (remove the
2061 					 * srcs we've already sent) before
2062 					 * returning to nextpkt above.
2063 					 */
2064 					next_reclist = rp;
2065 				}
2066 			} else {
2067 				next_reclist = rp;
2068 			}
2069 			morepkts = B_TRUE;
2070 			break;
2071 		}
2072 		size += rsize;
2073 		numrec++;
2074 	}
2075 
2076 	/*
2077 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2078 	 * load balancing requirements.
2079 	 */
2080 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2081 	if (first_mp == NULL)
2082 		goto free_reclist;
2083 
2084 	first_mp->b_datap->db_type = M_CTL;
2085 	first_mp->b_wptr += sizeof (ipsec_info_t);
2086 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2087 	/* ipsec_out_secure is B_FALSE now */
2088 	io = (ipsec_out_t *)first_mp->b_rptr;
2089 	io->ipsec_out_type = IPSEC_OUT;
2090 	io->ipsec_out_len = sizeof (ipsec_out_t);
2091 	io->ipsec_out_use_global_policy = B_TRUE;
2092 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2093 	io->ipsec_out_attach_if = B_TRUE;
2094 	io->ipsec_out_multicast_loop = B_FALSE;
2095 	io->ipsec_out_dontroute = B_TRUE;
2096 	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
2097 		zoneid = GLOBAL_ZONEID;
2098 	io->ipsec_out_zoneid = zoneid;
2099 
2100 	mp = allocb(size, BPRI_HI);
2101 	if (mp == NULL) {
2102 		freemsg(first_mp);
2103 		goto free_reclist;
2104 	}
2105 	bzero((char *)mp->b_rptr, size);
2106 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2107 	first_mp->b_cont = mp;
2108 
2109 	ipha = (ipha_t *)mp->b_rptr;
2110 	rtralert = (uint8_t *)&(ipha[1]);
2111 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2112 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2113 
2114 	rp = cur_reclist;
2115 	for (i = 0; i < numrec; i++) {
2116 		grphdr->grphdra_type = rp->mrec_type;
2117 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2118 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2119 		src_array = (ipaddr_t *)&(grphdr[1]);
2120 
2121 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2122 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2123 
2124 		grphdr = (grphdra_t *)&(src_array[j]);
2125 		rp = rp->mrec_next;
2126 	}
2127 
2128 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2129 	igmp3ra->igmp3ra_numrec = htons(numrec);
2130 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2131 
2132 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2133 	rtralert[1] = RTRALERT_LEN;
2134 	rtralert[2] = 0;
2135 	rtralert[3] = 0;
2136 
2137 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2138 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2139 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2140 	ipha->ipha_length = htons(size);
2141 	ipha->ipha_ttl = IGMP_TTL;
2142 	ipha->ipha_protocol = IPPROTO_IGMP;
2143 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2144 	ipha->ipha_src = ipif->ipif_src_addr;
2145 
2146 	/*
2147 	 * Request loopback of the report if we are acting as a multicast
2148 	 * router, so that the process-level routing daemon can hear it.
2149 	 *
2150 	 * This will run multiple times for the same group if there are
2151 	 * members on the same group for multiple ipifs on the same ill.
2152 	 * The igmp_input code will suppress this due to the loopback;
2153 	 * thus we always loopback membership report.
2154 	 */
2155 	ASSERT(ill->ill_rq != NULL);
2156 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2157 
2158 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
2159 
2160 	++ipst->ips_igmpstat.igps_snd_reports;
2161 
2162 	if (morepkts) {
2163 		if (more_src_cnt > 0) {
2164 			int index, mvsize;
2165 			slist_t *sl = &next_reclist->mrec_srcs;
2166 			index = sl->sl_numsrc;
2167 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2168 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2169 			    mvsize);
2170 			sl->sl_numsrc = more_src_cnt;
2171 		}
2172 		goto nextpkt;
2173 	}
2174 
2175 free_reclist:
2176 	while (reclist != NULL) {
2177 		rp = reclist->mrec_next;
2178 		mi_free(reclist);
2179 		reclist = rp;
2180 	}
2181 }
2182 
2183 /*
2184  * mld_input:
2185  */
2186 /* ARGSUSED */
2187 void
2188 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2189 {
2190 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2191 	mld_hdr_t	*mldh;
2192 	ilm_t		*ilm;
2193 	ipif_t		*ipif;
2194 	uint16_t	hdr_length, exthdr_length;
2195 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2196 	uint_t		next;
2197 	int		mldlen;
2198 	ip_stack_t	*ipst = ill->ill_ipst;
2199 
2200 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2201 
2202 	/* Make sure the src address of the packet is link-local */
2203 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2204 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2205 		freemsg(mp);
2206 		return;
2207 	}
2208 
2209 	if (ip6h->ip6_hlim != 1) {
2210 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2211 		freemsg(mp);
2212 		return;
2213 	}
2214 
2215 	/* Get to the icmp header part */
2216 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2217 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2218 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2219 	} else {
2220 		hdr_length = IPV6_HDR_LEN;
2221 		exthdr_length = 0;
2222 	}
2223 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2224 
2225 	/* An MLD packet must at least be 24 octets to be valid */
2226 	if (mldlen < MLD_MINLEN) {
2227 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2228 		freemsg(mp);
2229 		return;
2230 	}
2231 
2232 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2233 
2234 	switch (mldh->mld_type) {
2235 	case MLD_LISTENER_QUERY:
2236 		/*
2237 		 * packet length differentiates between v1 and v2.  v1
2238 		 * query should be exactly 24 octets long; v2 is >= 28.
2239 		 */
2240 		if ((mldlen == MLD_MINLEN) ||
2241 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2242 			next = mld_query_in(mldh, ill);
2243 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2244 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2245 		} else {
2246 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2247 			freemsg(mp);
2248 			return;
2249 		}
2250 		if (next == 0) {
2251 			freemsg(mp);
2252 			return;
2253 		}
2254 
2255 		if (next != INFINITY)
2256 			mld_start_timers(next, ipst);
2257 		break;
2258 
2259 	case MLD_LISTENER_REPORT: {
2260 
2261 		ASSERT(ill->ill_ipif != NULL);
2262 		/*
2263 		 * For fast leave to work, we have to know that we are the
2264 		 * last person to send a report for this group.  Reports
2265 		 * generated by us are looped back since we could potentially
2266 		 * be a multicast router, so discard reports sourced by me.
2267 		 */
2268 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2269 		mutex_enter(&ill->ill_lock);
2270 		for (ipif = ill->ill_ipif; ipif != NULL;
2271 		    ipif = ipif->ipif_next) {
2272 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2273 			    lcladdr_ptr)) {
2274 				if (ip_debug > 1) {
2275 					char    buf1[INET6_ADDRSTRLEN];
2276 					char	buf2[INET6_ADDRSTRLEN];
2277 
2278 					(void) mi_strlog(ill->ill_rq,
2279 					    1,
2280 					    SL_TRACE,
2281 					    "mld_input: we are only "
2282 					    "member src %s ipif_local %s",
2283 					    inet_ntop(AF_INET6, lcladdr_ptr,
2284 					    buf1, sizeof (buf1)),
2285 					    inet_ntop(AF_INET6,
2286 					    &ipif->ipif_v6lcl_addr,
2287 					    buf2, sizeof (buf2)));
2288 				}
2289 				mutex_exit(&ill->ill_lock);
2290 				freemsg(mp);
2291 				return;
2292 			}
2293 		}
2294 		mutex_exit(&ill->ill_lock);
2295 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2296 
2297 		v6group_ptr = &mldh->mld_addr;
2298 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2299 			BUMP_MIB(ill->ill_icmp6_mib,
2300 			    ipv6IfIcmpInGroupMembBadReports);
2301 			freemsg(mp);
2302 			return;
2303 		}
2304 
2305 
2306 		/*
2307 		 * If we belong to the group being reported, and we are a
2308 		 * 'Delaying member' per the RFC terminology, stop our timer
2309 		 * for that group and 'clear flag' i.e. mark ilm_state as
2310 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2311 		 * membership entries for the same group address (one per zone)
2312 		 * so we need to walk the ill_ilm list.
2313 		 */
2314 		mutex_enter(&ill->ill_lock);
2315 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2316 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2317 			    continue;
2318 			BUMP_MIB(ill->ill_icmp6_mib,
2319 			    ipv6IfIcmpInGroupMembOurReports);
2320 
2321 			ilm->ilm_timer = INFINITY;
2322 			ilm->ilm_state = IGMP_OTHERMEMBER;
2323 		}
2324 		mutex_exit(&ill->ill_lock);
2325 		break;
2326 	}
2327 	case MLD_LISTENER_REDUCTION:
2328 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2329 		break;
2330 	}
2331 	/*
2332 	 * All MLD packets have already been passed up to any
2333 	 * process(es) listening on a ICMP6 raw socket. This
2334 	 * has been accomplished in ip_deliver_local_v6 prior to
2335 	 * this function call. It is assumed that the multicast daemon
2336 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2337 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2338 	 * Thus we can free the MLD message block here
2339 	 */
2340 	freemsg(mp);
2341 }
2342 
2343 /*
2344  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2345  * (non-zero, unsigned) timer value to be set on success.
2346  */
2347 static uint_t
2348 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2349 {
2350 	ilm_t	*ilm;
2351 	int	timer;
2352 	uint_t	next, current;
2353 	in6_addr_t *v6group;
2354 
2355 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2356 
2357 	/*
2358 	 * In the MLD specification, there are 3 states and a flag.
2359 	 *
2360 	 * In Non-Listener state, we simply don't have a membership record.
2361 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2362 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2363 	 * INFINITY)
2364 	 *
2365 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2366 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2367 	 * if I sent the last report.
2368 	 */
2369 	v6group = &mldh->mld_addr;
2370 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2371 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2372 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2373 		return (0);
2374 	}
2375 
2376 	/* Need to do compatibility mode checking */
2377 	mutex_enter(&ill->ill_lock);
2378 	ill->ill_mcast_v1_time = 0;
2379 	ill->ill_mcast_v1_tset = 1;
2380 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2381 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2382 		    "MLD_V1_ROUTER\n", ill->ill_name));
2383 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2384 		ill->ill_mcast_type = MLD_V1_ROUTER;
2385 	}
2386 	mutex_exit(&ill->ill_lock);
2387 
2388 	timer = (int)ntohs(mldh->mld_maxdelay);
2389 	if (ip_debug > 1) {
2390 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2391 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2392 		    timer, (int)mldh->mld_type);
2393 	}
2394 
2395 	/*
2396 	 * -Start the timers in all of our membership records for
2397 	 * the physical interface on which the query arrived,
2398 	 * excl:
2399 	 *	1.  those that belong to the "all hosts" group,
2400 	 *	2.  those with 0 scope, or 1 node-local scope.
2401 	 *
2402 	 * -Restart any timer that is already running but has a value
2403 	 * longer that the requested timeout.
2404 	 * -Use the value specified in the query message as the
2405 	 * maximum timeout.
2406 	 */
2407 	next = INFINITY;
2408 	mutex_enter(&ill->ill_lock);
2409 
2410 	current = CURRENT_MSTIME;
2411 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2412 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2413 
2414 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2415 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2416 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2417 			continue;
2418 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2419 		    &ipv6_all_hosts_mcast)) &&
2420 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2421 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2422 			if (timer == 0) {
2423 				/* Respond immediately */
2424 				ilm->ilm_timer = INFINITY;
2425 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2426 				mutex_exit(&ill->ill_lock);
2427 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2428 				mutex_enter(&ill->ill_lock);
2429 				break;
2430 			}
2431 			if (ilm->ilm_timer > timer) {
2432 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2433 				if (ilm->ilm_timer < next)
2434 					next = ilm->ilm_timer;
2435 				ilm->ilm_timer += current;
2436 			}
2437 			break;
2438 		}
2439 	}
2440 	mutex_exit(&ill->ill_lock);
2441 
2442 	return (next);
2443 }
2444 
2445 /*
2446  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2447  * returns the appropriate (non-zero, unsigned) timer value (which may
2448  * be INFINITY) to be set.
2449  */
2450 static uint_t
2451 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2452 {
2453 	ilm_t	*ilm;
2454 	in6_addr_t *v6group, *src_array;
2455 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2456 	uint8_t	qrv;
2457 
2458 	v6group = &mld2q->mld2q_addr;
2459 	numsrc = ntohs(mld2q->mld2q_numsrc);
2460 
2461 	/* make sure numsrc matches packet size */
2462 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2463 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2464 		return (0);
2465 	}
2466 	src_array = (in6_addr_t *)&mld2q[1];
2467 
2468 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2469 
2470 	/* extract Maximum Response Delay from code in header */
2471 	mrd = ntohs(mld2q->mld2q_mxrc);
2472 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2473 		uint_t hdrval, mant, exp;
2474 		hdrval = mrd;
2475 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2476 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2477 		mrd = (mant | 0x1000) << (exp + 3);
2478 	}
2479 	if (mrd == 0)
2480 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2481 
2482 	MCAST_RANDOM_DELAY(delay, mrd);
2483 	next = (unsigned)INFINITY;
2484 	current = CURRENT_MSTIME;
2485 
2486 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2487 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2488 	else
2489 		ill->ill_mcast_rv = qrv;
2490 
2491 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2492 		uint_t mant, exp;
2493 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2494 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2495 		qqi = (mant | 0x10) << (exp + 3);
2496 	}
2497 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2498 
2499 	/*
2500 	 * If we have a pending general query response that's scheduled
2501 	 * sooner than the delay we calculated for this response, then
2502 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2503 	 */
2504 	mutex_enter(&ill->ill_lock);
2505 	if (ill->ill_global_timer < (current + delay)) {
2506 		mutex_exit(&ill->ill_lock);
2507 		return (next);
2508 	}
2509 	mutex_exit(&ill->ill_lock);
2510 
2511 	/*
2512 	 * Now take action depending on query type: general,
2513 	 * group specific, or group/source specific.
2514 	 */
2515 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2516 		/*
2517 		 * general query
2518 		 * We know global timer is either not running or is
2519 		 * greater than our calculated delay, so reset it to
2520 		 * our delay (random value in range [0, response time])
2521 		 */
2522 		mutex_enter(&ill->ill_lock);
2523 		ill->ill_global_timer = current + delay;
2524 		mutex_exit(&ill->ill_lock);
2525 		next = delay;
2526 
2527 	} else {
2528 		/* group or group/source specific query */
2529 		mutex_enter(&ill->ill_lock);
2530 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2531 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2532 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2533 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2534 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2535 				continue;
2536 
2537 			/*
2538 			 * If the query is group specific or we have a
2539 			 * pending group specific query, the response is
2540 			 * group specific (pending sources list should be
2541 			 * empty).  Otherwise, need to update the pending
2542 			 * sources list for the group and source specific
2543 			 * response.
2544 			 */
2545 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2546 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2547 group_query:
2548 				FREE_SLIST(ilm->ilm_pendsrcs);
2549 				ilm->ilm_pendsrcs = NULL;
2550 			} else {
2551 				boolean_t overflow;
2552 				slist_t *pktl;
2553 				if (numsrc > MAX_FILTER_SIZE ||
2554 				    (ilm->ilm_pendsrcs == NULL &&
2555 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2556 					/*
2557 					 * We've been sent more sources than
2558 					 * we can deal with; or we can't deal
2559 					 * with a source list at all. Revert
2560 					 * to a group specific query.
2561 					 */
2562 					goto group_query;
2563 				}
2564 				if ((pktl = l_alloc()) == NULL)
2565 					goto group_query;
2566 				pktl->sl_numsrc = numsrc;
2567 				for (i = 0; i < numsrc; i++)
2568 					pktl->sl_addr[i] = src_array[i];
2569 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2570 				    &overflow);
2571 				l_free(pktl);
2572 				if (overflow)
2573 					goto group_query;
2574 			}
2575 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2576 			    INFINITY : (ilm->ilm_timer - current);
2577 			/* set timer to soonest value */
2578 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2579 			if (ilm->ilm_timer < next)
2580 				next = ilm->ilm_timer;
2581 			ilm->ilm_timer += current;
2582 			break;
2583 		}
2584 		mutex_exit(&ill->ill_lock);
2585 	}
2586 
2587 	return (next);
2588 }
2589 
2590 /*
2591  * Send MLDv1 response packet with hoplimit 1
2592  */
2593 static void
2594 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2595 {
2596 	mblk_t		*mp;
2597 	mld_hdr_t	*mldh;
2598 	ip6_t 		*ip6h;
2599 	ip6_hbh_t	*ip6hbh;
2600 	struct ip6_opt_router	*ip6router;
2601 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2602 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2603 	ipif_t		*ipif;
2604 	ip6i_t		*ip6i;
2605 
2606 	/*
2607 	 * We need to place a router alert option in this packet.  The length
2608 	 * of the options must be a multiple of 8.  The hbh option header is 2
2609 	 * bytes followed by the 4 byte router alert option.  That leaves
2610 	 * 2 bytes of pad for a total of 8 bytes.
2611 	 */
2612 	const int	router_alert_length = 8;
2613 
2614 	ASSERT(ill->ill_isv6);
2615 
2616 	/*
2617 	 * We need to make sure that this packet does not get load balanced.
2618 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2619 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2620 	 * If it gets load balanced, switches supporting MLD snooping
2621 	 * (in the future) will send the packet that it receives for this
2622 	 * multicast group to the interface that we are sending on. As we have
2623 	 * joined the multicast group on this ill, by sending the packet out
2624 	 * on this ill, we receive all the packets back on this ill.
2625 	 */
2626 	size += sizeof (ip6i_t) + router_alert_length;
2627 	mp = allocb(size, BPRI_HI);
2628 	if (mp == NULL)
2629 		return;
2630 	bzero(mp->b_rptr, size);
2631 	mp->b_wptr = mp->b_rptr + size;
2632 
2633 	ip6i = (ip6i_t *)mp->b_rptr;
2634 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2635 	ip6i->ip6i_nxt = IPPROTO_RAW;
2636 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2637 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2638 
2639 	ip6h = (ip6_t *)&ip6i[1];
2640 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2641 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2642 	/*
2643 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2644 	 * above will pad between ip6router and mld.
2645 	 */
2646 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2647 
2648 	mldh->mld_type = type;
2649 	mldh->mld_addr = ilm->ilm_v6addr;
2650 
2651 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2652 	ip6router->ip6or_len = 2;
2653 	ip6router->ip6or_value[0] = 0;
2654 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2655 
2656 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2657 	ip6hbh->ip6h_len = 0;
2658 
2659 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2660 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2661 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2662 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2663 	if (v6addr == NULL)
2664 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2665 	else
2666 		ip6h->ip6_dst = *v6addr;
2667 
2668 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2669 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2670 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2671 		ipif_refrele(ipif);
2672 	} else {
2673 		/* Otherwise, use IPv6 default address selection. */
2674 		ip6h->ip6_src = ipv6_all_zeros;
2675 	}
2676 
2677 	/*
2678 	 * Prepare for checksum by putting icmp length in the icmp
2679 	 * checksum field. The checksum is calculated in ip_wput_v6.
2680 	 */
2681 	mldh->mld_cksum = htons(sizeof (*mldh));
2682 
2683 	/*
2684 	 * ip_wput will automatically loopback the multicast packet to
2685 	 * the conn if multicast loopback is enabled.
2686 	 * The MIB stats corresponding to this outgoing MLD packet
2687 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2688 	 * ->icmp_update_out_mib_v6 function call.
2689 	 */
2690 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2691 }
2692 
2693 /*
2694  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2695  * report will contain one multicast address record for each element of
2696  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2697  * multiple reports are sent.  reclist is assumed to be made up of
2698  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2699  */
2700 static void
2701 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2702 {
2703 	mblk_t		*mp;
2704 	mld2r_t		*mld2r;
2705 	mld2mar_t	*mld2mar;
2706 	in6_addr_t	*srcarray;
2707 	ip6_t		*ip6h;
2708 	ip6_hbh_t	*ip6hbh;
2709 	ip6i_t		*ip6i;
2710 	struct ip6_opt_router	*ip6router;
2711 	size_t		size, optlen, padlen, icmpsize, rsize;
2712 	ipif_t		*ipif;
2713 	int		i, numrec, more_src_cnt;
2714 	mrec_t		*rp, *cur_reclist;
2715 	mrec_t		*next_reclist = reclist;
2716 	boolean_t	morepkts;
2717 
2718 	/* If there aren't any records, there's nothing to send */
2719 	if (reclist == NULL)
2720 		return;
2721 
2722 	ASSERT(ill->ill_isv6);
2723 
2724 	/*
2725 	 * Total option length (optlen + padlen) must be a multiple of
2726 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2727 	 * length will be 8.  Assert this in case anything ever changes.
2728 	 */
2729 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2730 	ASSERT(optlen <= 8);
2731 	padlen = 8 - optlen;
2732 nextpkt:
2733 	icmpsize = sizeof (mld2r_t);
2734 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2735 	morepkts = B_FALSE;
2736 	more_src_cnt = 0;
2737 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2738 	    rp = rp->mrec_next, numrec++) {
2739 		rsize = sizeof (mld2mar_t) +
2740 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2741 		if (size + rsize > ill->ill_max_frag) {
2742 			if (rp == cur_reclist) {
2743 				/*
2744 				 * If the first mrec we looked at is too big
2745 				 * to fit in a single packet (i.e the source
2746 				 * list is too big), we must either truncate
2747 				 * the list (if TO_EX or IS_EX), or send
2748 				 * multiple reports for the same group (all
2749 				 * other types).
2750 				 */
2751 				int srcspace, srcsperpkt;
2752 				srcspace = ill->ill_max_frag -
2753 				    (size + sizeof (mld2mar_t));
2754 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2755 				/*
2756 				 * Increment icmpsize and size, because we will
2757 				 * be sending a record for the mrec we're
2758 				 * looking at now.
2759 				 */
2760 				rsize = sizeof (mld2mar_t) +
2761 				    (srcsperpkt * sizeof (in6_addr_t));
2762 				icmpsize += rsize;
2763 				size += rsize;
2764 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2765 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2766 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2767 					if (rp->mrec_next == NULL) {
2768 						/* no more packets to send */
2769 						break;
2770 					} else {
2771 						/*
2772 						 * more packets, but we're
2773 						 * done with this mrec.
2774 						 */
2775 						next_reclist = rp->mrec_next;
2776 					}
2777 				} else {
2778 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2779 					    - srcsperpkt;
2780 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2781 					/*
2782 					 * We'll fix up this mrec (remove the
2783 					 * srcs we've already sent) before
2784 					 * returning to nextpkt above.
2785 					 */
2786 					next_reclist = rp;
2787 				}
2788 			} else {
2789 				next_reclist = rp;
2790 			}
2791 			morepkts = B_TRUE;
2792 			break;
2793 		}
2794 		icmpsize += rsize;
2795 		size += rsize;
2796 	}
2797 
2798 	/*
2799 	 * We need to make sure that this packet does not get load balanced.
2800 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2801 	 * ip_newroute_ipif_v6 know how to handle such packets.
2802 	 * If it gets load balanced, switches supporting MLD snooping
2803 	 * (in the future) will send the packet that it receives for this
2804 	 * multicast group to the interface that we are sending on. As we have
2805 	 * joined the multicast group on this ill, by sending the packet out
2806 	 * on this ill, we receive all the packets back on this ill.
2807 	 */
2808 	size += sizeof (ip6i_t);
2809 	mp = allocb(size, BPRI_HI);
2810 	if (mp == NULL)
2811 		goto free_reclist;
2812 	bzero(mp->b_rptr, size);
2813 	mp->b_wptr = mp->b_rptr + size;
2814 
2815 	ip6i = (ip6i_t *)mp->b_rptr;
2816 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2817 	ip6i->ip6i_nxt = IPPROTO_RAW;
2818 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2819 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2820 
2821 	ip6h = (ip6_t *)&(ip6i[1]);
2822 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2823 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2824 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2825 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2826 
2827 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2828 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2829 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2830 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2831 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2832 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2833 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2834 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2835 		ipif_refrele(ipif);
2836 	} else {
2837 		/* otherwise, use IPv6 default address selection. */
2838 		ip6h->ip6_src = ipv6_all_zeros;
2839 	}
2840 
2841 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2842 	/*
2843 	 * ip6h_len is the number of 8-byte words, not including the first
2844 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2845 	 */
2846 	ip6hbh->ip6h_len = 0;
2847 
2848 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2849 	ip6router->ip6or_len = 2;
2850 	ip6router->ip6or_value[0] = 0;
2851 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2852 
2853 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2854 	mld2r->mld2r_nummar = htons(numrec);
2855 	/*
2856 	 * Prepare for the checksum by putting icmp length in the icmp
2857 	 * checksum field. The checksum is calculated in ip_wput_v6.
2858 	 */
2859 	mld2r->mld2r_cksum = htons(icmpsize);
2860 
2861 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2862 		mld2mar->mld2mar_type = rp->mrec_type;
2863 		mld2mar->mld2mar_auxlen = 0;
2864 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2865 		mld2mar->mld2mar_group = rp->mrec_group;
2866 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2867 
2868 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2869 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2870 
2871 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2872 	}
2873 
2874 	/*
2875 	 * ip_wput will automatically loopback the multicast packet to
2876 	 * the conn if multicast loopback is enabled.
2877 	 * The MIB stats corresponding to this outgoing MLD packet
2878 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2879 	 * ->icmp_update_out_mib_v6 function call.
2880 	 */
2881 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2882 
2883 	if (morepkts) {
2884 		if (more_src_cnt > 0) {
2885 			int index, mvsize;
2886 			slist_t *sl = &next_reclist->mrec_srcs;
2887 			index = sl->sl_numsrc;
2888 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2889 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2890 			    mvsize);
2891 			sl->sl_numsrc = more_src_cnt;
2892 		}
2893 		goto nextpkt;
2894 	}
2895 
2896 free_reclist:
2897 	while (reclist != NULL) {
2898 		rp = reclist->mrec_next;
2899 		mi_free(reclist);
2900 		reclist = rp;
2901 	}
2902 }
2903 
2904 static mrec_t *
2905 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2906     mrec_t *next)
2907 {
2908 	mrec_t *rp;
2909 	int i;
2910 
2911 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2912 	    SLIST_IS_EMPTY(srclist))
2913 		return (next);
2914 
2915 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2916 	if (rp == NULL)
2917 		return (next);
2918 
2919 	rp->mrec_next = next;
2920 	rp->mrec_type = type;
2921 	rp->mrec_auxlen = 0;
2922 	rp->mrec_group = *grp;
2923 	if (srclist == NULL) {
2924 		rp->mrec_srcs.sl_numsrc = 0;
2925 	} else {
2926 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2927 		for (i = 0; i < srclist->sl_numsrc; i++)
2928 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2929 	}
2930 
2931 	return (rp);
2932 }
2933 
2934 /*
2935  * Set up initial retransmit state.  If memory cannot be allocated for
2936  * the source lists, simply create as much state as is possible; memory
2937  * allocation failures are considered one type of transient error that
2938  * the retransmissions are designed to overcome (and if they aren't
2939  * transient, there are bigger problems than failing to notify the
2940  * router about multicast group membership state changes).
2941  */
2942 static void
2943 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2944     slist_t *flist)
2945 {
2946 	/*
2947 	 * There are only three possibilities for rtype:
2948 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2949 	 *	  => rtype is ALLOW_NEW_SOURCES
2950 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2951 	 *	  => rtype is CHANGE_TO_EXCLUDE
2952 	 *	State change that involves a filter mode change
2953 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2954 	 */
2955 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2956 	    rtype == ALLOW_NEW_SOURCES);
2957 
2958 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2959 
2960 	switch (rtype) {
2961 	case CHANGE_TO_EXCLUDE:
2962 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2963 		CLEAR_SLIST(rtxp->rtx_allow);
2964 		COPY_SLIST(flist, rtxp->rtx_block);
2965 		break;
2966 	case ALLOW_NEW_SOURCES:
2967 	case CHANGE_TO_INCLUDE:
2968 		rtxp->rtx_fmode_cnt =
2969 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2970 		CLEAR_SLIST(rtxp->rtx_block);
2971 		COPY_SLIST(flist, rtxp->rtx_allow);
2972 		break;
2973 	}
2974 }
2975 
2976 /*
2977  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2978  * RFC 3376 section 5.1, covers three cases:
2979  *	* The current state change is a filter mode change
2980  *		Set filter mode retransmit counter; set retransmit allow or
2981  *		block list to new source list as appropriate, and clear the
2982  *		retransmit list that was not set; send TO_IN or TO_EX with
2983  *		new source list.
2984  *	* The current state change is a source list change, but the filter
2985  *	  mode retransmit counter is > 0
2986  *		Decrement filter mode retransmit counter; set retransmit
2987  *		allow or block list to  new source list as appropriate,
2988  *		and clear the retransmit list that was not set; send TO_IN
2989  *		or TO_EX with new source list.
2990  *	* The current state change is a source list change, and the filter
2991  *	  mode retransmit counter is 0.
2992  *		Merge existing rtx allow and block lists with new state:
2993  *		  rtx_allow = (new allow + rtx_allow) - new block
2994  *		  rtx_block = (new block + rtx_block) - new allow
2995  *		Send ALLOW and BLOCK records for new retransmit lists;
2996  *		decrement retransmit counter.
2997  *
2998  * As is the case for mcast_init_rtx(), memory allocation failures are
2999  * acceptable; we just create as much state as we can.
3000  */
3001 static mrec_t *
3002 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
3003 {
3004 	ill_t *ill;
3005 	rtx_state_t *rtxp = &ilm->ilm_rtx;
3006 	mcast_record_t txtype;
3007 	mrec_t *rp, *rpnext, *rtnmrec;
3008 	boolean_t ovf;
3009 
3010 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
3011 
3012 	if (mreclist == NULL)
3013 		return (mreclist);
3014 
3015 	/*
3016 	 * A filter mode change is indicated by a single mrec, which is
3017 	 * either TO_IN or TO_EX.  In this case, we just need to set new
3018 	 * retransmit state as if this were an initial join.  There is
3019 	 * no change to the mrec list.
3020 	 */
3021 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
3022 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
3023 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
3024 		    &mreclist->mrec_srcs);
3025 		return (mreclist);
3026 	}
3027 
3028 	/*
3029 	 * Only the source list has changed
3030 	 */
3031 	rtxp->rtx_cnt = ill->ill_mcast_rv;
3032 	if (rtxp->rtx_fmode_cnt > 0) {
3033 		/* but we're still sending filter mode change reports */
3034 		rtxp->rtx_fmode_cnt--;
3035 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3036 			CLEAR_SLIST(rtxp->rtx_block);
3037 			COPY_SLIST(flist, rtxp->rtx_allow);
3038 			txtype = CHANGE_TO_INCLUDE;
3039 		} else {
3040 			CLEAR_SLIST(rtxp->rtx_allow);
3041 			COPY_SLIST(flist, rtxp->rtx_block);
3042 			txtype = CHANGE_TO_EXCLUDE;
3043 		}
3044 		/* overwrite first mrec with new info */
3045 		mreclist->mrec_type = txtype;
3046 		l_copy(flist, &mreclist->mrec_srcs);
3047 		/* then free any remaining mrecs */
3048 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3049 			rpnext = rp->mrec_next;
3050 			mi_free(rp);
3051 		}
3052 		mreclist->mrec_next = NULL;
3053 		rtnmrec = mreclist;
3054 	} else {
3055 		mrec_t *allow_mrec, *block_mrec;
3056 		/*
3057 		 * Just send the source change reports; but we need to
3058 		 * recalculate the ALLOW and BLOCK lists based on previous
3059 		 * state and new changes.
3060 		 */
3061 		rtnmrec = mreclist;
3062 		allow_mrec = block_mrec = NULL;
3063 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3064 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3065 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3066 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3067 				allow_mrec = rp;
3068 			else
3069 				block_mrec = rp;
3070 		}
3071 		/*
3072 		 * Perform calculations:
3073 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3074 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3075 		 *
3076 		 * Each calc requires two steps, for example:
3077 		 *   rtx_allow = rtx_allow - mrec_block;
3078 		 *   new_allow = mrec_allow + rtx_allow;
3079 		 *
3080 		 * Store results in mrec lists, and then copy into rtx lists.
3081 		 * We do it in this order in case the rtx list hasn't been
3082 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3083 		 * Overflows are also okay.
3084 		 */
3085 		if (block_mrec != NULL) {
3086 			l_difference_in_a(rtxp->rtx_allow,
3087 			    &block_mrec->mrec_srcs);
3088 		}
3089 		if (allow_mrec != NULL) {
3090 			l_difference_in_a(rtxp->rtx_block,
3091 			    &allow_mrec->mrec_srcs);
3092 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3093 			    &ovf);
3094 		}
3095 		if (block_mrec != NULL) {
3096 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3097 			    &ovf);
3098 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3099 		} else {
3100 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3101 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3102 		}
3103 		if (allow_mrec != NULL) {
3104 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3105 		} else {
3106 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3107 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3108 		}
3109 	}
3110 
3111 	return (rtnmrec);
3112 }
3113