xref: /titanic_50/usr/src/uts/common/inet/ip/igmp.c (revision 825ba0f20a74fd9c5d0d1ce2c195da2cc88a7f77)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Internet Group Management Protocol (IGMP) routines.
31  * Multicast Listener Discovery Protocol (MLD) routines.
32  *
33  * Written by Steve Deering, Stanford, May 1988.
34  * Modified by Rosen Sharma, Stanford, Aug 1994.
35  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
36  *
37  * MULTICAST 3.5.1.1
38  */
39 
40 #include <sys/types.h>
41 #include <sys/stream.h>
42 #include <sys/stropts.h>
43 #include <sys/strlog.h>
44 #include <sys/strsun.h>
45 #include <sys/systm.h>
46 #include <sys/ddi.h>
47 #include <sys/sunddi.h>
48 #include <sys/cmn_err.h>
49 #include <sys/atomic.h>
50 #include <sys/zone.h>
51 
52 #include <sys/param.h>
53 #include <sys/socket.h>
54 #include <inet/ipclassifier.h>
55 #include <net/if.h>
56 #include <net/route.h>
57 #include <netinet/in.h>
58 #include <netinet/igmp_var.h>
59 #include <netinet/ip6.h>
60 #include <netinet/icmp6.h>
61 
62 #include <inet/common.h>
63 #include <inet/mi.h>
64 #include <inet/nd.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_if.h>
72 #include <net/pfkeyv2.h>
73 #include <inet/ipsec_info.h>
74 
75 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
76 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
77 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
78 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
79 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
80 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
81 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
82 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
83 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
84 		    slist_t *srclist, mrec_t *next);
85 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
86 		    mcast_record_t rtype, slist_t *flist);
87 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
88 
89 
/*
 * Timer-length conversion helpers.  Timer values are always kept, and
 * handed to the timer functions, in milliseconds; default values and
 * values taken from the wire may be expressed in other units and have
 * to be converted first.
 *
 * "dsec" is short for decisecond, i.e. a tenth of a second.
 */
#define	DSEC_TO_MSEC(dsec)	(100 * (dsec))
#define	SEC_TO_MSEC(sec)	(1000 * (sec))
100 
/*
 * Timers are tracked as absolute (wallclock) times rather than as deltas.
 *
 * A running timer (scheduled through timeout) can be cancelled when
 * another timer with a shorter timeout value is scheduled before it has
 * expired.  When the shorter timer fires, the original timer is adjusted
 * for the time that elapsed while the shorter timer ran, but not for the
 * time the original had already spent waiting before it was preempted,
 * i.e. the interval between being scheduled and being cancelled.  With
 * relative times that omission can delay multicast membership reports;
 * using absolute times avoids the problem.
 *
 * CURRENT_MSTIME yields the current lbolt value converted to milliseconds
 * and is used for scheduling and firing timers, so that membership
 * reports are sent on time.  A timer does not fire exactly at the time it
 * was scheduled to fire; a difference of a few milliseconds has been
 * observed.  CURRENT_OFFSET is the allowance used to absorb that
 * difference.
 */

#define	CURRENT_MSTIME	((uint_t)(TICK_TO_MSEC(ddi_get_lbolt())))
#define	CURRENT_OFFSET	(999)
122 
123 /*
124  * The first multicast join will trigger the igmp timers / mld timers
125  * The unit for next is milliseconds.
126  */
127 void
128 igmp_start_timers(unsigned next, ip_stack_t *ipst)
129 {
130 	int	time_left;
131 	int	ret;
132 
133 	ASSERT(next != 0 && next != INFINITY);
134 
135 	mutex_enter(&ipst->ips_igmp_timer_lock);
136 
137 	if (ipst->ips_igmp_timer_setter_active) {
138 		/*
139 		 * Serialize timer setters, one at a time. If the
140 		 * timer is currently being set by someone,
141 		 * just record the next time when it has to be
142 		 * invoked and return. The current setter will
143 		 * take care.
144 		 */
145 		ipst->ips_igmp_time_to_next =
146 		    MIN(ipst->ips_igmp_time_to_next, next);
147 		mutex_exit(&ipst->ips_igmp_timer_lock);
148 		return;
149 	} else {
150 		ipst->ips_igmp_timer_setter_active = B_TRUE;
151 	}
152 	if (ipst->ips_igmp_timeout_id == 0) {
153 		/*
154 		 * The timer is inactive. We need to start a timer
155 		 */
156 		ipst->ips_igmp_time_to_next = next;
157 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
158 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
159 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
160 		ipst->ips_igmp_timer_setter_active = B_FALSE;
161 		mutex_exit(&ipst->ips_igmp_timer_lock);
162 		return;
163 	}
164 
165 	/*
166 	 * The timer was scheduled sometime back for firing in
167 	 * 'igmp_time_to_next' ms and is active. We need to
168 	 * reschedule the timeout if the new 'next' will happen
169 	 * earlier than the currently scheduled timeout
170 	 */
171 	time_left = ipst->ips_igmp_timer_scheduled_last +
172 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
173 	if (time_left < MSEC_TO_TICK(next)) {
174 		ipst->ips_igmp_timer_setter_active = B_FALSE;
175 		mutex_exit(&ipst->ips_igmp_timer_lock);
176 		return;
177 	}
178 
179 	mutex_exit(&ipst->ips_igmp_timer_lock);
180 	ret = untimeout(ipst->ips_igmp_timeout_id);
181 	mutex_enter(&ipst->ips_igmp_timer_lock);
182 	/*
183 	 * The timeout was cancelled, or the timeout handler
184 	 * completed, while we were blocked in the untimeout.
185 	 * No other thread could have set the timer meanwhile
186 	 * since we serialized all the timer setters. Thus
187 	 * no timer is currently active nor executing nor will
188 	 * any timer fire in the future. We start the timer now
189 	 * if needed.
190 	 */
191 	if (ret == -1) {
192 		ASSERT(ipst->ips_igmp_timeout_id == 0);
193 	} else {
194 		ASSERT(ipst->ips_igmp_timeout_id != 0);
195 		ipst->ips_igmp_timeout_id = 0;
196 	}
197 	if (ipst->ips_igmp_time_to_next != 0) {
198 		ipst->ips_igmp_time_to_next =
199 		    MIN(ipst->ips_igmp_time_to_next, next);
200 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
201 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
202 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
203 	}
204 	ipst->ips_igmp_timer_setter_active = B_FALSE;
205 	mutex_exit(&ipst->ips_igmp_timer_lock);
206 }
207 
208 /*
209  * mld_start_timers:
210  * The unit for next is milliseconds.
211  */
212 void
213 mld_start_timers(unsigned next, ip_stack_t *ipst)
214 {
215 	int	time_left;
216 	int	ret;
217 
218 	ASSERT(next != 0 && next != INFINITY);
219 
220 	mutex_enter(&ipst->ips_mld_timer_lock);
221 	if (ipst->ips_mld_timer_setter_active) {
222 		/*
223 		 * Serialize timer setters, one at a time. If the
224 		 * timer is currently being set by someone,
225 		 * just record the next time when it has to be
226 		 * invoked and return. The current setter will
227 		 * take care.
228 		 */
229 		ipst->ips_mld_time_to_next =
230 		    MIN(ipst->ips_mld_time_to_next, next);
231 		mutex_exit(&ipst->ips_mld_timer_lock);
232 		return;
233 	} else {
234 		ipst->ips_mld_timer_setter_active = B_TRUE;
235 	}
236 	if (ipst->ips_mld_timeout_id == 0) {
237 		/*
238 		 * The timer is inactive. We need to start a timer
239 		 */
240 		ipst->ips_mld_time_to_next = next;
241 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
242 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
243 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
244 		ipst->ips_mld_timer_setter_active = B_FALSE;
245 		mutex_exit(&ipst->ips_mld_timer_lock);
246 		return;
247 	}
248 
249 	/*
250 	 * The timer was scheduled sometime back for firing in
251 	 * 'igmp_time_to_next' ms and is active. We need to
252 	 * reschedule the timeout if the new 'next' will happen
253 	 * earlier than the currently scheduled timeout
254 	 */
255 	time_left = ipst->ips_mld_timer_scheduled_last +
256 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
257 	if (time_left < MSEC_TO_TICK(next)) {
258 		ipst->ips_mld_timer_setter_active = B_FALSE;
259 		mutex_exit(&ipst->ips_mld_timer_lock);
260 		return;
261 	}
262 
263 	mutex_exit(&ipst->ips_mld_timer_lock);
264 	ret = untimeout(ipst->ips_mld_timeout_id);
265 	mutex_enter(&ipst->ips_mld_timer_lock);
266 	/*
267 	 * The timeout was cancelled, or the timeout handler
268 	 * completed, while we were blocked in the untimeout.
269 	 * No other thread could have set the timer meanwhile
270 	 * since we serialized all the timer setters. Thus
271 	 * no timer is currently active nor executing nor will
272 	 * any timer fire in the future. We start the timer now
273 	 * if needed.
274 	 */
275 	if (ret == -1) {
276 		ASSERT(ipst->ips_mld_timeout_id == 0);
277 	} else {
278 		ASSERT(ipst->ips_mld_timeout_id != 0);
279 		ipst->ips_mld_timeout_id = 0;
280 	}
281 	if (ipst->ips_mld_time_to_next != 0) {
282 		ipst->ips_mld_time_to_next =
283 		    MIN(ipst->ips_mld_time_to_next, next);
284 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
285 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
286 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
287 	}
288 	ipst->ips_mld_timer_setter_active = B_FALSE;
289 	mutex_exit(&ipst->ips_mld_timer_lock);
290 }
291 
292 /*
293  * igmp_input:
294  * Return NULL for a bad packet that is discarded here.
295  * Return mp if the message is OK and should be handed to "raw" receivers.
296  * Callers of igmp_input() may need to reinitialize variables that were copied
297  * from the mblk as this calls pullupmsg().
298  */
299 /* ARGSUSED */
300 mblk_t *
301 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
302 {
303 	igmpa_t 	*igmpa;
304 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
305 	int		iphlen, igmplen, mblklen;
306 	ilm_t 		*ilm;
307 	uint32_t	src, dst;
308 	uint32_t 	group;
309 	uint_t		next;
310 	ipif_t 		*ipif;
311 	ip_stack_t	 *ipst;
312 
313 	ASSERT(ill != NULL);
314 	ASSERT(!ill->ill_isv6);
315 	ipst = ill->ill_ipst;
316 	++ipst->ips_igmpstat.igps_rcv_total;
317 
318 	mblklen = MBLKL(mp);
319 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
320 		++ipst->ips_igmpstat.igps_rcv_tooshort;
321 		goto bad_pkt;
322 	}
323 	igmplen = ntohs(ipha->ipha_length) - iphlen;
324 	/*
325 	 * Since msg sizes are more variable with v3, just pullup the
326 	 * whole thing now.
327 	 */
328 	if (MBLKL(mp) < (igmplen + iphlen)) {
329 		mblk_t *mp1;
330 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
331 			++ipst->ips_igmpstat.igps_rcv_tooshort;
332 			goto bad_pkt;
333 		}
334 		freemsg(mp);
335 		mp = mp1;
336 		ipha = (ipha_t *)(mp->b_rptr);
337 	}
338 
339 	/*
340 	 * Validate lengths
341 	 */
342 	if (igmplen < IGMP_MINLEN) {
343 		++ipst->ips_igmpstat.igps_rcv_tooshort;
344 		goto bad_pkt;
345 	}
346 	/*
347 	 * Validate checksum
348 	 */
349 	if (IP_CSUM(mp, iphlen, 0)) {
350 		++ipst->ips_igmpstat.igps_rcv_badsum;
351 		goto bad_pkt;
352 	}
353 
354 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
355 	src = ipha->ipha_src;
356 	dst = ipha->ipha_dst;
357 	if (ip_debug > 1)
358 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
359 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
360 		    (int)ntohl(src), (int)ntohl(dst),
361 		    ill->ill_name);
362 
363 	switch (igmpa->igmpa_type) {
364 	case IGMP_MEMBERSHIP_QUERY:
365 		/*
366 		 * packet length differentiates between v1/v2 and v3
367 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
368 		 */
369 		if ((igmplen == IGMP_MINLEN) ||
370 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
371 			next = igmp_query_in(ipha, igmpa, ill);
372 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
373 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
374 			    igmplen);
375 		} else {
376 			++ipst->ips_igmpstat.igps_rcv_tooshort;
377 			goto bad_pkt;
378 		}
379 		if (next == 0)
380 			goto bad_pkt;
381 
382 		if (next != INFINITY)
383 			igmp_start_timers(next, ipst);
384 
385 		break;
386 
387 	case IGMP_V1_MEMBERSHIP_REPORT:
388 	case IGMP_V2_MEMBERSHIP_REPORT:
389 		/*
390 		 * For fast leave to work, we have to know that we are the
391 		 * last person to send a report for this group. Reports
392 		 * generated by us are looped back since we could potentially
393 		 * be a multicast router, so discard reports sourced by me.
394 		 */
395 		mutex_enter(&ill->ill_lock);
396 		for (ipif = ill->ill_ipif; ipif != NULL;
397 		    ipif = ipif->ipif_next) {
398 			if (ipif->ipif_lcl_addr == src) {
399 				if (ip_debug > 1) {
400 					(void) mi_strlog(ill->ill_rq,
401 					    1,
402 					    SL_TRACE,
403 					    "igmp_input: we are only "
404 					    "member src 0x%x ipif_local 0x%x",
405 					    (int)ntohl(src),
406 					    (int)
407 					    ntohl(ipif->ipif_lcl_addr));
408 				}
409 				mutex_exit(&ill->ill_lock);
410 				return (mp);
411 			}
412 		}
413 		mutex_exit(&ill->ill_lock);
414 
415 		++ipst->ips_igmpstat.igps_rcv_reports;
416 		group = igmpa->igmpa_group;
417 		if (!CLASSD(group)) {
418 			++ipst->ips_igmpstat.igps_rcv_badreports;
419 			goto bad_pkt;
420 		}
421 
422 		/*
423 		 * KLUDGE: if the IP source address of the report has an
424 		 * unspecified (i.e., zero) subnet number, as is allowed for
425 		 * a booting host, replace it with the correct subnet number
426 		 * so that a process-level multicast routing demon can
427 		 * determine which subnet it arrived from.  This is necessary
428 		 * to compensate for the lack of any way for a process to
429 		 * determine the arrival interface of an incoming packet.
430 		 *
431 		 * Requires that a copy of *this* message it passed up
432 		 * to the raw interface which is done by our caller.
433 		 */
434 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
435 			/* Pick the first ipif on this ill */
436 			mutex_enter(&ill->ill_lock);
437 			src = ill->ill_ipif->ipif_subnet;
438 			mutex_exit(&ill->ill_lock);
439 			ip1dbg(("igmp_input: changed src to 0x%x\n",
440 			    (int)ntohl(src)));
441 			ipha->ipha_src = src;
442 		}
443 
444 		/*
445 		 * If we belong to the group being reported, and
446 		 * we are a 'Delaying member' in the RFC terminology,
447 		 * stop our timer for that group and 'clear flag' i.e.
448 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
449 		 * interfaces on the given physical interface.
450 		 */
451 		mutex_enter(&ill->ill_lock);
452 		for (ipif = ill->ill_ipif; ipif != NULL;
453 		    ipif = ipif->ipif_next) {
454 			ilm = ilm_lookup_ipif(ipif, group);
455 			if (ilm != NULL) {
456 				++ipst->ips_igmpstat.igps_rcv_ourreports;
457 				ilm->ilm_timer = INFINITY;
458 				ilm->ilm_state = IGMP_OTHERMEMBER;
459 			}
460 		} /* for */
461 		mutex_exit(&ill->ill_lock);
462 		break;
463 
464 	case IGMP_V3_MEMBERSHIP_REPORT:
465 		/*
466 		 * Currently nothing to do here; IGMP router is not
467 		 * implemented in ip, and v3 hosts don't pay attention
468 		 * to membership reports.
469 		 */
470 		break;
471 	}
472 	/*
473 	 * Pass all valid IGMP packets up to any process(es) listening
474 	 * on a raw IGMP socket. Do not free the packet.
475 	 */
476 	return (mp);
477 
478 bad_pkt:
479 	freemsg(mp);
480 	return (NULL);
481 }
482 
483 static uint_t
484 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
485 {
486 	ilm_t	*ilm;
487 	int	timer;
488 	uint_t	next, current;
489 	ip_stack_t	 *ipst;
490 
491 	ipst = ill->ill_ipst;
492 	++ipst->ips_igmpstat.igps_rcv_queries;
493 
494 	/*
495 	 * In the IGMPv2 specification, there are 3 states and a flag.
496 	 *
497 	 * In Non-Member state, we simply don't have a membership record.
498 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
499 	 * < INFINITY).  In Idle Member state, our timer is not running
500 	 * (ilm->ilm_timer == INFINITY).
501 	 *
502 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
503 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
504 	 * if I sent the last report.
505 	 */
506 	if ((igmpa->igmpa_code == 0) ||
507 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
508 		/*
509 		 * Query from an old router.
510 		 * Remember that the querier on this interface is old,
511 		 * and set the timer to the value in RFC 1112.
512 		 */
513 
514 
515 		mutex_enter(&ill->ill_lock);
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 		mutex_exit(&ill->ill_lock);
525 
526 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
527 
528 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
529 		    igmpa->igmpa_group != 0) {
530 			++ipst->ips_igmpstat.igps_rcv_badqueries;
531 			return (0);
532 		}
533 
534 	} else {
535 		in_addr_t group;
536 
537 		/*
538 		 * Query from a new router
539 		 * Simply do a validity check
540 		 */
541 		group = igmpa->igmpa_group;
542 		if (group != 0 && (!CLASSD(group))) {
543 			++ipst->ips_igmpstat.igps_rcv_badqueries;
544 			return (0);
545 		}
546 
547 		/*
548 		 * Switch interface state to v2 on receipt of a v2 query
549 		 * ONLY IF current state is v3.  Let things be if current
550 		 * state if v1 but do reset the v2-querier-present timer.
551 		 */
552 		mutex_enter(&ill->ill_lock);
553 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
554 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
555 			    "to IGMP_V2_ROUTER", ill->ill_name));
556 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
557 			ill->ill_mcast_type = IGMP_V2_ROUTER;
558 		}
559 		ill->ill_mcast_v2_time = 0;
560 		ill->ill_mcast_v2_tset = 1;
561 		mutex_exit(&ill->ill_lock);
562 
563 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
564 	}
565 
566 	if (ip_debug > 1) {
567 		mutex_enter(&ill->ill_lock);
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 		mutex_exit(&ill->ill_lock);
573 	}
574 
575 	/*
576 	 * -Start the timers in all of our membership records
577 	 *  for the physical interface on which the query
578 	 *  arrived, excluding those that belong to the "all
579 	 *  hosts" group (224.0.0.1).
580 	 *
581 	 * -Restart any timer that is already running but has
582 	 *  a value longer than the requested timeout.
583 	 *
584 	 * -Use the value specified in the query message as
585 	 *  the maximum timeout.
586 	 */
587 	next = (unsigned)INFINITY;
588 	mutex_enter(&ill->ill_lock);
589 
590 	current = CURRENT_MSTIME;
591 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
592 
593 		/*
594 		 * A multicast router joins INADDR_ANY address
595 		 * to enable promiscuous reception of all
596 		 * mcasts from the interface. This INADDR_ANY
597 		 * is stored in the ilm_v6addr as V6 unspec addr
598 		 */
599 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
600 			continue;
601 		if (ilm->ilm_addr == htonl(INADDR_ANY))
602 			continue;
603 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
604 		    (igmpa->igmpa_group == 0) ||
605 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
606 			if (ilm->ilm_timer > timer) {
607 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
608 				if (ilm->ilm_timer < next)
609 					next = ilm->ilm_timer;
610 				ilm->ilm_timer += current;
611 			}
612 		}
613 	}
614 	mutex_exit(&ill->ill_lock);
615 
616 	return (next);
617 }
618 
619 static uint_t
620 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
621 {
622 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
623 	uint_t		current;
624 	ilm_t		*ilm;
625 	ipaddr_t	*src_array;
626 	uint8_t		qrv;
627 	ip_stack_t	 *ipst;
628 
629 	ipst = ill->ill_ipst;
630 	/* make sure numsrc matches packet size */
631 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
632 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
633 		++ipst->ips_igmpstat.igps_rcv_tooshort;
634 		return (0);
635 	}
636 	src_array = (ipaddr_t *)&igmp3qa[1];
637 
638 	++ipst->ips_igmpstat.igps_rcv_queries;
639 
640 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
641 		uint_t hdrval, mant, exp;
642 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
643 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
644 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
645 		mrd = (mant | 0x10) << (exp + 3);
646 	}
647 	if (mrd == 0)
648 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
649 	timer = DSEC_TO_MSEC(mrd);
650 	MCAST_RANDOM_DELAY(delay, timer);
651 	next = (unsigned)INFINITY;
652 	current = CURRENT_MSTIME;
653 
654 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
655 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
656 	else
657 		ill->ill_mcast_rv = qrv;
658 
659 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
660 		uint_t hdrval, mant, exp;
661 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
662 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
663 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
664 		qqi = (mant | 0x10) << (exp + 3);
665 	}
666 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
667 
668 	/*
669 	 * If we have a pending general query response that's scheduled
670 	 * sooner than the delay we calculated for this response, then
671 	 * no action is required (RFC3376 section 5.2 rule 1)
672 	 */
673 	mutex_enter(&ill->ill_lock);
674 	if (ill->ill_global_timer < (current + delay)) {
675 		mutex_exit(&ill->ill_lock);
676 		return (next);
677 	}
678 	mutex_exit(&ill->ill_lock);
679 
680 	/*
681 	 * Now take action depending upon query type:
682 	 * general, group specific, or group/source specific.
683 	 */
684 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
685 		/*
686 		 * general query
687 		 * We know global timer is either not running or is
688 		 * greater than our calculated delay, so reset it to
689 		 * our delay (random value in range [0, response time]).
690 		 */
691 		mutex_enter(&ill->ill_lock);
692 		ill->ill_global_timer =  current + delay;
693 		mutex_exit(&ill->ill_lock);
694 		next = delay;
695 
696 	} else {
697 		/* group or group/source specific query */
698 		mutex_enter(&ill->ill_lock);
699 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
700 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
701 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
702 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
703 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
704 				continue;
705 			/*
706 			 * If the query is group specific or we have a
707 			 * pending group specific query, the response is
708 			 * group specific (pending sources list should be
709 			 * empty).  Otherwise, need to update the pending
710 			 * sources list for the group and source specific
711 			 * response.
712 			 */
713 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
714 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
715 group_query:
716 				FREE_SLIST(ilm->ilm_pendsrcs);
717 				ilm->ilm_pendsrcs = NULL;
718 			} else {
719 				boolean_t overflow;
720 				slist_t *pktl;
721 				if (numsrc > MAX_FILTER_SIZE ||
722 				    (ilm->ilm_pendsrcs == NULL &&
723 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
724 					/*
725 					 * We've been sent more sources than
726 					 * we can deal with; or we can't deal
727 					 * with a source list at all.  Revert
728 					 * to a group specific query.
729 					 */
730 					goto group_query;
731 				}
732 				if ((pktl = l_alloc()) == NULL)
733 					goto group_query;
734 				pktl->sl_numsrc = numsrc;
735 				for (i = 0; i < numsrc; i++)
736 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
737 					    &(pktl->sl_addr[i]));
738 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
739 				    &overflow);
740 				l_free(pktl);
741 				if (overflow)
742 					goto group_query;
743 			}
744 
745 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
746 			    INFINITY : (ilm->ilm_timer - current);
747 			/* choose soonest timer */
748 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
749 			if (ilm->ilm_timer < next)
750 				next = ilm->ilm_timer;
751 			ilm->ilm_timer += current;
752 		}
753 		mutex_exit(&ill->ill_lock);
754 	}
755 
756 	return (next);
757 }
758 
759 void
760 igmp_joingroup(ilm_t *ilm)
761 {
762 	uint_t	timer;
763 	ill_t	*ill;
764 	ip_stack_t	*ipst = ilm->ilm_ipst;
765 
766 	ill = ilm->ilm_ipif->ipif_ill;
767 
768 	ASSERT(IAM_WRITER_ILL(ill));
769 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
770 
771 	mutex_enter(&ill->ill_lock);
772 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
773 		ilm->ilm_rtx.rtx_timer = INFINITY;
774 		ilm->ilm_state = IGMP_OTHERMEMBER;
775 		mutex_exit(&ill->ill_lock);
776 	} else {
777 		ip1dbg(("Querier mode %d, sending report, group %x\n",
778 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
779 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
780 			mutex_exit(&ill->ill_lock);
781 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
782 			mutex_enter(&ill->ill_lock);
783 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
784 			mutex_exit(&ill->ill_lock);
785 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
786 			mutex_enter(&ill->ill_lock);
787 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
788 			mrec_t *rp;
789 			mcast_record_t rtype;
790 			/*
791 			 * The possible state changes we need to handle here:
792 			 *   Old State	New State	Report
793 			 *
794 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
795 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
796 			 *
797 			 * No need to send the BLOCK(0) report; ALLOW(X)
798 			 * is enough.
799 			 */
800 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
801 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
802 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
803 			    ilm->ilm_filter, NULL);
804 			mutex_exit(&ill->ill_lock);
805 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
806 			mutex_enter(&ill->ill_lock);
807 			/*
808 			 * Set up retransmission state.  Timer is set below,
809 			 * for both v3 and older versions.
810 			 */
811 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
812 			    ilm->ilm_filter);
813 		}
814 
815 		/* Set the ilm timer value */
816 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
817 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
818 		timer = ilm->ilm_rtx.rtx_timer;
819 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
820 		ilm->ilm_state = IGMP_IREPORTEDLAST;
821 		mutex_exit(&ill->ill_lock);
822 
823 		/*
824 		 * To avoid deadlock, we defer igmp_start_timers() to
825 		 * ipsq_exit().  See the comment in ipsq_exit() for details.
826 		 */
827 		mutex_enter(&ipst->ips_igmp_timer_lock);
828 		ipst->ips_igmp_deferred_next = MIN(timer,
829 		    ipst->ips_igmp_deferred_next);
830 		mutex_exit(&ipst->ips_igmp_timer_lock);
831 	}
832 
833 	if (ip_debug > 1) {
834 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
835 		    "igmp_joingroup: multicast_type %d timer %d",
836 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
837 		    (int)ntohl(timer));
838 	}
839 }
840 
841 void
842 mld_joingroup(ilm_t *ilm)
843 {
844 	uint_t	timer;
845 	ill_t	*ill;
846 	ip_stack_t	*ipst = ilm->ilm_ipst;
847 
848 	ill = ilm->ilm_ill;
849 
850 	ASSERT(IAM_WRITER_ILL(ill));
851 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
852 
853 	mutex_enter(&ill->ill_lock);
854 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
855 		ilm->ilm_rtx.rtx_timer = INFINITY;
856 		ilm->ilm_state = IGMP_OTHERMEMBER;
857 		mutex_exit(&ill->ill_lock);
858 	} else {
859 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
860 			mutex_exit(&ill->ill_lock);
861 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
862 			mutex_enter(&ill->ill_lock);
863 		} else {
864 			mrec_t *rp;
865 			mcast_record_t rtype;
866 			/*
867 			 * The possible state changes we need to handle here:
868 			 *	Old State   New State	Report
869 			 *
870 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
871 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
872 			 *
873 			 * No need to send the BLOCK(0) report; ALLOW(X)
874 			 * is enough
875 			 */
876 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
877 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
878 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
879 			    ilm->ilm_filter, NULL);
880 			mutex_exit(&ill->ill_lock);
881 			mldv2_sendrpt(ill, rp);
882 			mutex_enter(&ill->ill_lock);
883 			/*
884 			 * Set up retransmission state.  Timer is set below,
885 			 * for both v2 and v1.
886 			 */
887 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
888 			    ilm->ilm_filter);
889 		}
890 
891 		/* Set the ilm timer value */
892 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
893 		    ilm->ilm_rtx.rtx_cnt > 0);
894 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
895 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
896 		timer = ilm->ilm_rtx.rtx_timer;
897 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
898 		ilm->ilm_state = IGMP_IREPORTEDLAST;
899 		mutex_exit(&ill->ill_lock);
900 
901 		/*
902 		 * To avoid deadlock, we defer mld_start_timers() to
903 		 * ipsq_exit().  See the comment in ipsq_exit() for details.
904 		 */
905 		mutex_enter(&ipst->ips_mld_timer_lock);
906 		ipst->ips_mld_deferred_next = MIN(timer,
907 		    ipst->ips_mld_deferred_next);
908 		mutex_exit(&ipst->ips_mld_timer_lock);
909 	}
910 
911 	if (ip_debug > 1) {
912 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
913 		    "mld_joingroup: multicast_type %d timer %d",
914 		    (ilm->ilm_ill->ill_mcast_type),
915 		    (int)ntohl(timer));
916 	}
917 }
918 
919 void
920 igmp_leavegroup(ilm_t *ilm)
921 {
922 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
923 
924 	ASSERT(ilm->ilm_ill == NULL);
925 	ASSERT(!ill->ill_isv6);
926 
927 	mutex_enter(&ill->ill_lock);
928 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
929 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
930 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
931 		mutex_exit(&ill->ill_lock);
932 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
933 		    (htonl(INADDR_ALLRTRS_GROUP)));
934 		return;
935 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
936 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
937 		mrec_t *rp;
938 		/*
939 		 * The possible state changes we need to handle here:
940 		 *	Old State	New State	Report
941 		 *
942 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
943 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
944 		 *
945 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
946 		 */
947 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
948 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
949 			    ilm->ilm_filter, NULL);
950 		} else {
951 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
952 			    NULL, NULL);
953 		}
954 		mutex_exit(&ill->ill_lock);
955 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
956 		return;
957 	}
958 	mutex_exit(&ill->ill_lock);
959 }
960 
961 void
962 mld_leavegroup(ilm_t *ilm)
963 {
964 	ill_t *ill = ilm->ilm_ill;
965 
966 	ASSERT(ilm->ilm_ipif == NULL);
967 	ASSERT(ill->ill_isv6);
968 
969 	mutex_enter(&ill->ill_lock);
970 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
971 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
972 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
973 		mutex_exit(&ill->ill_lock);
974 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
975 		return;
976 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
977 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
978 		mrec_t *rp;
979 		/*
980 		 * The possible state changes we need to handle here:
981 		 *	Old State	New State	Report
982 		 *
983 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
984 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
985 		 *
986 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
987 		 */
988 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
989 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
990 			    ilm->ilm_filter, NULL);
991 		} else {
992 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
993 			    NULL, NULL);
994 		}
995 		mutex_exit(&ill->ill_lock);
996 		mldv2_sendrpt(ill, rp);
997 		return;
998 	}
999 	mutex_exit(&ill->ill_lock);
1000 }
1001 
1002 void
1003 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1004 {
1005 	ill_t *ill;
1006 	mrec_t *rp;
1007 	ip_stack_t	*ipst = ilm->ilm_ipst;
1008 
1009 	ASSERT(ilm != NULL);
1010 
1011 	/* state change reports should only be sent if the router is v3 */
1012 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
1013 		return;
1014 
1015 	if (ilm->ilm_ill == NULL) {
1016 		ASSERT(ilm->ilm_ipif != NULL);
1017 		ill = ilm->ilm_ipif->ipif_ill;
1018 	} else {
1019 		ill = ilm->ilm_ill;
1020 	}
1021 
1022 	mutex_enter(&ill->ill_lock);
1023 
1024 	/*
1025 	 * Compare existing(old) state with the new state and prepare
1026 	 * State Change Report, according to the rules in RFC 3376:
1027 	 *
1028 	 *	Old State	New State	State Change Report
1029 	 *
1030 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1031 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1032 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1033 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1034 	 */
1035 
1036 	if (ilm->ilm_fmode == fmode) {
1037 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1038 		slist_t *allow, *block;
1039 		if (((a_minus_b = l_alloc()) == NULL) ||
1040 		    ((b_minus_a = l_alloc()) == NULL)) {
1041 			l_free(a_minus_b);
1042 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1043 				goto send_to_ex;
1044 			else
1045 				goto send_to_in;
1046 		}
1047 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1048 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1049 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1050 			allow = b_minus_a;
1051 			block = a_minus_b;
1052 		} else {
1053 			allow = a_minus_b;
1054 			block = b_minus_a;
1055 		}
1056 		rp = NULL;
1057 		if (!SLIST_IS_EMPTY(allow))
1058 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1059 			    allow, rp);
1060 		if (!SLIST_IS_EMPTY(block))
1061 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1062 			    block, rp);
1063 		l_free(a_minus_b);
1064 		l_free(b_minus_a);
1065 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1066 send_to_ex:
1067 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1068 		    NULL);
1069 	} else {
1070 send_to_in:
1071 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1072 		    NULL);
1073 	}
1074 
1075 	/*
1076 	 * Need to set up retransmission state; merge the new info with the
1077 	 * current state (which may be null).  If the timer is not currently
1078 	 * running, start it (need to do a delayed start of the timer as
1079 	 * we're currently in the sq).
1080 	 */
1081 	rp = mcast_merge_rtx(ilm, rp, flist);
1082 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1083 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1084 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1085 		mutex_enter(&ipst->ips_igmp_timer_lock);
1086 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1087 		    ilm->ilm_rtx.rtx_timer);
1088 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1089 		mutex_exit(&ipst->ips_igmp_timer_lock);
1090 	}
1091 
1092 	mutex_exit(&ill->ill_lock);
1093 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1094 }
1095 
1096 void
1097 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1098 {
1099 	ill_t *ill;
1100 	mrec_t *rp = NULL;
1101 	ip_stack_t	*ipst = ilm->ilm_ipst;
1102 
1103 	ASSERT(ilm != NULL);
1104 
1105 	ill = ilm->ilm_ill;
1106 
1107 	/* only need to send if we have an mldv2-capable router */
1108 	mutex_enter(&ill->ill_lock);
1109 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1110 		mutex_exit(&ill->ill_lock);
1111 		return;
1112 	}
1113 
1114 	/*
1115 	 * Compare existing (old) state with the new state passed in
1116 	 * and send appropriate MLDv2 State Change Report.
1117 	 *
1118 	 *	Old State	New State	State Change Report
1119 	 *
1120 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1121 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1122 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1123 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1124 	 */
1125 	if (ilm->ilm_fmode == fmode) {
1126 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1127 		slist_t *allow, *block;
1128 		if (((a_minus_b = l_alloc()) == NULL) ||
1129 		    ((b_minus_a = l_alloc()) == NULL)) {
1130 			l_free(a_minus_b);
1131 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1132 				goto send_to_ex;
1133 			else
1134 				goto send_to_in;
1135 		}
1136 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1137 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1138 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1139 			allow = b_minus_a;
1140 			block = a_minus_b;
1141 		} else {
1142 			allow = a_minus_b;
1143 			block = b_minus_a;
1144 		}
1145 		if (!SLIST_IS_EMPTY(allow))
1146 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1147 			    allow, rp);
1148 		if (!SLIST_IS_EMPTY(block))
1149 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1150 			    block, rp);
1151 		l_free(a_minus_b);
1152 		l_free(b_minus_a);
1153 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1154 send_to_ex:
1155 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1156 		    NULL);
1157 	} else {
1158 send_to_in:
1159 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1160 		    NULL);
1161 	}
1162 
1163 	/*
1164 	 * Need to set up retransmission state; merge the new info with the
1165 	 * current state (which may be null).  If the timer is not currently
1166 	 * running, start it (need to do a deferred start of the timer as
1167 	 * we're currently in the sq).
1168 	 */
1169 	rp = mcast_merge_rtx(ilm, rp, flist);
1170 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1171 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1172 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1173 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1174 		mutex_enter(&ipst->ips_mld_timer_lock);
1175 		ipst->ips_mld_deferred_next =
1176 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1177 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1178 		mutex_exit(&ipst->ips_mld_timer_lock);
1179 	}
1180 
1181 	mutex_exit(&ill->ill_lock);
1182 	mldv2_sendrpt(ill, rp);
1183 }
1184 
1185 uint_t
1186 igmp_timeout_handler_per_ill(ill_t *ill)
1187 {
1188 	uint_t	next = INFINITY, current;
1189 	ilm_t	*ilm;
1190 	ipif_t	*ipif;
1191 	mrec_t	*rp = NULL;
1192 	mrec_t	*rtxrp = NULL;
1193 	rtx_state_t *rtxp;
1194 	mcast_record_t	rtype;
1195 
1196 	ASSERT(IAM_WRITER_ILL(ill));
1197 
1198 	mutex_enter(&ill->ill_lock);
1199 
1200 	current = CURRENT_MSTIME;
1201 	/* First check the global timer on this interface */
1202 	if (ill->ill_global_timer == INFINITY)
1203 		goto per_ilm_timer;
1204 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1205 		ill->ill_global_timer = INFINITY;
1206 		/*
1207 		 * Send report for each group on this interface.
1208 		 * Since we just set the global timer (received a v3 general
1209 		 * query), need to skip the all hosts addr (224.0.0.1), per
1210 		 * RFC 3376 section 5.
1211 		 */
1212 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1213 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1214 				continue;
1215 			ASSERT(ilm->ilm_ipif != NULL);
1216 			ilm->ilm_ipif->ipif_igmp_rpt =
1217 			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1218 			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
1219 			/*
1220 			 * Since we're sending a report on this group, okay
1221 			 * to delete pending group-specific timers.  Note
1222 			 * that group-specific retransmit timers still need
1223 			 * to be checked in the per_ilm_timer for-loop.
1224 			 */
1225 			ilm->ilm_timer = INFINITY;
1226 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1227 			FREE_SLIST(ilm->ilm_pendsrcs);
1228 			ilm->ilm_pendsrcs = NULL;
1229 		}
1230 		/*
1231 		 * We've built per-ipif mrec lists; walk the ill's ipif list
1232 		 * and send a report for each ipif that has an mrec list.
1233 		 */
1234 		for (ipif = ill->ill_ipif; ipif != NULL;
1235 		    ipif = ipif->ipif_next) {
1236 			if (ipif->ipif_igmp_rpt == NULL)
1237 				continue;
1238 			mutex_exit(&ill->ill_lock);
1239 			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
1240 			mutex_enter(&ill->ill_lock);
1241 			/* mrec list was freed by igmpv3_sendrpt() */
1242 			ipif->ipif_igmp_rpt = NULL;
1243 		}
1244 	} else {
1245 		if ((ill->ill_global_timer - current) < next)
1246 			next = ill->ill_global_timer - current;
1247 	}
1248 
1249 per_ilm_timer:
1250 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1251 		if (ilm->ilm_timer == INFINITY)
1252 			goto per_ilm_rtxtimer;
1253 
1254 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1255 			if ((ilm->ilm_timer - current) < next)
1256 				next = ilm->ilm_timer - current;
1257 
1258 			if (ip_debug > 1) {
1259 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1260 				    "igmp_timo_hlr 2: ilm_timr %d "
1261 				    "typ %d nxt %d",
1262 				    (int)ntohl(ilm->ilm_timer - current),
1263 				    (ill->ill_mcast_type), next);
1264 			}
1265 
1266 			goto per_ilm_rtxtimer;
1267 		}
1268 
1269 		/* the timer has expired, need to take action */
1270 		ilm->ilm_timer = INFINITY;
1271 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1272 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1273 			mutex_exit(&ill->ill_lock);
1274 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1275 			mutex_enter(&ill->ill_lock);
1276 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1277 			mutex_exit(&ill->ill_lock);
1278 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1279 			mutex_enter(&ill->ill_lock);
1280 		} else {
1281 			slist_t *rsp;
1282 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1283 			    (rsp = l_alloc()) != NULL) {
1284 				/*
1285 				 * Contents of reply depend on pending
1286 				 * requested source list.
1287 				 */
1288 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1289 					l_intersection(ilm->ilm_filter,
1290 					    ilm->ilm_pendsrcs, rsp);
1291 				} else {
1292 					l_difference(ilm->ilm_pendsrcs,
1293 					    ilm->ilm_filter, rsp);
1294 				}
1295 				FREE_SLIST(ilm->ilm_pendsrcs);
1296 				ilm->ilm_pendsrcs = NULL;
1297 				if (!SLIST_IS_EMPTY(rsp))
1298 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1299 					    &ilm->ilm_v6addr, rsp, rp);
1300 				FREE_SLIST(rsp);
1301 			} else {
1302 				/*
1303 				 * Either the pending request is just group-
1304 				 * specific, or we couldn't get the resources
1305 				 * (rsp) to build a source-specific reply.
1306 				 */
1307 				rp = mcast_bldmrec(ilm->ilm_fmode,
1308 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1309 			}
1310 			mutex_exit(&ill->ill_lock);
1311 			igmpv3_sendrpt(ill->ill_ipif, rp);
1312 			mutex_enter(&ill->ill_lock);
1313 			rp = NULL;
1314 		}
1315 
1316 per_ilm_rtxtimer:
1317 		rtxp = &ilm->ilm_rtx;
1318 
1319 		if (rtxp->rtx_timer == INFINITY)
1320 			continue;
1321 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1322 			if ((rtxp->rtx_timer - current) < next)
1323 				next = rtxp->rtx_timer - current;
1324 			continue;
1325 		}
1326 
1327 		rtxp->rtx_timer = INFINITY;
1328 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1329 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1330 			mutex_exit(&ill->ill_lock);
1331 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1332 			mutex_enter(&ill->ill_lock);
1333 			continue;
1334 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1335 			mutex_exit(&ill->ill_lock);
1336 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1337 			mutex_enter(&ill->ill_lock);
1338 			continue;
1339 		}
1340 
1341 		/*
1342 		 * The retransmit timer has popped, and our router is
1343 		 * IGMPv3.  We have to delve into the retransmit state
1344 		 * stored in the ilm.
1345 		 *
1346 		 * Decrement the retransmit count.  If the fmode rtx
1347 		 * count is active, decrement it, and send a filter
1348 		 * mode change report with the ilm's source list.
1349 		 * Otherwise, send a source list change report with
1350 		 * the current retransmit lists.
1351 		 */
1352 		ASSERT(rtxp->rtx_cnt > 0);
1353 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1354 		rtxp->rtx_cnt--;
1355 		if (rtxp->rtx_fmode_cnt > 0) {
1356 			rtxp->rtx_fmode_cnt--;
1357 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1358 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1359 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1360 			    ilm->ilm_filter, rtxrp);
1361 		} else {
1362 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1363 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1364 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1365 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1366 		}
1367 		if (rtxp->rtx_cnt > 0) {
1368 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1369 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1370 			if (rtxp->rtx_timer < next)
1371 				next = rtxp->rtx_timer;
1372 			rtxp->rtx_timer += current;
1373 		} else {
1374 			CLEAR_SLIST(rtxp->rtx_allow);
1375 			CLEAR_SLIST(rtxp->rtx_block);
1376 		}
1377 		mutex_exit(&ill->ill_lock);
1378 		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
1379 		mutex_enter(&ill->ill_lock);
1380 		rtxrp = NULL;
1381 	}
1382 
1383 	mutex_exit(&ill->ill_lock);
1384 
1385 	return (next);
1386 }
1387 
1388 /*
1389  * igmp_timeout_handler:
1390  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1391  * Returns number of ticks to next event (or 0 if none).
1392  *
1393  * As part of multicast join and leave igmp we may need to send out an
1394  * igmp request. The igmp related state variables in the ilm are protected
1395  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1396  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1397  * starts the igmp timer if needed. It serializes multiple threads trying to
1398  * simultaneously start the timer using the igmp_timer_setter_active flag.
1399  *
1400  * igmp_input() receives igmp queries and responds to the queries
1401  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1402  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1403  * performs the action exclusively after entering each ill's ipsq as writer.
1404  * The actual igmp timeout handler needs to run in the ipsq since it has to
1405  * access the ilm's and we don't want another exclusive operation like
1406  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1407  * another.
1408  *
1409  * The igmp_slowtimeo() function is called thru another timer.
1410  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1411  */
1412 void
1413 igmp_timeout_handler(void *arg)
1414 {
1415 	ill_t	*ill;
1416 	uint_t  global_next = INFINITY;
1417 	uint_t  next;
1418 	ill_walk_context_t ctx;
1419 	boolean_t success;
1420 	ip_stack_t *ipst = arg;
1421 
1422 	ASSERT(arg != NULL);
1423 	mutex_enter(&ipst->ips_igmp_timer_lock);
1424 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1425 	ipst->ips_igmp_timer_thread = curthread;
1426 	ipst->ips_igmp_timer_scheduled_last = 0;
1427 	ipst->ips_igmp_time_to_next = 0;
1428 	mutex_exit(&ipst->ips_igmp_timer_lock);
1429 
1430 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1431 	ill = ILL_START_WALK_V4(&ctx, ipst);
1432 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1433 		ASSERT(!ill->ill_isv6);
1434 		/*
1435 		 * We may not be able to refhold the ill if the ill/ipif
1436 		 * is changing. But we need to make sure that the ill will
1437 		 * not vanish. So we just bump up the ill_waiter count.
1438 		 */
1439 		if (!ill_waiter_inc(ill))
1440 			continue;
1441 		rw_exit(&ipst->ips_ill_g_lock);
1442 		success = ipsq_enter(ill, B_TRUE);
1443 		if (success) {
1444 			next = igmp_timeout_handler_per_ill(ill);
1445 			if (next < global_next)
1446 				global_next = next;
1447 			ipsq_exit(ill->ill_phyint->phyint_ipsq);
1448 		}
1449 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1450 		ill_waiter_dcr(ill);
1451 	}
1452 	rw_exit(&ipst->ips_ill_g_lock);
1453 
1454 	mutex_enter(&ipst->ips_igmp_timer_lock);
1455 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1456 	ipst->ips_igmp_timeout_id = 0;
1457 	ipst->ips_igmp_timer_thread = NULL;
1458 	mutex_exit(&ipst->ips_igmp_timer_lock);
1459 
1460 	if (global_next != INFINITY)
1461 		igmp_start_timers(global_next, ipst);
1462 }
1463 
1464 /*
1465  * mld_timeout_handler:
1466  * Called when there are timeout events, every next (tick).
1467  * Returns number of ticks to next event (or 0 if none).
1468  */
1469 /* ARGSUSED */
1470 uint_t
1471 mld_timeout_handler_per_ill(ill_t *ill)
1472 {
1473 	ilm_t 	*ilm;
1474 	uint_t	next = INFINITY, current;
1475 	mrec_t	*rp, *rtxrp;
1476 	rtx_state_t *rtxp;
1477 	mcast_record_t	rtype;
1478 
1479 	ASSERT(IAM_WRITER_ILL(ill));
1480 
1481 	mutex_enter(&ill->ill_lock);
1482 
1483 	current = CURRENT_MSTIME;
1484 	/*
1485 	 * First check the global timer on this interface; the global timer
1486 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1487 	 */
1488 	if (ill->ill_global_timer == INFINITY)
1489 		goto per_ilm_timer;
1490 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1491 		ill->ill_global_timer = INFINITY;
1492 		/*
1493 		 * Send report for each group on this interface.
1494 		 * Since we just set the global timer (received a v2 general
1495 		 * query), need to skip the all hosts addr (ff02::1), per
1496 		 * RFC 3810 section 6.
1497 		 */
1498 		rp = NULL;
1499 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1500 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1501 			    &ipv6_all_hosts_mcast))
1502 				continue;
1503 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1504 			    ilm->ilm_filter, rp);
1505 			/*
1506 			 * Since we're sending a report on this group, okay
1507 			 * to delete pending group-specific timers.  Note
1508 			 * that group-specific retransmit timers still need
1509 			 * to be checked in the per_ilm_timer for-loop.
1510 			 */
1511 			ilm->ilm_timer = INFINITY;
1512 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1513 			FREE_SLIST(ilm->ilm_pendsrcs);
1514 			ilm->ilm_pendsrcs = NULL;
1515 		}
1516 		mutex_exit(&ill->ill_lock);
1517 		mldv2_sendrpt(ill, rp);
1518 		mutex_enter(&ill->ill_lock);
1519 	} else {
1520 		if ((ill->ill_global_timer - current) < next)
1521 			next = ill->ill_global_timer - current;
1522 	}
1523 
1524 per_ilm_timer:
1525 	rp = rtxrp = NULL;
1526 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1527 		if (ilm->ilm_timer == INFINITY)
1528 			goto per_ilm_rtxtimer;
1529 
1530 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1531 			if ((ilm->ilm_timer - current) < next)
1532 				next = ilm->ilm_timer - current;
1533 
1534 			if (ip_debug > 1) {
1535 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1536 				    "igmp_timo_hlr 2: ilm_timr"
1537 				    " %d typ %d nxt %d",
1538 				    (int)ntohl(ilm->ilm_timer - current),
1539 				    (ill->ill_mcast_type), next);
1540 			}
1541 
1542 			goto per_ilm_rtxtimer;
1543 		}
1544 
1545 		/* the timer has expired, need to take action */
1546 		ilm->ilm_timer = INFINITY;
1547 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1548 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1549 			mutex_exit(&ill->ill_lock);
1550 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1551 			mutex_enter(&ill->ill_lock);
1552 		} else {
1553 			slist_t *rsp;
1554 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1555 			    (rsp = l_alloc()) != NULL) {
1556 				/*
1557 				 * Contents of reply depend on pending
1558 				 * requested source list.
1559 				 */
1560 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1561 					l_intersection(ilm->ilm_filter,
1562 					    ilm->ilm_pendsrcs, rsp);
1563 				} else {
1564 					l_difference(ilm->ilm_pendsrcs,
1565 					    ilm->ilm_filter, rsp);
1566 				}
1567 				FREE_SLIST(ilm->ilm_pendsrcs);
1568 				ilm->ilm_pendsrcs = NULL;
1569 				if (!SLIST_IS_EMPTY(rsp))
1570 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1571 					    &ilm->ilm_v6addr, rsp, rp);
1572 				FREE_SLIST(rsp);
1573 			} else {
1574 				rp = mcast_bldmrec(ilm->ilm_fmode,
1575 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1576 			}
1577 		}
1578 
1579 per_ilm_rtxtimer:
1580 		rtxp = &ilm->ilm_rtx;
1581 
1582 		if (rtxp->rtx_timer == INFINITY)
1583 			continue;
1584 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1585 			if ((rtxp->rtx_timer - current) < next)
1586 				next = rtxp->rtx_timer - current;
1587 			continue;
1588 		}
1589 
1590 		rtxp->rtx_timer = INFINITY;
1591 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1592 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1593 			mutex_exit(&ill->ill_lock);
1594 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1595 			mutex_enter(&ill->ill_lock);
1596 			continue;
1597 		}
1598 
1599 		/*
1600 		 * The retransmit timer has popped, and our router is
1601 		 * MLDv2.  We have to delve into the retransmit state
1602 		 * stored in the ilm.
1603 		 *
1604 		 * Decrement the retransmit count.  If the fmode rtx
1605 		 * count is active, decrement it, and send a filter
1606 		 * mode change report with the ilm's source list.
1607 		 * Otherwise, send a source list change report with
1608 		 * the current retransmit lists.
1609 		 */
1610 		ASSERT(rtxp->rtx_cnt > 0);
1611 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1612 		rtxp->rtx_cnt--;
1613 		if (rtxp->rtx_fmode_cnt > 0) {
1614 			rtxp->rtx_fmode_cnt--;
1615 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1616 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1617 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1618 			    ilm->ilm_filter, rtxrp);
1619 		} else {
1620 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1621 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1622 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1623 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1624 		}
1625 		if (rtxp->rtx_cnt > 0) {
1626 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1627 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1628 			if (rtxp->rtx_timer < next)
1629 				next = rtxp->rtx_timer;
1630 			rtxp->rtx_timer += current;
1631 		} else {
1632 			CLEAR_SLIST(rtxp->rtx_allow);
1633 			CLEAR_SLIST(rtxp->rtx_block);
1634 		}
1635 	}
1636 
1637 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1638 		mutex_exit(&ill->ill_lock);
1639 		mldv2_sendrpt(ill, rp);
1640 		mldv2_sendrpt(ill, rtxrp);
1641 		return (next);
1642 	}
1643 
1644 	mutex_exit(&ill->ill_lock);
1645 
1646 	return (next);
1647 }
1648 
1649 /*
1650  * mld_timeout_handler:
1651  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1652  * Returns number of ticks to next event (or 0 if none).
1653  * MT issues are same as igmp_timeout_handler
1654  */
1655 void
1656 mld_timeout_handler(void *arg)
1657 {
1658 	ill_t	*ill;
1659 	uint_t  global_next = INFINITY;
1660 	uint_t  next;
1661 	ill_walk_context_t ctx;
1662 	boolean_t success;
1663 	ip_stack_t *ipst = arg;
1664 
1665 	ASSERT(arg != NULL);
1666 	mutex_enter(&ipst->ips_mld_timer_lock);
1667 	ASSERT(ipst->ips_mld_timeout_id != 0);
1668 	ipst->ips_mld_timer_thread = curthread;
1669 	ipst->ips_mld_timer_scheduled_last = 0;
1670 	ipst->ips_mld_time_to_next = 0;
1671 	mutex_exit(&ipst->ips_mld_timer_lock);
1672 
1673 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674 	ill = ILL_START_WALK_V6(&ctx, ipst);
1675 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1676 		ASSERT(ill->ill_isv6);
1677 		/*
1678 		 * We may not be able to refhold the ill if the ill/ipif
1679 		 * is changing. But we need to make sure that the ill will
1680 		 * not vanish. So we just bump up the ill_waiter count.
1681 		 */
1682 		if (!ill_waiter_inc(ill))
1683 			continue;
1684 		rw_exit(&ipst->ips_ill_g_lock);
1685 		success = ipsq_enter(ill, B_TRUE);
1686 		if (success) {
1687 			next = mld_timeout_handler_per_ill(ill);
1688 			if (next < global_next)
1689 				global_next = next;
1690 			ipsq_exit(ill->ill_phyint->phyint_ipsq);
1691 		}
1692 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1693 		ill_waiter_dcr(ill);
1694 	}
1695 	rw_exit(&ipst->ips_ill_g_lock);
1696 
1697 	mutex_enter(&ipst->ips_mld_timer_lock);
1698 	ASSERT(ipst->ips_mld_timeout_id != 0);
1699 	ipst->ips_mld_timeout_id = 0;
1700 	ipst->ips_mld_timer_thread = NULL;
1701 	mutex_exit(&ipst->ips_mld_timer_lock);
1702 
1703 	if (global_next != INFINITY)
1704 		mld_start_timers(global_next, ipst);
1705 }
1706 
/*
 * Older Version Querier Present timeout for the given ill, expressed as a
 * count of slowtimo intervals:
 *
 *	((ill_mcast_rv * ill_mcast_qi) + MCAST_QUERY_RESP_INTERVAL) * 1000
 *	    / MCAST_SLOWTIMO_INTERVAL
 */
#define	OVQP(ill)							\
	((((((ill)->ill_mcast_rv) * ((ill)->ill_mcast_qi))		\
	+ MCAST_QUERY_RESP_INTERVAL) * 1000) / MCAST_SLOWTIMO_INTERVAL)
1714 
1715 /*
1716  * igmp_slowtimo:
1717  * - Resets to new router if we didnt we hear from the router
1718  *   in IGMP_AGE_THRESHOLD seconds.
1719  * - Resets slowtimeout.
1720  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1721  * IGMP version than configured.
1722  */
1723 void
1724 igmp_slowtimo(void *arg)
1725 {
1726 	ill_t	*ill;
1727 	ill_if_t *ifp;
1728 	avl_tree_t *avl_tree;
1729 	ip_stack_t *ipst = (ip_stack_t *)arg;
1730 
1731 	ASSERT(arg != NULL);
1732 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1733 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1734 
1735 	/*
1736 	 * The ill_if_t list is circular, hence the odd loop parameters.
1737 	 *
1738 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1739 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1740 	 * structure (allowing us to skip if none of the instances have timers
1741 	 * running).
1742 	 */
1743 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1744 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1745 	    ifp = ifp->illif_next) {
1746 		/*
1747 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1748 		 * a V1 or V2 query now and we miss seeing the count now,
1749 		 * we will see it the next time igmp_slowtimo is called.
1750 		 */
1751 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1752 			continue;
1753 
1754 		avl_tree = &ifp->illif_avl_by_ppa;
1755 		for (ill = avl_first(avl_tree); ill != NULL;
1756 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1757 			mutex_enter(&ill->ill_lock);
1758 			if (ill->ill_mcast_v1_tset == 1)
1759 				ill->ill_mcast_v1_time++;
1760 			if (ill->ill_mcast_v2_tset == 1)
1761 				ill->ill_mcast_v2_time++;
1762 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1763 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1764 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1765 				if ((ill->ill_mcast_v2_tset > 0) ||
1766 				    (ipst->ips_igmp_max_version ==
1767 				    IGMP_V2_ROUTER)) {
1768 					ip1dbg(("V1 query timer "
1769 					    "expired on %s; switching "
1770 					    "mode to IGMP_V2\n",
1771 					    ill->ill_name));
1772 					ill->ill_mcast_type =
1773 					    IGMP_V2_ROUTER;
1774 				} else {
1775 					ip1dbg(("V1 query timer "
1776 					    "expired on %s; switching "
1777 					    "mode to IGMP_V3\n",
1778 					    ill->ill_name));
1779 					ill->ill_mcast_type =
1780 					    IGMP_V3_ROUTER;
1781 				}
1782 				ill->ill_mcast_v1_time = 0;
1783 				ill->ill_mcast_v1_tset = 0;
1784 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1785 			}
1786 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1787 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1788 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1789 				ip1dbg(("V2 query timer expired on "
1790 				    "%s; switching mode to IGMP_V3\n",
1791 				    ill->ill_name));
1792 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1793 				ill->ill_mcast_v2_time = 0;
1794 				ill->ill_mcast_v2_tset = 0;
1795 				atomic_add_16(&ifp->illif_mcast_v2, -1);
1796 			}
1797 			mutex_exit(&ill->ill_lock);
1798 		}
1799 	}
1800 	rw_exit(&ipst->ips_ill_g_lock);
1801 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1802 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1803 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1804 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1805 }
1806 
1807 /*
1808  * mld_slowtimo:
1809  * - Resets to newer version if we didn't hear from the older version router
1810  *   in MLD_AGE_THRESHOLD seconds.
1811  * - Restarts slowtimeout.
1812  * Check for ips_mld_max_version ensures that we don't revert to a higher
1813  * IGMP version than configured.
1814  */
1815 /* ARGSUSED */
1816 void
1817 mld_slowtimo(void *arg)
1818 {
1819 	ill_t *ill;
1820 	ill_if_t *ifp;
1821 	avl_tree_t *avl_tree;
1822 	ip_stack_t *ipst = (ip_stack_t *)arg;
1823 
1824 	ASSERT(arg != NULL);
1825 	/* See comments in igmp_slowtimo() above... */
1826 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1827 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1828 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1829 	    ifp = ifp->illif_next) {
1830 		if (ifp->illif_mcast_v1 == 0)
1831 			continue;
1832 
1833 		avl_tree = &ifp->illif_avl_by_ppa;
1834 		for (ill = avl_first(avl_tree); ill != NULL;
1835 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1836 			mutex_enter(&ill->ill_lock);
1837 			if (ill->ill_mcast_v1_tset == 1)
1838 				ill->ill_mcast_v1_time++;
1839 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1840 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1841 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1842 				ip1dbg(("MLD query timer expired on"
1843 				    " %s; switching mode to MLD_V2\n",
1844 				    ill->ill_name));
1845 				ill->ill_mcast_type = MLD_V2_ROUTER;
1846 				ill->ill_mcast_v1_time = 0;
1847 				ill->ill_mcast_v1_tset = 0;
1848 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1849 			}
1850 			mutex_exit(&ill->ill_lock);
1851 		}
1852 	}
1853 	rw_exit(&ipst->ips_ill_g_lock);
1854 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1855 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1856 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1857 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1858 }
1859 
1860 /*
1861  * igmp_sendpkt:
1862  * This will send to ip_wput like icmp_inbound.
1863  * Note that the lower ill (on which the membership is kept) is used
1864  * as an upper ill to pass in the multicast parameters.
1865  */
1866 static void
1867 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1868 {
1869 	mblk_t	*mp;
1870 	igmpa_t	*igmpa;
1871 	uint8_t *rtralert;
1872 	ipha_t	*ipha;
1873 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1874 	size_t	size  = hdrlen + sizeof (igmpa_t);
1875 	ipif_t 	*ipif = ilm->ilm_ipif;
1876 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1877 	mblk_t	*first_mp;
1878 	ipsec_out_t *io;
1879 	zoneid_t zoneid;
1880 	ip_stack_t *ipst = ill->ill_ipst;
1881 
1882 	/*
1883 	 * We need to make sure this packet goes out on an ipif. If
1884 	 * there is some global policy match in ip_wput_ire, we need
1885 	 * to get to the right interface after IPSEC processing.
1886 	 * To make sure this multicast packet goes out on the right
1887 	 * interface, we attach an ipsec_out and initialize ill_index
1888 	 * like we did in ip_wput. To make sure that this packet does
1889 	 * not get forwarded on other interfaces or looped back, we
1890 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1891 	 * to B_FALSE.
1892 	 *
1893 	 * We also need to make sure that this does not get load balanced
1894 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1895 	 * here. If it gets load balanced, switches supporting igmp snooping
1896 	 * will send the packet that it receives for this multicast group
1897 	 * to the interface that we are sending on. As we have joined the
1898 	 * multicast group on this ill, by sending the packet out on this
1899 	 * ill, we receive all the packets back on this ill.
1900 	 */
1901 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1902 	if (first_mp == NULL)
1903 		return;
1904 
1905 	first_mp->b_datap->db_type = M_CTL;
1906 	first_mp->b_wptr += sizeof (ipsec_info_t);
1907 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1908 	/* ipsec_out_secure is B_FALSE now */
1909 	io = (ipsec_out_t *)first_mp->b_rptr;
1910 	io->ipsec_out_type = IPSEC_OUT;
1911 	io->ipsec_out_len = sizeof (ipsec_out_t);
1912 	io->ipsec_out_use_global_policy = B_TRUE;
1913 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1914 	io->ipsec_out_attach_if = B_TRUE;
1915 	io->ipsec_out_multicast_loop = B_FALSE;
1916 	io->ipsec_out_dontroute = B_TRUE;
1917 	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
1918 		zoneid = GLOBAL_ZONEID;
1919 	io->ipsec_out_zoneid = zoneid;
1920 	io->ipsec_out_ns = ipst->ips_netstack;	/* No netstack_hold */
1921 
1922 	mp = allocb(size, BPRI_HI);
1923 	if (mp == NULL) {
1924 		freemsg(first_mp);
1925 		return;
1926 	}
1927 	mp->b_wptr = mp->b_rptr + size;
1928 	first_mp->b_cont = mp;
1929 
1930 	ipha = (ipha_t *)mp->b_rptr;
1931 	rtralert = (uint8_t *)&(ipha[1]);
1932 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1933 	igmpa->igmpa_type   = type;
1934 	igmpa->igmpa_code   = 0;
1935 	igmpa->igmpa_group  = ilm->ilm_addr;
1936 	igmpa->igmpa_cksum  = 0;
1937 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1938 
1939 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1940 	rtralert[1] = RTRALERT_LEN;
1941 	rtralert[2] = 0;
1942 	rtralert[3] = 0;
1943 
1944 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1945 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1946 	ipha->ipha_type_of_service 	= 0;
1947 	ipha->ipha_length = htons(size);
1948 	ipha->ipha_ident = 0;
1949 	ipha->ipha_fragment_offset_and_flags = 0;
1950 	ipha->ipha_ttl 		= IGMP_TTL;
1951 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1952 	ipha->ipha_hdr_checksum 	= 0;
1953 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1954 	ipha->ipha_src 		= ipif->ipif_src_addr;
1955 	/*
1956 	 * Request loopback of the report if we are acting as a multicast
1957 	 * router, so that the process-level routing demon can hear it.
1958 	 */
1959 	/*
1960 	 * This will run multiple times for the same group if there are members
1961 	 * on the same group for multiple ipif's on the same ill. The
1962 	 * igmp_input code will suppress this due to the loopback thus we
1963 	 * always loopback membership report.
1964 	 */
1965 	ASSERT(ill->ill_rq != NULL);
1966 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1967 
1968 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
1969 
1970 	++ipst->ips_igmpstat.igps_snd_reports;
1971 }
1972 
1973 /*
1974  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1975  * with the passed-in ipif.  The report will contain one group record
1976  * for each element of reclist.  If this causes packet length to
1977  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1978  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1979  * and those buffers are freed here.
1980  */
1981 static void
1982 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1983 {
1984 	ipsec_out_t *io;
1985 	igmp3ra_t *igmp3ra;
1986 	grphdra_t *grphdr;
1987 	mblk_t *first_mp, *mp;
1988 	ipha_t *ipha;
1989 	uint8_t *rtralert;
1990 	ipaddr_t *src_array;
1991 	int i, j, numrec, more_src_cnt;
1992 	size_t hdrsize, size, rsize;
1993 	ill_t *ill = ipif->ipif_ill;
1994 	mrec_t *rp, *cur_reclist;
1995 	mrec_t *next_reclist = reclist;
1996 	boolean_t morepkts;
1997 	zoneid_t zoneid;
1998 	ip_stack_t	 *ipst = ill->ill_ipst;
1999 
2000 	/* if there aren't any records, there's nothing to send */
2001 	if (reclist == NULL)
2002 		return;
2003 
2004 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
2005 nextpkt:
2006 	size = hdrsize + sizeof (igmp3ra_t);
2007 	morepkts = B_FALSE;
2008 	more_src_cnt = 0;
2009 	cur_reclist = next_reclist;
2010 	numrec = 0;
2011 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2012 		rsize = sizeof (grphdra_t) +
2013 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
2014 		if (size + rsize > ill->ill_max_frag) {
2015 			if (rp == cur_reclist) {
2016 				/*
2017 				 * If the first mrec we looked at is too big
2018 				 * to fit in a single packet (i.e the source
2019 				 * list is too big), we must either truncate
2020 				 * the list (if TO_EX or IS_EX), or send
2021 				 * multiple reports for the same group (all
2022 				 * other types).
2023 				 */
2024 				int srcspace, srcsperpkt;
2025 				srcspace = ill->ill_max_frag - (size +
2026 				    sizeof (grphdra_t));
2027 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2028 				/*
2029 				 * Increment size and numrec, because we will
2030 				 * be sending a record for the mrec we're
2031 				 * looking at now.
2032 				 */
2033 				size += sizeof (grphdra_t) +
2034 				    (srcsperpkt * sizeof (ipaddr_t));
2035 				numrec++;
2036 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2037 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2038 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2039 					if (rp->mrec_next == NULL) {
2040 						/* no more packets to send */
2041 						break;
2042 					} else {
2043 						/*
2044 						 * more packets, but we're
2045 						 * done with this mrec.
2046 						 */
2047 						next_reclist = rp->mrec_next;
2048 					}
2049 				} else {
2050 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2051 					    - srcsperpkt;
2052 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2053 					/*
2054 					 * We'll fix up this mrec (remove the
2055 					 * srcs we've already sent) before
2056 					 * returning to nextpkt above.
2057 					 */
2058 					next_reclist = rp;
2059 				}
2060 			} else {
2061 				next_reclist = rp;
2062 			}
2063 			morepkts = B_TRUE;
2064 			break;
2065 		}
2066 		size += rsize;
2067 		numrec++;
2068 	}
2069 
2070 	/*
2071 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2072 	 * load balancing requirements.
2073 	 */
2074 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2075 	if (first_mp == NULL)
2076 		goto free_reclist;
2077 
2078 	first_mp->b_datap->db_type = M_CTL;
2079 	first_mp->b_wptr += sizeof (ipsec_info_t);
2080 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2081 	/* ipsec_out_secure is B_FALSE now */
2082 	io = (ipsec_out_t *)first_mp->b_rptr;
2083 	io->ipsec_out_type = IPSEC_OUT;
2084 	io->ipsec_out_len = sizeof (ipsec_out_t);
2085 	io->ipsec_out_use_global_policy = B_TRUE;
2086 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2087 	io->ipsec_out_attach_if = B_TRUE;
2088 	io->ipsec_out_multicast_loop = B_FALSE;
2089 	io->ipsec_out_dontroute = B_TRUE;
2090 	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
2091 		zoneid = GLOBAL_ZONEID;
2092 	io->ipsec_out_zoneid = zoneid;
2093 
2094 	mp = allocb(size, BPRI_HI);
2095 	if (mp == NULL) {
2096 		freemsg(first_mp);
2097 		goto free_reclist;
2098 	}
2099 	bzero((char *)mp->b_rptr, size);
2100 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2101 	first_mp->b_cont = mp;
2102 
2103 	ipha = (ipha_t *)mp->b_rptr;
2104 	rtralert = (uint8_t *)&(ipha[1]);
2105 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2106 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2107 
2108 	rp = cur_reclist;
2109 	for (i = 0; i < numrec; i++) {
2110 		grphdr->grphdra_type = rp->mrec_type;
2111 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2112 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2113 		src_array = (ipaddr_t *)&(grphdr[1]);
2114 
2115 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2116 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2117 
2118 		grphdr = (grphdra_t *)&(src_array[j]);
2119 		rp = rp->mrec_next;
2120 	}
2121 
2122 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2123 	igmp3ra->igmp3ra_numrec = htons(numrec);
2124 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2125 
2126 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2127 	rtralert[1] = RTRALERT_LEN;
2128 	rtralert[2] = 0;
2129 	rtralert[3] = 0;
2130 
2131 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2132 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2133 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2134 	ipha->ipha_length = htons(size);
2135 	ipha->ipha_ttl = IGMP_TTL;
2136 	ipha->ipha_protocol = IPPROTO_IGMP;
2137 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2138 	ipha->ipha_src = ipif->ipif_src_addr;
2139 
2140 	/*
2141 	 * Request loopback of the report if we are acting as a multicast
2142 	 * router, so that the process-level routing daemon can hear it.
2143 	 *
2144 	 * This will run multiple times for the same group if there are
2145 	 * members on the same group for multiple ipifs on the same ill.
2146 	 * The igmp_input code will suppress this due to the loopback;
2147 	 * thus we always loopback membership report.
2148 	 */
2149 	ASSERT(ill->ill_rq != NULL);
2150 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2151 
2152 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
2153 
2154 	++ipst->ips_igmpstat.igps_snd_reports;
2155 
2156 	if (morepkts) {
2157 		if (more_src_cnt > 0) {
2158 			int index, mvsize;
2159 			slist_t *sl = &next_reclist->mrec_srcs;
2160 			index = sl->sl_numsrc;
2161 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2162 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2163 			    mvsize);
2164 			sl->sl_numsrc = more_src_cnt;
2165 		}
2166 		goto nextpkt;
2167 	}
2168 
2169 free_reclist:
2170 	while (reclist != NULL) {
2171 		rp = reclist->mrec_next;
2172 		mi_free(reclist);
2173 		reclist = rp;
2174 	}
2175 }
2176 
2177 /*
2178  * mld_input:
2179  */
2180 /* ARGSUSED */
2181 void
2182 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2183 {
2184 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2185 	mld_hdr_t	*mldh;
2186 	ilm_t		*ilm;
2187 	ipif_t		*ipif;
2188 	uint16_t	hdr_length, exthdr_length;
2189 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2190 	uint_t		next;
2191 	int		mldlen;
2192 	ip_stack_t	*ipst = ill->ill_ipst;
2193 
2194 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2195 
2196 	/* Make sure the src address of the packet is link-local */
2197 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2198 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2199 		freemsg(mp);
2200 		return;
2201 	}
2202 
2203 	if (ip6h->ip6_hlim != 1) {
2204 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2205 		freemsg(mp);
2206 		return;
2207 	}
2208 
2209 	/* Get to the icmp header part */
2210 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2211 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2212 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2213 	} else {
2214 		hdr_length = IPV6_HDR_LEN;
2215 		exthdr_length = 0;
2216 	}
2217 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2218 
2219 	/* An MLD packet must at least be 24 octets to be valid */
2220 	if (mldlen < MLD_MINLEN) {
2221 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2222 		freemsg(mp);
2223 		return;
2224 	}
2225 
2226 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2227 
2228 	switch (mldh->mld_type) {
2229 	case MLD_LISTENER_QUERY:
2230 		/*
2231 		 * packet length differentiates between v1 and v2.  v1
2232 		 * query should be exactly 24 octets long; v2 is >= 28.
2233 		 */
2234 		if ((mldlen == MLD_MINLEN) ||
2235 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2236 			next = mld_query_in(mldh, ill);
2237 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2238 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2239 		} else {
2240 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2241 			freemsg(mp);
2242 			return;
2243 		}
2244 		if (next == 0) {
2245 			freemsg(mp);
2246 			return;
2247 		}
2248 
2249 		if (next != INFINITY)
2250 			mld_start_timers(next, ipst);
2251 		break;
2252 
2253 	case MLD_LISTENER_REPORT: {
2254 
2255 		ASSERT(ill->ill_ipif != NULL);
2256 		/*
2257 		 * For fast leave to work, we have to know that we are the
2258 		 * last person to send a report for this group.  Reports
2259 		 * generated by us are looped back since we could potentially
2260 		 * be a multicast router, so discard reports sourced by me.
2261 		 */
2262 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2263 		mutex_enter(&ill->ill_lock);
2264 		for (ipif = ill->ill_ipif; ipif != NULL;
2265 		    ipif = ipif->ipif_next) {
2266 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2267 			    lcladdr_ptr)) {
2268 				if (ip_debug > 1) {
2269 					char    buf1[INET6_ADDRSTRLEN];
2270 					char	buf2[INET6_ADDRSTRLEN];
2271 
2272 					(void) mi_strlog(ill->ill_rq,
2273 					    1,
2274 					    SL_TRACE,
2275 					    "mld_input: we are only "
2276 					    "member src %s ipif_local %s",
2277 					    inet_ntop(AF_INET6, lcladdr_ptr,
2278 					    buf1, sizeof (buf1)),
2279 					    inet_ntop(AF_INET6,
2280 					    &ipif->ipif_v6lcl_addr,
2281 					    buf2, sizeof (buf2)));
2282 				}
2283 				mutex_exit(&ill->ill_lock);
2284 				freemsg(mp);
2285 				return;
2286 			}
2287 		}
2288 		mutex_exit(&ill->ill_lock);
2289 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2290 
2291 		v6group_ptr = &mldh->mld_addr;
2292 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2293 			BUMP_MIB(ill->ill_icmp6_mib,
2294 			    ipv6IfIcmpInGroupMembBadReports);
2295 			freemsg(mp);
2296 			return;
2297 		}
2298 
2299 
2300 		/*
2301 		 * If we belong to the group being reported, and we are a
2302 		 * 'Delaying member' per the RFC terminology, stop our timer
2303 		 * for that group and 'clear flag' i.e. mark ilm_state as
2304 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2305 		 * membership entries for the same group address (one per zone)
2306 		 * so we need to walk the ill_ilm list.
2307 		 */
2308 		mutex_enter(&ill->ill_lock);
2309 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2310 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2311 				continue;
2312 			BUMP_MIB(ill->ill_icmp6_mib,
2313 			    ipv6IfIcmpInGroupMembOurReports);
2314 
2315 			ilm->ilm_timer = INFINITY;
2316 			ilm->ilm_state = IGMP_OTHERMEMBER;
2317 		}
2318 		mutex_exit(&ill->ill_lock);
2319 		break;
2320 	}
2321 	case MLD_LISTENER_REDUCTION:
2322 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2323 		break;
2324 	}
2325 	/*
2326 	 * All MLD packets have already been passed up to any
2327 	 * process(es) listening on a ICMP6 raw socket. This
2328 	 * has been accomplished in ip_deliver_local_v6 prior to
2329 	 * this function call. It is assumed that the multicast daemon
2330 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2331 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2332 	 * Thus we can free the MLD message block here
2333 	 */
2334 	freemsg(mp);
2335 }
2336 
2337 /*
2338  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2339  * (non-zero, unsigned) timer value to be set on success.
2340  */
2341 static uint_t
2342 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2343 {
2344 	ilm_t	*ilm;
2345 	int	timer;
2346 	uint_t	next, current;
2347 	in6_addr_t *v6group;
2348 
2349 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2350 
2351 	/*
2352 	 * In the MLD specification, there are 3 states and a flag.
2353 	 *
2354 	 * In Non-Listener state, we simply don't have a membership record.
2355 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2356 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2357 	 * INFINITY)
2358 	 *
2359 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2360 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2361 	 * if I sent the last report.
2362 	 */
2363 	v6group = &mldh->mld_addr;
2364 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2365 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2366 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2367 		return (0);
2368 	}
2369 
2370 	/* Need to do compatibility mode checking */
2371 	mutex_enter(&ill->ill_lock);
2372 	ill->ill_mcast_v1_time = 0;
2373 	ill->ill_mcast_v1_tset = 1;
2374 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2375 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2376 		    "MLD_V1_ROUTER\n", ill->ill_name));
2377 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2378 		ill->ill_mcast_type = MLD_V1_ROUTER;
2379 	}
2380 	mutex_exit(&ill->ill_lock);
2381 
2382 	timer = (int)ntohs(mldh->mld_maxdelay);
2383 	if (ip_debug > 1) {
2384 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2385 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2386 		    timer, (int)mldh->mld_type);
2387 	}
2388 
2389 	/*
2390 	 * -Start the timers in all of our membership records for
2391 	 * the physical interface on which the query arrived,
2392 	 * excl:
2393 	 *	1.  those that belong to the "all hosts" group,
2394 	 *	2.  those with 0 scope, or 1 node-local scope.
2395 	 *
2396 	 * -Restart any timer that is already running but has a value
2397 	 * longer that the requested timeout.
2398 	 * -Use the value specified in the query message as the
2399 	 * maximum timeout.
2400 	 */
2401 	next = INFINITY;
2402 	mutex_enter(&ill->ill_lock);
2403 
2404 	current = CURRENT_MSTIME;
2405 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2406 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2407 
2408 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2409 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2410 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2411 			continue;
2412 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2413 		    &ipv6_all_hosts_mcast)) &&
2414 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2415 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2416 			if (timer == 0) {
2417 				/* Respond immediately */
2418 				ilm->ilm_timer = INFINITY;
2419 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2420 				mutex_exit(&ill->ill_lock);
2421 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2422 				mutex_enter(&ill->ill_lock);
2423 				break;
2424 			}
2425 			if (ilm->ilm_timer > timer) {
2426 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2427 				if (ilm->ilm_timer < next)
2428 					next = ilm->ilm_timer;
2429 				ilm->ilm_timer += current;
2430 			}
2431 			break;
2432 		}
2433 	}
2434 	mutex_exit(&ill->ill_lock);
2435 
2436 	return (next);
2437 }
2438 
2439 /*
2440  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2441  * returns the appropriate (non-zero, unsigned) timer value (which may
2442  * be INFINITY) to be set.
2443  */
2444 static uint_t
2445 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2446 {
2447 	ilm_t	*ilm;
2448 	in6_addr_t *v6group, *src_array;
2449 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2450 	uint8_t	qrv;
2451 
2452 	v6group = &mld2q->mld2q_addr;
2453 	numsrc = ntohs(mld2q->mld2q_numsrc);
2454 
2455 	/* make sure numsrc matches packet size */
2456 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2457 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2458 		return (0);
2459 	}
2460 	src_array = (in6_addr_t *)&mld2q[1];
2461 
2462 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2463 
2464 	/* extract Maximum Response Delay from code in header */
2465 	mrd = ntohs(mld2q->mld2q_mxrc);
2466 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2467 		uint_t hdrval, mant, exp;
2468 		hdrval = mrd;
2469 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2470 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2471 		mrd = (mant | 0x1000) << (exp + 3);
2472 	}
2473 	if (mrd == 0)
2474 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2475 
2476 	MCAST_RANDOM_DELAY(delay, mrd);
2477 	next = (unsigned)INFINITY;
2478 	current = CURRENT_MSTIME;
2479 
2480 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2481 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2482 	else
2483 		ill->ill_mcast_rv = qrv;
2484 
2485 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2486 		uint_t mant, exp;
2487 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2488 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2489 		qqi = (mant | 0x10) << (exp + 3);
2490 	}
2491 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2492 
2493 	/*
2494 	 * If we have a pending general query response that's scheduled
2495 	 * sooner than the delay we calculated for this response, then
2496 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2497 	 */
2498 	mutex_enter(&ill->ill_lock);
2499 	if (ill->ill_global_timer < (current + delay)) {
2500 		mutex_exit(&ill->ill_lock);
2501 		return (next);
2502 	}
2503 	mutex_exit(&ill->ill_lock);
2504 
2505 	/*
2506 	 * Now take action depending on query type: general,
2507 	 * group specific, or group/source specific.
2508 	 */
2509 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2510 		/*
2511 		 * general query
2512 		 * We know global timer is either not running or is
2513 		 * greater than our calculated delay, so reset it to
2514 		 * our delay (random value in range [0, response time])
2515 		 */
2516 		mutex_enter(&ill->ill_lock);
2517 		ill->ill_global_timer = current + delay;
2518 		mutex_exit(&ill->ill_lock);
2519 		next = delay;
2520 
2521 	} else {
2522 		/* group or group/source specific query */
2523 		mutex_enter(&ill->ill_lock);
2524 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2525 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2526 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2527 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2528 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2529 				continue;
2530 
2531 			/*
2532 			 * If the query is group specific or we have a
2533 			 * pending group specific query, the response is
2534 			 * group specific (pending sources list should be
2535 			 * empty).  Otherwise, need to update the pending
2536 			 * sources list for the group and source specific
2537 			 * response.
2538 			 */
2539 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2540 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2541 group_query:
2542 				FREE_SLIST(ilm->ilm_pendsrcs);
2543 				ilm->ilm_pendsrcs = NULL;
2544 			} else {
2545 				boolean_t overflow;
2546 				slist_t *pktl;
2547 				if (numsrc > MAX_FILTER_SIZE ||
2548 				    (ilm->ilm_pendsrcs == NULL &&
2549 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2550 					/*
2551 					 * We've been sent more sources than
2552 					 * we can deal with; or we can't deal
2553 					 * with a source list at all. Revert
2554 					 * to a group specific query.
2555 					 */
2556 					goto group_query;
2557 				}
2558 				if ((pktl = l_alloc()) == NULL)
2559 					goto group_query;
2560 				pktl->sl_numsrc = numsrc;
2561 				for (i = 0; i < numsrc; i++)
2562 					pktl->sl_addr[i] = src_array[i];
2563 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2564 				    &overflow);
2565 				l_free(pktl);
2566 				if (overflow)
2567 					goto group_query;
2568 			}
2569 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2570 			    INFINITY : (ilm->ilm_timer - current);
2571 			/* set timer to soonest value */
2572 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2573 			if (ilm->ilm_timer < next)
2574 				next = ilm->ilm_timer;
2575 			ilm->ilm_timer += current;
2576 			break;
2577 		}
2578 		mutex_exit(&ill->ill_lock);
2579 	}
2580 
2581 	return (next);
2582 }
2583 
2584 /*
2585  * Send MLDv1 response packet with hoplimit 1
2586  */
2587 static void
2588 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2589 {
2590 	mblk_t		*mp;
2591 	mld_hdr_t	*mldh;
2592 	ip6_t 		*ip6h;
2593 	ip6_hbh_t	*ip6hbh;
2594 	struct ip6_opt_router	*ip6router;
2595 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2596 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2597 	ipif_t		*ipif;
2598 	ip6i_t		*ip6i;
2599 
2600 	/*
2601 	 * We need to place a router alert option in this packet.  The length
2602 	 * of the options must be a multiple of 8.  The hbh option header is 2
2603 	 * bytes followed by the 4 byte router alert option.  That leaves
2604 	 * 2 bytes of pad for a total of 8 bytes.
2605 	 */
2606 	const int	router_alert_length = 8;
2607 
2608 	ASSERT(ill->ill_isv6);
2609 
2610 	/*
2611 	 * We need to make sure that this packet does not get load balanced.
2612 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2613 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2614 	 * If it gets load balanced, switches supporting MLD snooping
2615 	 * (in the future) will send the packet that it receives for this
2616 	 * multicast group to the interface that we are sending on. As we have
2617 	 * joined the multicast group on this ill, by sending the packet out
2618 	 * on this ill, we receive all the packets back on this ill.
2619 	 */
2620 	size += sizeof (ip6i_t) + router_alert_length;
2621 	mp = allocb(size, BPRI_HI);
2622 	if (mp == NULL)
2623 		return;
2624 	bzero(mp->b_rptr, size);
2625 	mp->b_wptr = mp->b_rptr + size;
2626 
2627 	ip6i = (ip6i_t *)mp->b_rptr;
2628 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2629 	ip6i->ip6i_nxt = IPPROTO_RAW;
2630 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2631 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2632 
2633 	ip6h = (ip6_t *)&ip6i[1];
2634 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2635 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2636 	/*
2637 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2638 	 * above will pad between ip6router and mld.
2639 	 */
2640 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2641 
2642 	mldh->mld_type = type;
2643 	mldh->mld_addr = ilm->ilm_v6addr;
2644 
2645 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2646 	ip6router->ip6or_len = 2;
2647 	ip6router->ip6or_value[0] = 0;
2648 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2649 
2650 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2651 	ip6hbh->ip6h_len = 0;
2652 
2653 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2654 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2655 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2656 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2657 	if (v6addr == NULL)
2658 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2659 	else
2660 		ip6h->ip6_dst = *v6addr;
2661 
2662 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2663 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2664 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2665 		ipif_refrele(ipif);
2666 	} else {
2667 		/* Otherwise, use IPv6 default address selection. */
2668 		ip6h->ip6_src = ipv6_all_zeros;
2669 	}
2670 
2671 	/*
2672 	 * Prepare for checksum by putting icmp length in the icmp
2673 	 * checksum field. The checksum is calculated in ip_wput_v6.
2674 	 */
2675 	mldh->mld_cksum = htons(sizeof (*mldh));
2676 
2677 	/*
2678 	 * ip_wput will automatically loopback the multicast packet to
2679 	 * the conn if multicast loopback is enabled.
2680 	 * The MIB stats corresponding to this outgoing MLD packet
2681 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2682 	 * ->icmp_update_out_mib_v6 function call.
2683 	 */
2684 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2685 }
2686 
2687 /*
2688  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2689  * report will contain one multicast address record for each element of
2690  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2691  * multiple reports are sent.  reclist is assumed to be made up of
2692  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2693  */
2694 static void
2695 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2696 {
2697 	mblk_t		*mp;
2698 	mld2r_t		*mld2r;
2699 	mld2mar_t	*mld2mar;
2700 	in6_addr_t	*srcarray;
2701 	ip6_t		*ip6h;
2702 	ip6_hbh_t	*ip6hbh;
2703 	ip6i_t		*ip6i;
2704 	struct ip6_opt_router	*ip6router;
2705 	size_t		size, optlen, padlen, icmpsize, rsize;
2706 	ipif_t		*ipif;
2707 	int		i, numrec, more_src_cnt;
2708 	mrec_t		*rp, *cur_reclist;
2709 	mrec_t		*next_reclist = reclist;
2710 	boolean_t	morepkts;
2711 
2712 	/* If there aren't any records, there's nothing to send */
2713 	if (reclist == NULL)
2714 		return;
2715 
2716 	ASSERT(ill->ill_isv6);
2717 
2718 	/*
2719 	 * Total option length (optlen + padlen) must be a multiple of
2720 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2721 	 * length will be 8.  Assert this in case anything ever changes.
2722 	 */
2723 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2724 	ASSERT(optlen <= 8);
2725 	padlen = 8 - optlen;
2726 nextpkt:
2727 	icmpsize = sizeof (mld2r_t);
2728 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2729 	morepkts = B_FALSE;
2730 	more_src_cnt = 0;
2731 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2732 	    rp = rp->mrec_next, numrec++) {
2733 		rsize = sizeof (mld2mar_t) +
2734 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2735 		if (size + rsize > ill->ill_max_frag) {
2736 			if (rp == cur_reclist) {
2737 				/*
2738 				 * If the first mrec we looked at is too big
2739 				 * to fit in a single packet (i.e the source
2740 				 * list is too big), we must either truncate
2741 				 * the list (if TO_EX or IS_EX), or send
2742 				 * multiple reports for the same group (all
2743 				 * other types).
2744 				 */
2745 				int srcspace, srcsperpkt;
2746 				srcspace = ill->ill_max_frag -
2747 				    (size + sizeof (mld2mar_t));
2748 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2749 				/*
2750 				 * Increment icmpsize and size, because we will
2751 				 * be sending a record for the mrec we're
2752 				 * looking at now.
2753 				 */
2754 				rsize = sizeof (mld2mar_t) +
2755 				    (srcsperpkt * sizeof (in6_addr_t));
2756 				icmpsize += rsize;
2757 				size += rsize;
2758 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2759 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2760 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2761 					if (rp->mrec_next == NULL) {
2762 						/* no more packets to send */
2763 						break;
2764 					} else {
2765 						/*
2766 						 * more packets, but we're
2767 						 * done with this mrec.
2768 						 */
2769 						next_reclist = rp->mrec_next;
2770 					}
2771 				} else {
2772 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2773 					    - srcsperpkt;
2774 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2775 					/*
2776 					 * We'll fix up this mrec (remove the
2777 					 * srcs we've already sent) before
2778 					 * returning to nextpkt above.
2779 					 */
2780 					next_reclist = rp;
2781 				}
2782 			} else {
2783 				next_reclist = rp;
2784 			}
2785 			morepkts = B_TRUE;
2786 			break;
2787 		}
2788 		icmpsize += rsize;
2789 		size += rsize;
2790 	}
2791 
2792 	/*
2793 	 * We need to make sure that this packet does not get load balanced.
2794 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2795 	 * ip_newroute_ipif_v6 know how to handle such packets.
2796 	 * If it gets load balanced, switches supporting MLD snooping
2797 	 * (in the future) will send the packet that it receives for this
2798 	 * multicast group to the interface that we are sending on. As we have
2799 	 * joined the multicast group on this ill, by sending the packet out
2800 	 * on this ill, we receive all the packets back on this ill.
2801 	 */
2802 	size += sizeof (ip6i_t);
2803 	mp = allocb(size, BPRI_HI);
2804 	if (mp == NULL)
2805 		goto free_reclist;
2806 	bzero(mp->b_rptr, size);
2807 	mp->b_wptr = mp->b_rptr + size;
2808 
2809 	ip6i = (ip6i_t *)mp->b_rptr;
2810 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2811 	ip6i->ip6i_nxt = IPPROTO_RAW;
2812 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2813 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2814 
2815 	ip6h = (ip6_t *)&(ip6i[1]);
2816 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2817 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2818 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2819 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2820 
2821 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2822 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2823 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2824 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2825 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2826 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2827 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2828 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2829 		ipif_refrele(ipif);
2830 	} else {
2831 		/* otherwise, use IPv6 default address selection. */
2832 		ip6h->ip6_src = ipv6_all_zeros;
2833 	}
2834 
2835 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2836 	/*
2837 	 * ip6h_len is the number of 8-byte words, not including the first
2838 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2839 	 */
2840 	ip6hbh->ip6h_len = 0;
2841 
2842 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2843 	ip6router->ip6or_len = 2;
2844 	ip6router->ip6or_value[0] = 0;
2845 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2846 
2847 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2848 	mld2r->mld2r_nummar = htons(numrec);
2849 	/*
2850 	 * Prepare for the checksum by putting icmp length in the icmp
2851 	 * checksum field. The checksum is calculated in ip_wput_v6.
2852 	 */
2853 	mld2r->mld2r_cksum = htons(icmpsize);
2854 
2855 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2856 		mld2mar->mld2mar_type = rp->mrec_type;
2857 		mld2mar->mld2mar_auxlen = 0;
2858 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2859 		mld2mar->mld2mar_group = rp->mrec_group;
2860 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2861 
2862 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2863 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2864 
2865 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2866 	}
2867 
2868 	/*
2869 	 * ip_wput will automatically loopback the multicast packet to
2870 	 * the conn if multicast loopback is enabled.
2871 	 * The MIB stats corresponding to this outgoing MLD packet
2872 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2873 	 * ->icmp_update_out_mib_v6 function call.
2874 	 */
2875 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2876 
2877 	if (morepkts) {
2878 		if (more_src_cnt > 0) {
2879 			int index, mvsize;
2880 			slist_t *sl = &next_reclist->mrec_srcs;
2881 			index = sl->sl_numsrc;
2882 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2883 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2884 			    mvsize);
2885 			sl->sl_numsrc = more_src_cnt;
2886 		}
2887 		goto nextpkt;
2888 	}
2889 
2890 free_reclist:
2891 	while (reclist != NULL) {
2892 		rp = reclist->mrec_next;
2893 		mi_free(reclist);
2894 		reclist = rp;
2895 	}
2896 }
2897 
2898 static mrec_t *
2899 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2900     mrec_t *next)
2901 {
2902 	mrec_t *rp;
2903 	int i;
2904 
2905 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2906 	    SLIST_IS_EMPTY(srclist))
2907 		return (next);
2908 
2909 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2910 	if (rp == NULL)
2911 		return (next);
2912 
2913 	rp->mrec_next = next;
2914 	rp->mrec_type = type;
2915 	rp->mrec_auxlen = 0;
2916 	rp->mrec_group = *grp;
2917 	if (srclist == NULL) {
2918 		rp->mrec_srcs.sl_numsrc = 0;
2919 	} else {
2920 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2921 		for (i = 0; i < srclist->sl_numsrc; i++)
2922 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2923 	}
2924 
2925 	return (rp);
2926 }
2927 
2928 /*
2929  * Set up initial retransmit state.  If memory cannot be allocated for
2930  * the source lists, simply create as much state as is possible; memory
2931  * allocation failures are considered one type of transient error that
2932  * the retransmissions are designed to overcome (and if they aren't
2933  * transient, there are bigger problems than failing to notify the
2934  * router about multicast group membership state changes).
2935  */
2936 static void
2937 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2938     slist_t *flist)
2939 {
2940 	/*
2941 	 * There are only three possibilities for rtype:
2942 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2943 	 *	  => rtype is ALLOW_NEW_SOURCES
2944 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2945 	 *	  => rtype is CHANGE_TO_EXCLUDE
2946 	 *	State change that involves a filter mode change
2947 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2948 	 */
2949 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2950 	    rtype == ALLOW_NEW_SOURCES);
2951 
2952 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2953 
2954 	switch (rtype) {
2955 	case CHANGE_TO_EXCLUDE:
2956 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2957 		CLEAR_SLIST(rtxp->rtx_allow);
2958 		COPY_SLIST(flist, rtxp->rtx_block);
2959 		break;
2960 	case ALLOW_NEW_SOURCES:
2961 	case CHANGE_TO_INCLUDE:
2962 		rtxp->rtx_fmode_cnt =
2963 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2964 		CLEAR_SLIST(rtxp->rtx_block);
2965 		COPY_SLIST(flist, rtxp->rtx_allow);
2966 		break;
2967 	}
2968 }
2969 
2970 /*
2971  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2972  * RFC 3376 section 5.1, covers three cases:
2973  *	* The current state change is a filter mode change
2974  *		Set filter mode retransmit counter; set retransmit allow or
2975  *		block list to new source list as appropriate, and clear the
2976  *		retransmit list that was not set; send TO_IN or TO_EX with
2977  *		new source list.
2978  *	* The current state change is a source list change, but the filter
2979  *	  mode retransmit counter is > 0
2980  *		Decrement filter mode retransmit counter; set retransmit
2981  *		allow or block list to  new source list as appropriate,
2982  *		and clear the retransmit list that was not set; send TO_IN
2983  *		or TO_EX with new source list.
2984  *	* The current state change is a source list change, and the filter
2985  *	  mode retransmit counter is 0.
2986  *		Merge existing rtx allow and block lists with new state:
2987  *		  rtx_allow = (new allow + rtx_allow) - new block
2988  *		  rtx_block = (new block + rtx_block) - new allow
2989  *		Send ALLOW and BLOCK records for new retransmit lists;
2990  *		decrement retransmit counter.
2991  *
2992  * As is the case for mcast_init_rtx(), memory allocation failures are
2993  * acceptable; we just create as much state as we can.
2994  */
2995 static mrec_t *
2996 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2997 {
2998 	ill_t *ill;
2999 	rtx_state_t *rtxp = &ilm->ilm_rtx;
3000 	mcast_record_t txtype;
3001 	mrec_t *rp, *rpnext, *rtnmrec;
3002 	boolean_t ovf;
3003 
3004 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
3005 
3006 	if (mreclist == NULL)
3007 		return (mreclist);
3008 
3009 	/*
3010 	 * A filter mode change is indicated by a single mrec, which is
3011 	 * either TO_IN or TO_EX.  In this case, we just need to set new
3012 	 * retransmit state as if this were an initial join.  There is
3013 	 * no change to the mrec list.
3014 	 */
3015 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
3016 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
3017 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
3018 		    &mreclist->mrec_srcs);
3019 		return (mreclist);
3020 	}
3021 
3022 	/*
3023 	 * Only the source list has changed
3024 	 */
3025 	rtxp->rtx_cnt = ill->ill_mcast_rv;
3026 	if (rtxp->rtx_fmode_cnt > 0) {
3027 		/* but we're still sending filter mode change reports */
3028 		rtxp->rtx_fmode_cnt--;
3029 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3030 			CLEAR_SLIST(rtxp->rtx_block);
3031 			COPY_SLIST(flist, rtxp->rtx_allow);
3032 			txtype = CHANGE_TO_INCLUDE;
3033 		} else {
3034 			CLEAR_SLIST(rtxp->rtx_allow);
3035 			COPY_SLIST(flist, rtxp->rtx_block);
3036 			txtype = CHANGE_TO_EXCLUDE;
3037 		}
3038 		/* overwrite first mrec with new info */
3039 		mreclist->mrec_type = txtype;
3040 		l_copy(flist, &mreclist->mrec_srcs);
3041 		/* then free any remaining mrecs */
3042 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3043 			rpnext = rp->mrec_next;
3044 			mi_free(rp);
3045 		}
3046 		mreclist->mrec_next = NULL;
3047 		rtnmrec = mreclist;
3048 	} else {
3049 		mrec_t *allow_mrec, *block_mrec;
3050 		/*
3051 		 * Just send the source change reports; but we need to
3052 		 * recalculate the ALLOW and BLOCK lists based on previous
3053 		 * state and new changes.
3054 		 */
3055 		rtnmrec = mreclist;
3056 		allow_mrec = block_mrec = NULL;
3057 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3058 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3059 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3060 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3061 				allow_mrec = rp;
3062 			else
3063 				block_mrec = rp;
3064 		}
3065 		/*
3066 		 * Perform calculations:
3067 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3068 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3069 		 *
3070 		 * Each calc requires two steps, for example:
3071 		 *   rtx_allow = rtx_allow - mrec_block;
3072 		 *   new_allow = mrec_allow + rtx_allow;
3073 		 *
3074 		 * Store results in mrec lists, and then copy into rtx lists.
3075 		 * We do it in this order in case the rtx list hasn't been
3076 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3077 		 * Overflows are also okay.
3078 		 */
3079 		if (block_mrec != NULL) {
3080 			l_difference_in_a(rtxp->rtx_allow,
3081 			    &block_mrec->mrec_srcs);
3082 		}
3083 		if (allow_mrec != NULL) {
3084 			l_difference_in_a(rtxp->rtx_block,
3085 			    &allow_mrec->mrec_srcs);
3086 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3087 			    &ovf);
3088 		}
3089 		if (block_mrec != NULL) {
3090 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3091 			    &ovf);
3092 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3093 		} else {
3094 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3095 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3096 		}
3097 		if (allow_mrec != NULL) {
3098 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3099 		} else {
3100 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3101 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3102 		}
3103 	}
3104 
3105 	return (rtnmrec);
3106 }
3107