xref: /illumos-gate/usr/src/uts/common/inet/ip/igmp.c (revision 8c4267180173328ebba9487634f0f232387d067f)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Internet Group Management Protocol (IGMP) routines.
28  * Multicast Listener Discovery Protocol (MLD) routines.
29  *
30  * Written by Steve Deering, Stanford, May 1988.
31  * Modified by Rosen Sharma, Stanford, Aug 1994.
32  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
33  *
34  * MULTICAST 3.5.1.1
35  */
36 
37 #include <sys/types.h>
38 #include <sys/stream.h>
39 #include <sys/stropts.h>
40 #include <sys/strlog.h>
41 #include <sys/strsun.h>
42 #include <sys/systm.h>
43 #include <sys/ddi.h>
44 #include <sys/sunddi.h>
45 #include <sys/cmn_err.h>
46 #include <sys/atomic.h>
47 #include <sys/zone.h>
48 #include <sys/callb.h>
49 #include <sys/param.h>
50 #include <sys/socket.h>
51 #include <inet/ipclassifier.h>
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/igmp_var.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <inet/ipsec_impl.h>
59 
60 #include <inet/common.h>
61 #include <inet/mi.h>
62 #include <inet/nd.h>
63 #include <inet/tunables.h>
64 #include <inet/ip.h>
65 #include <inet/ip6.h>
66 #include <inet/ip_multi.h>
67 #include <inet/ip_listutils.h>
68 
69 #include <netinet/igmp.h>
70 #include <inet/ip_ndp.h>
71 #include <inet/ip_if.h>
72 
73 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
74 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
75 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
76 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
77 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
78 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
79 static void	igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
80 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
81 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
82 		    slist_t *srclist, mrec_t *next);
83 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
84 		    mcast_record_t rtype, slist_t *flist);
85 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
86 
87 /*
88  * Macros used to do timer len conversions.  Timer values are always
89  * stored and passed to the timer functions as milliseconds; but the
90  * default values and values from the wire may not be.
91  *
92  * And yes, it's obscure, but decisecond is easier to abbreviate than
93  * "tenths of a second".
94  */
/* Deciseconds (tenths of a second) to milliseconds. */
#define	DSEC_TO_MSEC(dsec)	(100 * (dsec))
/* Whole seconds to milliseconds. */
#define	SEC_TO_MSEC(sec)	(1000 * (sec))
97 
98 /*
99  * A running timer (scheduled thru timeout) can be cancelled if another
100  * timer with a shorter timeout value is scheduled before it has timed
101  * out.  When the shorter timer expires, the original timer is updated
102  * to account for the time elapsed while the shorter timer ran; but this
103  * does not take into account the amount of time already spent in timeout
104  * state before being preempted by the shorter timer, that is the time
105  * interval between time scheduled to time cancelled.  This can cause
106  * delays in sending out multicast membership reports.  To resolve this
107  * problem, wallclock time (absolute time) is used instead of deltas
108  * (relative time) to track timers.
109  *
110  * The MACRO below gets the lbolt value, used for proper timer scheduling
111  * and firing. Therefore multicast membership reports are sent on time.
112  * The timer does not exactly fire at the time it was scheduled to fire,
113  * there is a difference of a few milliseconds observed. An offset is used
114  * to take care of the difference.
115  */
116 
/* Current lbolt value converted from ticks to milliseconds, as a uint_t. */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
/* Offset that absorbs the small difference between scheduled and actual firing. */
#define	CURRENT_OFFSET	999
119 
120 /*
121  * The first multicast join will trigger the igmp timers / mld timers
122  * The unit for next is milliseconds.
123  */
124 void
125 igmp_start_timers(unsigned next, ip_stack_t *ipst)
126 {
127 	int	time_left;
128 	int	ret;
129 	timeout_id_t id;
130 
131 	ASSERT(next != 0 && next != INFINITY);
132 
133 	mutex_enter(&ipst->ips_igmp_timer_lock);
134 
135 	if (ipst->ips_igmp_timer_setter_active) {
136 		/*
137 		 * Serialize timer setters, one at a time. If the
138 		 * timer is currently being set by someone,
139 		 * just record the next time when it has to be
140 		 * invoked and return. The current setter will
141 		 * take care.
142 		 */
143 		ipst->ips_igmp_time_to_next =
144 		    MIN(ipst->ips_igmp_time_to_next, next);
145 		mutex_exit(&ipst->ips_igmp_timer_lock);
146 		return;
147 	} else {
148 		ipst->ips_igmp_timer_setter_active = B_TRUE;
149 	}
150 	if (ipst->ips_igmp_timeout_id == 0) {
151 		/*
152 		 * The timer is inactive. We need to start a timer if we haven't
153 		 * been asked to quiesce.
154 		 */
155 		ipst->ips_igmp_time_to_next = next;
156 		if (ipst->ips_igmp_timer_quiesce != B_TRUE) {
157 			ipst->ips_igmp_timeout_id =
158 			    timeout(igmp_timeout_handler, (void *)ipst,
159 			    MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
160 			ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
161 		}
162 		ipst->ips_igmp_timer_setter_active = B_FALSE;
163 		mutex_exit(&ipst->ips_igmp_timer_lock);
164 		return;
165 	}
166 
167 	/*
168 	 * The timer was scheduled sometime back for firing in
169 	 * 'igmp_time_to_next' ms and is active. We need to
170 	 * reschedule the timeout if the new 'next' will happen
171 	 * earlier than the currently scheduled timeout
172 	 */
173 	time_left = ipst->ips_igmp_timer_scheduled_last +
174 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
175 	if (time_left < MSEC_TO_TICK(next)) {
176 		ipst->ips_igmp_timer_setter_active = B_FALSE;
177 		mutex_exit(&ipst->ips_igmp_timer_lock);
178 		return;
179 	}
180 	id = ipst->ips_igmp_timeout_id;
181 
182 	mutex_exit(&ipst->ips_igmp_timer_lock);
183 	ret = untimeout(id);
184 	mutex_enter(&ipst->ips_igmp_timer_lock);
185 	/*
186 	 * The timeout was cancelled, or the timeout handler
187 	 * completed, while we were blocked in the untimeout.
188 	 * No other thread could have set the timer meanwhile
189 	 * since we serialized all the timer setters. Thus
190 	 * no timer is currently active nor executing nor will
191 	 * any timer fire in the future. We start the timer now
192 	 * if needed.
193 	 */
194 	if (ret == -1) {
195 		ASSERT(ipst->ips_igmp_timeout_id == 0);
196 	} else {
197 		ASSERT(ipst->ips_igmp_timeout_id != 0);
198 		ipst->ips_igmp_timeout_id = 0;
199 	}
200 	if (ipst->ips_igmp_time_to_next != 0 &&
201 	    ipst->ips_igmp_timer_quiesce != B_TRUE) {
202 		ipst->ips_igmp_time_to_next =
203 		    MIN(ipst->ips_igmp_time_to_next, next);
204 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
205 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
206 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
207 	}
208 	ipst->ips_igmp_timer_setter_active = B_FALSE;
209 	mutex_exit(&ipst->ips_igmp_timer_lock);
210 }
211 
212 /*
213  * mld_start_timers:
214  * The unit for next is milliseconds.
215  */
216 void
217 mld_start_timers(unsigned next, ip_stack_t *ipst)
218 {
219 	int	time_left;
220 	int	ret;
221 	timeout_id_t id;
222 
223 	ASSERT(next != 0 && next != INFINITY);
224 
225 	mutex_enter(&ipst->ips_mld_timer_lock);
226 	if (ipst->ips_mld_timer_setter_active) {
227 		/*
228 		 * Serialize timer setters, one at a time. If the
229 		 * timer is currently being set by someone,
230 		 * just record the next time when it has to be
231 		 * invoked and return. The current setter will
232 		 * take care.
233 		 */
234 		ipst->ips_mld_time_to_next =
235 		    MIN(ipst->ips_mld_time_to_next, next);
236 		mutex_exit(&ipst->ips_mld_timer_lock);
237 		return;
238 	} else {
239 		ipst->ips_mld_timer_setter_active = B_TRUE;
240 	}
241 	if (ipst->ips_mld_timeout_id == 0) {
242 		/*
243 		 * The timer is inactive. We need to start a timer, if we
244 		 * haven't been asked to quiesce.
245 		 */
246 		ipst->ips_mld_time_to_next = next;
247 		if (ipst->ips_mld_timer_quiesce != B_TRUE) {
248 			ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
249 			    (void *)ipst,
250 			    MSEC_TO_TICK(ipst->ips_mld_time_to_next));
251 			ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
252 		}
253 		ipst->ips_mld_timer_setter_active = B_FALSE;
254 		mutex_exit(&ipst->ips_mld_timer_lock);
255 		return;
256 	}
257 
258 	/*
259 	 * The timer was scheduled sometime back for firing in
260 	 * 'igmp_time_to_next' ms and is active. We need to
261 	 * reschedule the timeout if the new 'next' will happen
262 	 * earlier than the currently scheduled timeout
263 	 */
264 	time_left = ipst->ips_mld_timer_scheduled_last +
265 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
266 	if (time_left < MSEC_TO_TICK(next)) {
267 		ipst->ips_mld_timer_setter_active = B_FALSE;
268 		mutex_exit(&ipst->ips_mld_timer_lock);
269 		return;
270 	}
271 	id = ipst->ips_mld_timeout_id;
272 
273 	mutex_exit(&ipst->ips_mld_timer_lock);
274 	ret = untimeout(id);
275 	mutex_enter(&ipst->ips_mld_timer_lock);
276 	/*
277 	 * The timeout was cancelled, or the timeout handler
278 	 * completed, while we were blocked in the untimeout.
279 	 * No other thread could have set the timer meanwhile
280 	 * since we serialized all the timer setters. Thus
281 	 * no timer is currently active nor executing nor will
282 	 * any timer fire in the future. We start the timer now
283 	 * if needed.
284 	 */
285 	if (ret == -1) {
286 		ASSERT(ipst->ips_mld_timeout_id == 0);
287 	} else {
288 		ASSERT(ipst->ips_mld_timeout_id != 0);
289 		ipst->ips_mld_timeout_id = 0;
290 	}
291 	if (ipst->ips_mld_time_to_next != 0 &&
292 	    ipst->ips_mld_timer_quiesce == B_FALSE) {
293 		ipst->ips_mld_time_to_next =
294 		    MIN(ipst->ips_mld_time_to_next, next);
295 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
296 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
297 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
298 	}
299 	ipst->ips_mld_timer_setter_active = B_FALSE;
300 	mutex_exit(&ipst->ips_mld_timer_lock);
301 }
302 
303 /*
304  * igmp_input:
305  * Return NULL for a bad packet that is discarded here.
306  * Return mp if the message is OK and should be handed to "raw" receivers.
307  * Callers of igmp_input() may need to reinitialize variables that were copied
308  * from the mblk as this calls pullupmsg().
309  */
310 mblk_t *
311 igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
312 {
313 	igmpa_t		*igmpa;
314 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
315 	int		iphlen, igmplen, mblklen;
316 	ilm_t		*ilm;
317 	uint32_t	src, dst;
318 	uint32_t	group;
319 	in6_addr_t	v6group;
320 	uint_t		next;
321 	ipif_t		*ipif;
322 	ill_t		*ill = ira->ira_ill;
323 	ip_stack_t	*ipst = ill->ill_ipst;
324 
325 	ASSERT(!ill->ill_isv6);
326 	++ipst->ips_igmpstat.igps_rcv_total;
327 
328 	mblklen = MBLKL(mp);
329 	iphlen = ira->ira_ip_hdr_length;
330 	if (mblklen < 1 || mblklen < iphlen) {
331 		++ipst->ips_igmpstat.igps_rcv_tooshort;
332 		goto bad_pkt;
333 	}
334 	igmplen = ira->ira_pktlen - iphlen;
335 	/*
336 	 * Since msg sizes are more variable with v3, just pullup the
337 	 * whole thing now.
338 	 */
339 	if (MBLKL(mp) < (igmplen + iphlen)) {
340 		mblk_t *mp1;
341 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
342 			++ipst->ips_igmpstat.igps_rcv_tooshort;
343 			goto bad_pkt;
344 		}
345 		freemsg(mp);
346 		mp = mp1;
347 		ipha = (ipha_t *)(mp->b_rptr);
348 	}
349 
350 	/*
351 	 * Validate lengths
352 	 */
353 	if (igmplen < IGMP_MINLEN) {
354 		++ipst->ips_igmpstat.igps_rcv_tooshort;
355 		goto bad_pkt;
356 	}
357 
358 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
359 	src = ipha->ipha_src;
360 	dst = ipha->ipha_dst;
361 	if (ip_debug > 1)
362 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
363 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
364 		    (int)ntohl(src), (int)ntohl(dst),
365 		    ill->ill_name);
366 
367 	switch (igmpa->igmpa_type) {
368 	case IGMP_MEMBERSHIP_QUERY:
369 		/*
370 		 * packet length differentiates between v1/v2 and v3
371 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
372 		 */
373 		if ((igmplen == IGMP_MINLEN) ||
374 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
375 			next = igmp_query_in(ipha, igmpa, ill);
376 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
377 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
378 			    igmplen);
379 		} else {
380 			++ipst->ips_igmpstat.igps_rcv_tooshort;
381 			goto bad_pkt;
382 		}
383 		if (next == 0)
384 			goto bad_pkt;
385 
386 		if (next != INFINITY)
387 			igmp_start_timers(next, ipst);
388 
389 		break;
390 
391 	case IGMP_V1_MEMBERSHIP_REPORT:
392 	case IGMP_V2_MEMBERSHIP_REPORT:
393 		/*
394 		 * For fast leave to work, we have to know that we are the
395 		 * last person to send a report for this group. Reports
396 		 * generated by us are looped back since we could potentially
397 		 * be a multicast router, so discard reports sourced by me.
398 		 */
399 		mutex_enter(&ill->ill_lock);
400 		for (ipif = ill->ill_ipif; ipif != NULL;
401 		    ipif = ipif->ipif_next) {
402 			if (ipif->ipif_lcl_addr == src) {
403 				if (ip_debug > 1) {
404 					(void) mi_strlog(ill->ill_rq,
405 					    1,
406 					    SL_TRACE,
407 					    "igmp_input: we are only "
408 					    "member src 0x%x\n",
409 					    (int)ntohl(src));
410 				}
411 				mutex_exit(&ill->ill_lock);
412 				return (mp);
413 			}
414 		}
415 		mutex_exit(&ill->ill_lock);
416 
417 		++ipst->ips_igmpstat.igps_rcv_reports;
418 		group = igmpa->igmpa_group;
419 		if (!CLASSD(group)) {
420 			++ipst->ips_igmpstat.igps_rcv_badreports;
421 			goto bad_pkt;
422 		}
423 
424 		/*
425 		 * KLUDGE: if the IP source address of the report has an
426 		 * unspecified (i.e., zero) subnet number, as is allowed for
427 		 * a booting host, replace it with the correct subnet number
428 		 * so that a process-level multicast routing demon can
429 		 * determine which subnet it arrived from.  This is necessary
430 		 * to compensate for the lack of any way for a process to
431 		 * determine the arrival interface of an incoming packet.
432 		 *
433 		 * Requires that a copy of *this* message it passed up
434 		 * to the raw interface which is done by our caller.
435 		 */
436 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
437 			/* Pick the first ipif on this ill */
438 			mutex_enter(&ill->ill_lock);
439 			src = ill->ill_ipif->ipif_subnet;
440 			mutex_exit(&ill->ill_lock);
441 			ip1dbg(("igmp_input: changed src to 0x%x\n",
442 			    (int)ntohl(src)));
443 			ipha->ipha_src = src;
444 		}
445 
446 		/*
447 		 * If our ill has ILMs that belong to the group being
448 		 * reported, and we are a 'Delaying Member' in the RFC
449 		 * terminology, stop our timer for that group and 'clear
450 		 * flag' i.e. mark as IGMP_OTHERMEMBER.
451 		 */
452 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
453 		IN6_IPADDR_TO_V4MAPPED(group, &v6group);
454 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
455 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
456 				continue;
457 
458 			++ipst->ips_igmpstat.igps_rcv_ourreports;
459 			ilm->ilm_timer = INFINITY;
460 			ilm->ilm_state = IGMP_OTHERMEMBER;
461 		} /* for */
462 		rw_exit(&ill->ill_mcast_lock);
463 		ill_mcast_timer_start(ill->ill_ipst);
464 		break;
465 
466 	case IGMP_V3_MEMBERSHIP_REPORT:
467 		/*
468 		 * Currently nothing to do here; IGMP router is not
469 		 * implemented in ip, and v3 hosts don't pay attention
470 		 * to membership reports.
471 		 */
472 		break;
473 	}
474 	/*
475 	 * Pass all valid IGMP packets up to any process(es) listening
476 	 * on a raw IGMP socket. Do not free the packet.
477 	 */
478 	return (mp);
479 
480 bad_pkt:
481 	freemsg(mp);
482 	return (NULL);
483 }
484 
485 static uint_t
486 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
487 {
488 	ilm_t	*ilm;
489 	int	timer;
490 	uint_t	next, current;
491 	ip_stack_t	 *ipst;
492 
493 	ipst = ill->ill_ipst;
494 	++ipst->ips_igmpstat.igps_rcv_queries;
495 
496 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
497 	/*
498 	 * In the IGMPv2 specification, there are 3 states and a flag.
499 	 *
500 	 * In Non-Member state, we simply don't have a membership record.
501 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
502 	 * < INFINITY).  In Idle Member state, our timer is not running
503 	 * (ilm->ilm_timer == INFINITY).
504 	 *
505 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
506 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
507 	 * if I sent the last report.
508 	 */
509 	if ((igmpa->igmpa_code == 0) ||
510 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
511 		/*
512 		 * Query from an old router.
513 		 * Remember that the querier on this interface is old,
514 		 * and set the timer to the value in RFC 1112.
515 		 */
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_inc_16(&ill->ill_ifptr->illif_mcast_v1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 
525 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
526 
527 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
528 		    igmpa->igmpa_group != 0) {
529 			++ipst->ips_igmpstat.igps_rcv_badqueries;
530 			rw_exit(&ill->ill_mcast_lock);
531 			ill_mcast_timer_start(ill->ill_ipst);
532 			return (0);
533 		}
534 
535 	} else {
536 		in_addr_t group;
537 
538 		/*
539 		 * Query from a new router
540 		 * Simply do a validity check
541 		 */
542 		group = igmpa->igmpa_group;
543 		if (group != 0 && (!CLASSD(group))) {
544 			++ipst->ips_igmpstat.igps_rcv_badqueries;
545 			rw_exit(&ill->ill_mcast_lock);
546 			ill_mcast_timer_start(ill->ill_ipst);
547 			return (0);
548 		}
549 
550 		/*
551 		 * Switch interface state to v2 on receipt of a v2 query
552 		 * ONLY IF current state is v3.  Let things be if current
553 		 * state if v1 but do reset the v2-querier-present timer.
554 		 */
555 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
556 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
557 			    "to IGMP_V2_ROUTER", ill->ill_name));
558 			atomic_inc_16(&ill->ill_ifptr->illif_mcast_v2);
559 			ill->ill_mcast_type = IGMP_V2_ROUTER;
560 		}
561 		ill->ill_mcast_v2_time = 0;
562 		ill->ill_mcast_v2_tset = 1;
563 
564 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
565 	}
566 
567 	if (ip_debug > 1) {
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 	}
573 
574 	/*
575 	 * -Start the timers in all of our membership records
576 	 *  for the physical interface on which the query
577 	 *  arrived, excluding those that belong to the "all
578 	 *  hosts" group (224.0.0.1).
579 	 *
580 	 * -Restart any timer that is already running but has
581 	 *  a value longer than the requested timeout.
582 	 *
583 	 * -Use the value specified in the query message as
584 	 *  the maximum timeout.
585 	 */
586 	next = (unsigned)INFINITY;
587 
588 	current = CURRENT_MSTIME;
589 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
590 
591 		/*
592 		 * A multicast router joins INADDR_ANY address
593 		 * to enable promiscuous reception of all
594 		 * mcasts from the interface. This INADDR_ANY
595 		 * is stored in the ilm_v6addr as V6 unspec addr
596 		 */
597 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
598 			continue;
599 		if (ilm->ilm_addr == htonl(INADDR_ANY))
600 			continue;
601 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
602 		    (igmpa->igmpa_group == 0) ||
603 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
604 			if (ilm->ilm_timer > timer) {
605 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
606 				if (ilm->ilm_timer < next)
607 					next = ilm->ilm_timer;
608 				ilm->ilm_timer += current;
609 			}
610 		}
611 	}
612 	rw_exit(&ill->ill_mcast_lock);
613 	/*
614 	 * No packets have been sent above - no
615 	 * ill_mcast_send_queued is needed.
616 	 */
617 	ill_mcast_timer_start(ill->ill_ipst);
618 
619 	return (next);
620 }
621 
622 static uint_t
623 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
624 {
625 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
626 	uint_t		current;
627 	ilm_t		*ilm;
628 	ipaddr_t	*src_array;
629 	uint8_t		qrv;
630 	ip_stack_t	 *ipst;
631 
632 	ipst = ill->ill_ipst;
633 	/* make sure numsrc matches packet size */
634 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
635 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
636 		++ipst->ips_igmpstat.igps_rcv_tooshort;
637 		return (0);
638 	}
639 	src_array = (ipaddr_t *)&igmp3qa[1];
640 
641 	++ipst->ips_igmpstat.igps_rcv_queries;
642 
643 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
644 
645 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
646 		uint_t hdrval, mant, exp;
647 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
648 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
649 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
650 		mrd = (mant | 0x10) << (exp + 3);
651 	}
652 	if (mrd == 0)
653 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
654 	timer = DSEC_TO_MSEC(mrd);
655 	MCAST_RANDOM_DELAY(delay, timer);
656 	next = (unsigned)INFINITY;
657 	current = CURRENT_MSTIME;
658 
659 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
660 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
661 	else
662 		ill->ill_mcast_rv = qrv;
663 
664 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
665 		uint_t hdrval, mant, exp;
666 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
667 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
668 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
669 		qqi = (mant | 0x10) << (exp + 3);
670 	}
671 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
672 
673 	/*
674 	 * If we have a pending general query response that's scheduled
675 	 * sooner than the delay we calculated for this response, then
676 	 * no action is required (RFC3376 section 5.2 rule 1)
677 	 */
678 	if (ill->ill_global_timer < (current + delay)) {
679 		rw_exit(&ill->ill_mcast_lock);
680 		ill_mcast_timer_start(ill->ill_ipst);
681 		return (next);
682 	}
683 
684 	/*
685 	 * Now take action depending upon query type:
686 	 * general, group specific, or group/source specific.
687 	 */
688 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
689 		/*
690 		 * general query
691 		 * We know global timer is either not running or is
692 		 * greater than our calculated delay, so reset it to
693 		 * our delay (random value in range [0, response time]).
694 		 */
695 		ill->ill_global_timer =  current + delay;
696 		next = delay;
697 	} else {
698 		/* group or group/source specific query */
699 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
700 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
701 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
702 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
703 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
704 				continue;
705 			/*
706 			 * If the query is group specific or we have a
707 			 * pending group specific query, the response is
708 			 * group specific (pending sources list should be
709 			 * empty).  Otherwise, need to update the pending
710 			 * sources list for the group and source specific
711 			 * response.
712 			 */
713 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
714 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
715 group_query:
716 				FREE_SLIST(ilm->ilm_pendsrcs);
717 				ilm->ilm_pendsrcs = NULL;
718 			} else {
719 				boolean_t overflow;
720 				slist_t *pktl;
721 				if (numsrc > MAX_FILTER_SIZE ||
722 				    (ilm->ilm_pendsrcs == NULL &&
723 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
724 					/*
725 					 * We've been sent more sources than
726 					 * we can deal with; or we can't deal
727 					 * with a source list at all.  Revert
728 					 * to a group specific query.
729 					 */
730 					goto group_query;
731 				}
732 				if ((pktl = l_alloc()) == NULL)
733 					goto group_query;
734 				pktl->sl_numsrc = numsrc;
735 				for (i = 0; i < numsrc; i++)
736 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
737 					    &(pktl->sl_addr[i]));
738 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
739 				    &overflow);
740 				l_free(pktl);
741 				if (overflow)
742 					goto group_query;
743 			}
744 
745 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
746 			    INFINITY : (ilm->ilm_timer - current);
747 			/* choose soonest timer */
748 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
749 			if (ilm->ilm_timer < next)
750 				next = ilm->ilm_timer;
751 			ilm->ilm_timer += current;
752 		}
753 	}
754 	rw_exit(&ill->ill_mcast_lock);
755 	/*
756 	 * No packets have been sent above - no
757 	 * ill_mcast_send_queued is needed.
758 	 */
759 	ill_mcast_timer_start(ill->ill_ipst);
760 
761 	return (next);
762 }
763 
764 /*
765  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
766  * and it gets sent after the lock is dropped.
767  */
768 void
769 igmp_joingroup(ilm_t *ilm)
770 {
771 	uint_t	timer;
772 	ill_t	*ill;
773 	ip_stack_t	*ipst = ilm->ilm_ipst;
774 
775 	ill = ilm->ilm_ill;
776 
777 	ASSERT(!ill->ill_isv6);
778 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
779 
780 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
781 		ilm->ilm_rtx.rtx_timer = timer = INFINITY;
782 		ilm->ilm_state = IGMP_OTHERMEMBER;
783 	} else {
784 		ip1dbg(("Querier mode %d, sending report, group %x\n",
785 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
786 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
787 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
788 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
789 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
790 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
791 			mrec_t *rp;
792 			mcast_record_t rtype;
793 			/*
794 			 * The possible state changes we need to handle here:
795 			 *   Old State	New State	Report
796 			 *
797 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
798 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
799 			 *
800 			 * No need to send the BLOCK(0) report; ALLOW(X)
801 			 * is enough.
802 			 */
803 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
804 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
805 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
806 			    ilm->ilm_filter, NULL);
807 			igmpv3_sendrpt(ill, rp);
808 			/*
809 			 * Set up retransmission state.  Timer is set below,
810 			 * for both v3 and older versions.
811 			 */
812 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
813 			    ilm->ilm_filter);
814 		}
815 
816 		/* Set the ilm timer value */
817 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
818 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
819 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
820 		timer = ilm->ilm_rtx.rtx_timer;
821 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
822 		ilm->ilm_state = IGMP_IREPORTEDLAST;
823 
824 		/*
825 		 * We are holding ill_mcast_lock here and the timeout
826 		 * handler (igmp_timeout_handler_per_ill) acquires that
827 		 * lock. Hence we can't call igmp_start_timers since it could
828 		 * deadlock in untimeout().
829 		 * Instead the thread which drops ill_mcast_lock will have
830 		 * to call ill_mcast_timer_start().
831 		 */
832 		mutex_enter(&ipst->ips_igmp_timer_lock);
833 		ipst->ips_igmp_deferred_next = MIN(timer,
834 		    ipst->ips_igmp_deferred_next);
835 		mutex_exit(&ipst->ips_igmp_timer_lock);
836 	}
837 
838 	if (ip_debug > 1) {
839 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
840 		    "igmp_joingroup: multicast_type %d timer %d",
841 		    (ilm->ilm_ill->ill_mcast_type),
842 		    (int)ntohl(timer));
843 	}
844 }
845 
846 /*
847  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
848  * and it gets sent after the lock is dropped.
849  */
850 void
851 mld_joingroup(ilm_t *ilm)
852 {
853 	uint_t	timer;
854 	ill_t	*ill;
855 	ip_stack_t	*ipst = ilm->ilm_ipst;
856 
857 	ill = ilm->ilm_ill;
858 
859 	ASSERT(ill->ill_isv6);
860 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
861 
862 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
863 		ilm->ilm_rtx.rtx_timer = timer = INFINITY;
864 		ilm->ilm_state = IGMP_OTHERMEMBER;
865 	} else {
866 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
867 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
868 		} else {
869 			mrec_t *rp;
870 			mcast_record_t rtype;
871 			/*
872 			 * The possible state changes we need to handle here:
873 			 *	Old State   New State	Report
874 			 *
875 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
876 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
877 			 *
878 			 * No need to send the BLOCK(0) report; ALLOW(X)
879 			 * is enough
880 			 */
881 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
882 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
883 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
884 			    ilm->ilm_filter, NULL);
885 			mldv2_sendrpt(ill, rp);
886 			/*
887 			 * Set up retransmission state.  Timer is set below,
888 			 * for both v2 and v1.
889 			 */
890 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
891 			    ilm->ilm_filter);
892 		}
893 
894 		/* Set the ilm timer value */
895 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
896 		    ilm->ilm_rtx.rtx_cnt > 0);
897 
898 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
899 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
900 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
901 		timer = ilm->ilm_rtx.rtx_timer;
902 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
903 		ilm->ilm_state = IGMP_IREPORTEDLAST;
904 
905 		/*
906 		 * We are holding ill_mcast_lock here and the timeout
907 		 * handler (mld_timeout_handler_per_ill) acquires that
908 		 * lock. Hence we can't call mld_start_timers since it could
909 		 * deadlock in untimeout().
910 		 * Instead the thread which drops ill_mcast_lock will have
911 		 * to call ill_mcast_timer_start().
912 		 */
913 		mutex_enter(&ipst->ips_mld_timer_lock);
914 		ipst->ips_mld_deferred_next = MIN(timer,
915 		    ipst->ips_mld_deferred_next);
916 		mutex_exit(&ipst->ips_mld_timer_lock);
917 	}
918 
919 	if (ip_debug > 1) {
920 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
921 		    "mld_joingroup: multicast_type %d timer %d",
922 		    (ilm->ilm_ill->ill_mcast_type),
923 		    (int)ntohl(timer));
924 	}
925 }
926 
927 /*
928  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
929  * and it gets sent after the lock is dropped.
930  */
931 void
932 igmp_leavegroup(ilm_t *ilm)
933 {
934 	ill_t *ill = ilm->ilm_ill;
935 
936 	ASSERT(!ill->ill_isv6);
937 
938 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
939 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
940 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
941 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
942 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
943 		    (htonl(INADDR_ALLRTRS_GROUP)));
944 		return;
945 	}
946 	if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
947 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
948 		mrec_t *rp;
949 		/*
950 		 * The possible state changes we need to handle here:
951 		 *	Old State	New State	Report
952 		 *
953 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
954 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
955 		 *
956 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
957 		 */
958 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
959 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
960 			    ilm->ilm_filter, NULL);
961 		} else {
962 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
963 			    NULL, NULL);
964 		}
965 		igmpv3_sendrpt(ill, rp);
966 		return;
967 	}
968 }
969 
970 /*
971  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
972  * and it gets sent after the lock is dropped.
973  */
974 void
975 mld_leavegroup(ilm_t *ilm)
976 {
977 	ill_t *ill = ilm->ilm_ill;
978 
979 	ASSERT(ill->ill_isv6);
980 
981 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
982 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
983 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
984 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
985 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
986 		return;
987 	}
988 	if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
989 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
990 		mrec_t *rp;
991 		/*
992 		 * The possible state changes we need to handle here:
993 		 *	Old State	New State	Report
994 		 *
995 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
996 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
997 		 *
998 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
999 		 */
1000 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1001 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1002 			    ilm->ilm_filter, NULL);
1003 		} else {
1004 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
1005 			    NULL, NULL);
1006 		}
1007 		mldv2_sendrpt(ill, rp);
1008 		return;
1009 	}
1010 }
1011 
1012 /*
1013  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1014  * and it gets sent after the lock is dropped.
1015  */
1016 void
1017 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1018 {
1019 	ill_t *ill;
1020 	mrec_t *rp;
1021 	ip_stack_t	*ipst = ilm->ilm_ipst;
1022 
1023 	ASSERT(ilm != NULL);
1024 
1025 	/* state change reports should only be sent if the router is v3 */
1026 	if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
1027 		return;
1028 
1029 	ill = ilm->ilm_ill;
1030 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1031 
1032 	/*
1033 	 * Compare existing(old) state with the new state and prepare
1034 	 * State Change Report, according to the rules in RFC 3376:
1035 	 *
1036 	 *	Old State	New State	State Change Report
1037 	 *
1038 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1039 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1040 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1041 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1042 	 */
1043 
1044 	if (ilm->ilm_fmode == fmode) {
1045 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1046 		slist_t *allow, *block;
1047 		if (((a_minus_b = l_alloc()) == NULL) ||
1048 		    ((b_minus_a = l_alloc()) == NULL)) {
1049 			l_free(a_minus_b);
1050 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1051 				goto send_to_ex;
1052 			else
1053 				goto send_to_in;
1054 		}
1055 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1056 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1057 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1058 			allow = b_minus_a;
1059 			block = a_minus_b;
1060 		} else {
1061 			allow = a_minus_b;
1062 			block = b_minus_a;
1063 		}
1064 		rp = NULL;
1065 		if (!SLIST_IS_EMPTY(allow))
1066 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1067 			    allow, rp);
1068 		if (!SLIST_IS_EMPTY(block))
1069 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1070 			    block, rp);
1071 		l_free(a_minus_b);
1072 		l_free(b_minus_a);
1073 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1074 send_to_ex:
1075 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1076 		    NULL);
1077 	} else {
1078 send_to_in:
1079 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1080 		    NULL);
1081 	}
1082 
1083 	/*
1084 	 * Need to set up retransmission state; merge the new info with the
1085 	 * current state (which may be null).  If the timer is not currently
1086 	 * running, the caller will start it when dropping ill_mcast_lock.
1087 	 */
1088 	rp = mcast_merge_rtx(ilm, rp, flist);
1089 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1090 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1091 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1092 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1093 		mutex_enter(&ipst->ips_igmp_timer_lock);
1094 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1095 		    ilm->ilm_rtx.rtx_timer);
1096 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1097 		mutex_exit(&ipst->ips_igmp_timer_lock);
1098 	}
1099 
1100 	igmpv3_sendrpt(ill, rp);
1101 }
1102 
1103 /*
1104  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1105  * and it gets sent after the lock is dropped.
1106  */
1107 void
1108 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1109 {
1110 	ill_t *ill;
1111 	mrec_t *rp = NULL;
1112 	ip_stack_t	*ipst = ilm->ilm_ipst;
1113 
1114 	ASSERT(ilm != NULL);
1115 
1116 	ill = ilm->ilm_ill;
1117 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1118 
1119 	/* only need to send if we have an mldv2-capable router */
1120 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1121 		return;
1122 	}
1123 
1124 	/*
1125 	 * Compare existing (old) state with the new state passed in
1126 	 * and send appropriate MLDv2 State Change Report.
1127 	 *
1128 	 *	Old State	New State	State Change Report
1129 	 *
1130 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1131 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1132 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1133 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1134 	 */
1135 	if (ilm->ilm_fmode == fmode) {
1136 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1137 		slist_t *allow, *block;
1138 		if (((a_minus_b = l_alloc()) == NULL) ||
1139 		    ((b_minus_a = l_alloc()) == NULL)) {
1140 			l_free(a_minus_b);
1141 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1142 				goto send_to_ex;
1143 			else
1144 				goto send_to_in;
1145 		}
1146 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1147 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1148 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1149 			allow = b_minus_a;
1150 			block = a_minus_b;
1151 		} else {
1152 			allow = a_minus_b;
1153 			block = b_minus_a;
1154 		}
1155 		if (!SLIST_IS_EMPTY(allow))
1156 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1157 			    allow, rp);
1158 		if (!SLIST_IS_EMPTY(block))
1159 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1160 			    block, rp);
1161 		l_free(a_minus_b);
1162 		l_free(b_minus_a);
1163 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1164 send_to_ex:
1165 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1166 		    NULL);
1167 	} else {
1168 send_to_in:
1169 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1170 		    NULL);
1171 	}
1172 
1173 	/*
1174 	 * Need to set up retransmission state; merge the new info with the
1175 	 * current state (which may be null).  If the timer is not currently
1176 	 * running, the caller will start it when dropping ill_mcast_lock.
1177 	 */
1178 	rp = mcast_merge_rtx(ilm, rp, flist);
1179 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1180 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1181 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1182 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1183 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1184 		mutex_enter(&ipst->ips_mld_timer_lock);
1185 		ipst->ips_mld_deferred_next =
1186 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1187 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1188 		mutex_exit(&ipst->ips_mld_timer_lock);
1189 	}
1190 
1191 	mldv2_sendrpt(ill, rp);
1192 }
1193 
1194 uint_t
1195 igmp_timeout_handler_per_ill(ill_t *ill)
1196 {
1197 	uint_t	next = INFINITY, current;
1198 	ilm_t	*ilm;
1199 	mrec_t	*rp = NULL;
1200 	mrec_t	*rtxrp = NULL;
1201 	rtx_state_t *rtxp;
1202 	mcast_record_t	rtype;
1203 
1204 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1205 
1206 	current = CURRENT_MSTIME;
1207 	/* First check the global timer on this interface */
1208 	if (ill->ill_global_timer == INFINITY)
1209 		goto per_ilm_timer;
1210 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1211 		ill->ill_global_timer = INFINITY;
1212 		/*
1213 		 * Send report for each group on this interface.
1214 		 * Since we just set the global timer (received a v3 general
1215 		 * query), need to skip the all hosts addr (224.0.0.1), per
1216 		 * RFC 3376 section 5.
1217 		 */
1218 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1219 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1220 				continue;
1221 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1222 			    ilm->ilm_filter, rp);
1223 			/*
1224 			 * Since we're sending a report on this group, okay
1225 			 * to delete pending group-specific timers.  Note
1226 			 * that group-specific retransmit timers still need
1227 			 * to be checked in the per_ilm_timer for-loop.
1228 			 */
1229 			ilm->ilm_timer = INFINITY;
1230 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1231 			FREE_SLIST(ilm->ilm_pendsrcs);
1232 			ilm->ilm_pendsrcs = NULL;
1233 		}
1234 		igmpv3_sendrpt(ill, rp);
1235 		rp = NULL;
1236 	} else {
1237 		if ((ill->ill_global_timer - current) < next)
1238 			next = ill->ill_global_timer - current;
1239 	}
1240 
1241 per_ilm_timer:
1242 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1243 		if (ilm->ilm_timer == INFINITY)
1244 			goto per_ilm_rtxtimer;
1245 
1246 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1247 			if ((ilm->ilm_timer - current) < next)
1248 				next = ilm->ilm_timer - current;
1249 
1250 			if (ip_debug > 1) {
1251 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1252 				    "igmp_timo_hlr 2: ilm_timr %d "
1253 				    "typ %d nxt %d",
1254 				    (int)ntohl(ilm->ilm_timer - current),
1255 				    (ill->ill_mcast_type), next);
1256 			}
1257 
1258 			goto per_ilm_rtxtimer;
1259 		}
1260 
1261 		/* the timer has expired, need to take action */
1262 		ilm->ilm_timer = INFINITY;
1263 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1264 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1265 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1266 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1267 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1268 		} else {
1269 			slist_t *rsp;
1270 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1271 			    (rsp = l_alloc()) != NULL) {
1272 				/*
1273 				 * Contents of reply depend on pending
1274 				 * requested source list.
1275 				 */
1276 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1277 					l_intersection(ilm->ilm_filter,
1278 					    ilm->ilm_pendsrcs, rsp);
1279 				} else {
1280 					l_difference(ilm->ilm_pendsrcs,
1281 					    ilm->ilm_filter, rsp);
1282 				}
1283 				FREE_SLIST(ilm->ilm_pendsrcs);
1284 				ilm->ilm_pendsrcs = NULL;
1285 				if (!SLIST_IS_EMPTY(rsp))
1286 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1287 					    &ilm->ilm_v6addr, rsp, rp);
1288 				FREE_SLIST(rsp);
1289 			} else {
1290 				/*
1291 				 * Either the pending request is just group-
1292 				 * specific, or we couldn't get the resources
1293 				 * (rsp) to build a source-specific reply.
1294 				 */
1295 				rp = mcast_bldmrec(ilm->ilm_fmode,
1296 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1297 			}
1298 			igmpv3_sendrpt(ill, rp);
1299 			rp = NULL;
1300 		}
1301 
1302 per_ilm_rtxtimer:
1303 		rtxp = &ilm->ilm_rtx;
1304 
1305 		if (rtxp->rtx_timer == INFINITY)
1306 			continue;
1307 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1308 			if ((rtxp->rtx_timer - current) < next)
1309 				next = rtxp->rtx_timer - current;
1310 			continue;
1311 		}
1312 
1313 		rtxp->rtx_timer = INFINITY;
1314 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1315 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1316 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1317 			continue;
1318 		}
1319 		if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1320 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1321 			continue;
1322 		}
1323 
1324 		/*
1325 		 * The retransmit timer has popped, and our router is
1326 		 * IGMPv3.  We have to delve into the retransmit state
1327 		 * stored in the ilm.
1328 		 *
1329 		 * Decrement the retransmit count.  If the fmode rtx
1330 		 * count is active, decrement it, and send a filter
1331 		 * mode change report with the ilm's source list.
1332 		 * Otherwise, send a source list change report with
1333 		 * the current retransmit lists.
1334 		 */
1335 		ASSERT(rtxp->rtx_cnt > 0);
1336 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1337 		rtxp->rtx_cnt--;
1338 		if (rtxp->rtx_fmode_cnt > 0) {
1339 			rtxp->rtx_fmode_cnt--;
1340 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1341 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1342 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1343 			    ilm->ilm_filter, rtxrp);
1344 		} else {
1345 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1346 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1347 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1348 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1349 		}
1350 		if (rtxp->rtx_cnt > 0) {
1351 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1352 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1353 			if (rtxp->rtx_timer < next)
1354 				next = rtxp->rtx_timer;
1355 			rtxp->rtx_timer += current;
1356 		} else {
1357 			ASSERT(rtxp->rtx_timer == INFINITY);
1358 			CLEAR_SLIST(rtxp->rtx_allow);
1359 			CLEAR_SLIST(rtxp->rtx_block);
1360 		}
1361 		igmpv3_sendrpt(ill, rtxrp);
1362 		rtxrp = NULL;
1363 	}
1364 
1365 	rw_exit(&ill->ill_mcast_lock);
1366 	/* Send any deferred/queued IP packets */
1367 	ill_mcast_send_queued(ill);
1368 	/* Defer ill_mcast_timer_start() until the caller is done */
1369 
1370 	return (next);
1371 }
1372 
1373 /*
1374  * igmp_timeout_handler:
1375  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1376  * Returns number of ticks to next event (or 0 if none).
1377  *
1378  * As part of multicast join and leave igmp we may need to send out an
1379  * igmp request. The igmp related state variables in the ilm are protected
1380  * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
1381  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1382  * starts the igmp timer if needed. It serializes multiple threads trying to
1383  * simultaneously start the timer using the igmp_timer_setter_active flag.
1384  *
1385  * igmp_input() receives igmp queries and responds to the queries
1386  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1387  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1388  * performs the action exclusively after acquiring ill_mcast_lock.
1389  *
1390  * The igmp_slowtimeo() function is called thru another timer.
1391  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1392  */
1393 void
1394 igmp_timeout_handler(void *arg)
1395 {
1396 	ill_t	*ill;
1397 	uint_t  global_next = INFINITY;
1398 	uint_t  next;
1399 	ill_walk_context_t ctx;
1400 	ip_stack_t *ipst = arg;
1401 
1402 	ASSERT(arg != NULL);
1403 	mutex_enter(&ipst->ips_igmp_timer_lock);
1404 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1405 	ipst->ips_igmp_timeout_id = 0;
1406 	ipst->ips_igmp_timer_scheduled_last = 0;
1407 	ipst->ips_igmp_time_to_next = 0;
1408 	mutex_exit(&ipst->ips_igmp_timer_lock);
1409 
1410 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1411 	ill = ILL_START_WALK_V4(&ctx, ipst);
1412 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1413 		ASSERT(!ill->ill_isv6);
1414 		/* Make sure the ill isn't going away. */
1415 		if (!ill_check_and_refhold(ill))
1416 			continue;
1417 		rw_exit(&ipst->ips_ill_g_lock);
1418 		next = igmp_timeout_handler_per_ill(ill);
1419 		if (next < global_next)
1420 			global_next = next;
1421 		ill_refrele(ill);
1422 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1423 	}
1424 	rw_exit(&ipst->ips_ill_g_lock);
1425 	if (global_next != INFINITY)
1426 		igmp_start_timers(global_next, ipst);
1427 }
1428 
1429 /*
1430  * mld_timeout_handler:
1431  * Called when there are timeout events, every next (tick).
1432  * Returns number of ticks to next event (or 0 if none).
1433  */
1434 uint_t
1435 mld_timeout_handler_per_ill(ill_t *ill)
1436 {
1437 	ilm_t	*ilm;
1438 	uint_t	next = INFINITY, current;
1439 	mrec_t	*rp, *rtxrp;
1440 	rtx_state_t *rtxp;
1441 	mcast_record_t	rtype;
1442 
1443 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1444 
1445 	current = CURRENT_MSTIME;
1446 	/*
1447 	 * First check the global timer on this interface; the global timer
1448 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1449 	 */
1450 	if (ill->ill_global_timer == INFINITY)
1451 		goto per_ilm_timer;
1452 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1453 		ill->ill_global_timer = INFINITY;
1454 		/*
1455 		 * Send report for each group on this interface.
1456 		 * Since we just set the global timer (received a v2 general
1457 		 * query), need to skip the all hosts addr (ff02::1), per
1458 		 * RFC 3810 section 6.
1459 		 */
1460 		rp = NULL;
1461 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1462 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1463 			    &ipv6_all_hosts_mcast))
1464 				continue;
1465 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1466 			    ilm->ilm_filter, rp);
1467 			/*
1468 			 * Since we're sending a report on this group, okay
1469 			 * to delete pending group-specific timers.  Note
1470 			 * that group-specific retransmit timers still need
1471 			 * to be checked in the per_ilm_timer for-loop.
1472 			 */
1473 			ilm->ilm_timer = INFINITY;
1474 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1475 			FREE_SLIST(ilm->ilm_pendsrcs);
1476 			ilm->ilm_pendsrcs = NULL;
1477 		}
1478 		mldv2_sendrpt(ill, rp);
1479 	} else {
1480 		if ((ill->ill_global_timer - current) < next)
1481 			next = ill->ill_global_timer - current;
1482 	}
1483 
1484 per_ilm_timer:
1485 	rp = rtxrp = NULL;
1486 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1487 		if (ilm->ilm_timer == INFINITY)
1488 			goto per_ilm_rtxtimer;
1489 
1490 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1491 			if ((ilm->ilm_timer - current) < next)
1492 				next = ilm->ilm_timer - current;
1493 
1494 			if (ip_debug > 1) {
1495 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1496 				    "igmp_timo_hlr 2: ilm_timr"
1497 				    " %d typ %d nxt %d",
1498 				    (int)ntohl(ilm->ilm_timer - current),
1499 				    (ill->ill_mcast_type), next);
1500 			}
1501 
1502 			goto per_ilm_rtxtimer;
1503 		}
1504 
1505 		/* the timer has expired, need to take action */
1506 		ilm->ilm_timer = INFINITY;
1507 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1508 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1509 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1510 		} else {
1511 			slist_t *rsp;
1512 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1513 			    (rsp = l_alloc()) != NULL) {
1514 				/*
1515 				 * Contents of reply depend on pending
1516 				 * requested source list.
1517 				 */
1518 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1519 					l_intersection(ilm->ilm_filter,
1520 					    ilm->ilm_pendsrcs, rsp);
1521 				} else {
1522 					l_difference(ilm->ilm_pendsrcs,
1523 					    ilm->ilm_filter, rsp);
1524 				}
1525 				FREE_SLIST(ilm->ilm_pendsrcs);
1526 				ilm->ilm_pendsrcs = NULL;
1527 				if (!SLIST_IS_EMPTY(rsp))
1528 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1529 					    &ilm->ilm_v6addr, rsp, rp);
1530 				FREE_SLIST(rsp);
1531 			} else {
1532 				rp = mcast_bldmrec(ilm->ilm_fmode,
1533 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1534 			}
1535 		}
1536 
1537 per_ilm_rtxtimer:
1538 		rtxp = &ilm->ilm_rtx;
1539 
1540 		if (rtxp->rtx_timer == INFINITY)
1541 			continue;
1542 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1543 			if ((rtxp->rtx_timer - current) < next)
1544 				next = rtxp->rtx_timer - current;
1545 			continue;
1546 		}
1547 
1548 		rtxp->rtx_timer = INFINITY;
1549 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1550 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1551 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1552 			continue;
1553 		}
1554 
1555 		/*
1556 		 * The retransmit timer has popped, and our router is
1557 		 * MLDv2.  We have to delve into the retransmit state
1558 		 * stored in the ilm.
1559 		 *
1560 		 * Decrement the retransmit count.  If the fmode rtx
1561 		 * count is active, decrement it, and send a filter
1562 		 * mode change report with the ilm's source list.
1563 		 * Otherwise, send a source list change report with
1564 		 * the current retransmit lists.
1565 		 */
1566 		ASSERT(rtxp->rtx_cnt > 0);
1567 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1568 		rtxp->rtx_cnt--;
1569 		if (rtxp->rtx_fmode_cnt > 0) {
1570 			rtxp->rtx_fmode_cnt--;
1571 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1572 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1573 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1574 			    ilm->ilm_filter, rtxrp);
1575 		} else {
1576 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1577 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1578 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1579 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1580 		}
1581 		if (rtxp->rtx_cnt > 0) {
1582 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1583 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1584 			if (rtxp->rtx_timer < next)
1585 				next = rtxp->rtx_timer;
1586 			rtxp->rtx_timer += current;
1587 		} else {
1588 			ASSERT(rtxp->rtx_timer == INFINITY);
1589 			CLEAR_SLIST(rtxp->rtx_allow);
1590 			CLEAR_SLIST(rtxp->rtx_block);
1591 		}
1592 	}
1593 
1594 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1595 		mldv2_sendrpt(ill, rp);
1596 		mldv2_sendrpt(ill, rtxrp);
1597 	}
1598 	rw_exit(&ill->ill_mcast_lock);
1599 	/* Send any deferred/queued IP packets */
1600 	ill_mcast_send_queued(ill);
1601 	/* Defer ill_mcast_timer_start() until the caller is done */
1602 
1603 	return (next);
1604 }
1605 
1606 /*
1607  * mld_timeout_handler:
1608  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1609  * Returns number of ticks to next event (or 0 if none).
1610  * MT issues are same as igmp_timeout_handler
1611  */
1612 void
1613 mld_timeout_handler(void *arg)
1614 {
1615 	ill_t	*ill;
1616 	uint_t  global_next = INFINITY;
1617 	uint_t  next;
1618 	ill_walk_context_t ctx;
1619 	ip_stack_t *ipst = arg;
1620 
1621 	ASSERT(arg != NULL);
1622 	mutex_enter(&ipst->ips_mld_timer_lock);
1623 	ASSERT(ipst->ips_mld_timeout_id != 0);
1624 	ipst->ips_mld_timeout_id = 0;
1625 	ipst->ips_mld_timer_scheduled_last = 0;
1626 	ipst->ips_mld_time_to_next = 0;
1627 	mutex_exit(&ipst->ips_mld_timer_lock);
1628 
1629 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1630 	ill = ILL_START_WALK_V6(&ctx, ipst);
1631 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1632 		ASSERT(ill->ill_isv6);
1633 		/* Make sure the ill isn't going away. */
1634 		if (!ill_check_and_refhold(ill))
1635 			continue;
1636 		rw_exit(&ipst->ips_ill_g_lock);
1637 		next = mld_timeout_handler_per_ill(ill);
1638 		if (next < global_next)
1639 			global_next = next;
1640 		ill_refrele(ill);
1641 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1642 	}
1643 	rw_exit(&ipst->ips_ill_g_lock);
1644 	if (global_next != INFINITY)
1645 		mld_start_timers(global_next, ipst);
1646 }
1647 
/*
 * Older Version Querier Present timeout for the given ill, expressed as a
 * count of slowtimo intervals:
 *
 *	1000 * (ill_mcast_rv * ill_mcast_qi + MCAST_QUERY_RESP_INTERVAL)
 *	    / MCAST_SLOWTIMO_INTERVAL
 *
 * Note that the argument is evaluated twice.
 */
#define	OVQP(ill)							\
	(((((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) +		\
	MCAST_QUERY_RESP_INTERVAL) * 1000) / MCAST_SLOWTIMO_INTERVAL)
1655 
1656 /*
1657  * igmp_slowtimo:
1658  * - Resets to new router if we didnt we hear from the router
1659  *   in IGMP_AGE_THRESHOLD seconds.
1660  * - Resets slowtimeout.
1661  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1662  * IGMP version than configured.
1663  */
1664 void
1665 igmp_slowtimo(void *arg)
1666 {
1667 	ill_t	*ill;
1668 	ill_if_t *ifp;
1669 	avl_tree_t *avl_tree;
1670 	ip_stack_t *ipst = (ip_stack_t *)arg;
1671 
1672 	ASSERT(arg != NULL);
1673 
1674 	/*
1675 	 * The ill_if_t list is circular, hence the odd loop parameters.
1676 	 *
1677 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1678 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1679 	 * structure (allowing us to skip if none of the instances have timers
1680 	 * running).
1681 	 */
1682 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1683 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1684 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1685 	    ifp = ifp->illif_next) {
1686 		/*
1687 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1688 		 * a V1 or V2 query now and we miss seeing the count now,
1689 		 * we will see it the next time igmp_slowtimo is called.
1690 		 */
1691 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1692 			continue;
1693 
1694 		avl_tree = &ifp->illif_avl_by_ppa;
1695 		for (ill = avl_first(avl_tree); ill != NULL;
1696 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1697 			/* Make sure the ill isn't going away. */
1698 			if (!ill_check_and_refhold(ill))
1699 				continue;
1700 			rw_exit(&ipst->ips_ill_g_lock);
1701 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1702 			if (ill->ill_mcast_v1_tset == 1)
1703 				ill->ill_mcast_v1_time++;
1704 			if (ill->ill_mcast_v2_tset == 1)
1705 				ill->ill_mcast_v2_time++;
1706 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1707 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1708 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1709 				if ((ill->ill_mcast_v2_tset > 0) ||
1710 				    (ipst->ips_igmp_max_version ==
1711 				    IGMP_V2_ROUTER)) {
1712 					ip1dbg(("V1 query timer "
1713 					    "expired on %s; switching "
1714 					    "mode to IGMP_V2\n",
1715 					    ill->ill_name));
1716 					ill->ill_mcast_type =
1717 					    IGMP_V2_ROUTER;
1718 				} else {
1719 					ip1dbg(("V1 query timer "
1720 					    "expired on %s; switching "
1721 					    "mode to IGMP_V3\n",
1722 					    ill->ill_name));
1723 					ill->ill_mcast_type =
1724 					    IGMP_V3_ROUTER;
1725 				}
1726 				ill->ill_mcast_v1_time = 0;
1727 				ill->ill_mcast_v1_tset = 0;
1728 				atomic_dec_16(&ifp->illif_mcast_v1);
1729 			}
1730 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1731 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1732 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1733 				ip1dbg(("V2 query timer expired on "
1734 				    "%s; switching mode to IGMP_V3\n",
1735 				    ill->ill_name));
1736 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1737 				ill->ill_mcast_v2_time = 0;
1738 				ill->ill_mcast_v2_tset = 0;
1739 				atomic_dec_16(&ifp->illif_mcast_v2);
1740 			}
1741 			rw_exit(&ill->ill_mcast_lock);
1742 			ill_refrele(ill);
1743 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1744 		}
1745 	}
1746 	rw_exit(&ipst->ips_ill_g_lock);
1747 	ill_mcast_timer_start(ipst);
1748 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1749 	if (ipst->ips_igmp_slowtimeout_quiesce != B_TRUE) {
1750 		ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
1751 		    (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1752 	} else {
1753 		ipst->ips_igmp_slowtimeout_id = 0;
1754 	}
1755 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1756 }
1757 
1758 /*
1759  * mld_slowtimo:
1760  * - Resets to newer version if we didn't hear from the older version router
1761  *   in MLD_AGE_THRESHOLD seconds.
1762  * - Restarts slowtimeout.
1763  * Check for ips_mld_max_version ensures that we don't revert to a higher
1764  * IGMP version than configured.
1765  */
1766 void
1767 mld_slowtimo(void *arg)
1768 {
1769 	ill_t *ill;
1770 	ill_if_t *ifp;
1771 	avl_tree_t *avl_tree;
1772 	ip_stack_t *ipst = (ip_stack_t *)arg;
1773 
1774 	ASSERT(arg != NULL);
1775 	/* See comments in igmp_slowtimo() above... */
1776 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1777 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1778 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1779 	    ifp = ifp->illif_next) {
1780 		if (ifp->illif_mcast_v1 == 0)
1781 			continue;
1782 
1783 		avl_tree = &ifp->illif_avl_by_ppa;
1784 		for (ill = avl_first(avl_tree); ill != NULL;
1785 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1786 			/* Make sure the ill isn't going away. */
1787 			if (!ill_check_and_refhold(ill))
1788 				continue;
1789 			rw_exit(&ipst->ips_ill_g_lock);
1790 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1791 			if (ill->ill_mcast_v1_tset == 1)
1792 				ill->ill_mcast_v1_time++;
1793 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1794 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1795 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1796 				ip1dbg(("MLD query timer expired on"
1797 				    " %s; switching mode to MLD_V2\n",
1798 				    ill->ill_name));
1799 				ill->ill_mcast_type = MLD_V2_ROUTER;
1800 				ill->ill_mcast_v1_time = 0;
1801 				ill->ill_mcast_v1_tset = 0;
1802 				atomic_dec_16(&ifp->illif_mcast_v1);
1803 			}
1804 			rw_exit(&ill->ill_mcast_lock);
1805 			ill_refrele(ill);
1806 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1807 		}
1808 	}
1809 	rw_exit(&ipst->ips_ill_g_lock);
1810 	ill_mcast_timer_start(ipst);
1811 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1812 	if (ipst->ips_mld_slowtimeout_quiesce != B_TRUE) {
1813 		ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
1814 		    (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1815 	} else {
1816 		ipst->ips_mld_slowtimeout_id = 0;
1817 	}
1818 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1819 }
1820 
1821 /*
1822  * igmp_sendpkt:
1823  * This will send to ip_output_simple just like icmp_inbound.
1824  */
1825 static void
1826 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1827 {
1828 	mblk_t	*mp;
1829 	igmpa_t	*igmpa;
1830 	uint8_t *rtralert;
1831 	ipha_t	*ipha;
1832 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1833 	size_t	size  = hdrlen + sizeof (igmpa_t);
1834 	ill_t	*ill  = ilm->ilm_ill;
1835 	ip_stack_t *ipst = ill->ill_ipst;
1836 
1837 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1838 
1839 	mp = allocb(size, BPRI_HI);
1840 	if (mp == NULL) {
1841 		return;
1842 	}
1843 	mp->b_wptr = mp->b_rptr + size;
1844 
1845 	ipha = (ipha_t *)mp->b_rptr;
1846 	rtralert = (uint8_t *)&(ipha[1]);
1847 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1848 	igmpa->igmpa_type   = type;
1849 	igmpa->igmpa_code   = 0;
1850 	igmpa->igmpa_group  = ilm->ilm_addr;
1851 	igmpa->igmpa_cksum  = 0;
1852 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1853 
1854 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1855 	rtralert[1] = RTRALERT_LEN;
1856 	rtralert[2] = 0;
1857 	rtralert[3] = 0;
1858 
1859 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1860 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1861 	ipha->ipha_type_of_service	= 0;
1862 	ipha->ipha_length = htons(size);
1863 	ipha->ipha_ident = 0;
1864 	ipha->ipha_fragment_offset_and_flags = 0;
1865 	ipha->ipha_ttl		= IGMP_TTL;
1866 	ipha->ipha_protocol	= IPPROTO_IGMP;
1867 	ipha->ipha_hdr_checksum	= 0;
1868 	ipha->ipha_dst		= addr ? addr : igmpa->igmpa_group;
1869 	ipha->ipha_src		= INADDR_ANY;
1870 
1871 	ill_mcast_queue(ill, mp);
1872 
1873 	++ipst->ips_igmpstat.igps_snd_reports;
1874 }
1875 
1876 /*
1877  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
1878  * The report will contain one group record
1879  * for each element of reclist.  If this causes packet length to
1880  * exceed ill->ill_mc_mtu, multiple reports are sent.
1881  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1882  * and those buffers are freed here.
1883  */
1884 static void
1885 igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
1886 {
1887 	igmp3ra_t *igmp3ra;
1888 	grphdra_t *grphdr;
1889 	mblk_t *mp;
1890 	ipha_t *ipha;
1891 	uint8_t *rtralert;
1892 	ipaddr_t *src_array;
1893 	int i, j, numrec, more_src_cnt;
1894 	size_t hdrsize, size, rsize;
1895 	mrec_t *rp, *cur_reclist;
1896 	mrec_t *next_reclist = reclist;
1897 	boolean_t morepkts;
1898 	ip_stack_t	 *ipst = ill->ill_ipst;
1899 
1900 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1901 
1902 	/* if there aren't any records, there's nothing to send */
1903 	if (reclist == NULL)
1904 		return;
1905 
1906 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1907 nextpkt:
1908 	size = hdrsize + sizeof (igmp3ra_t);
1909 	morepkts = B_FALSE;
1910 	more_src_cnt = 0;
1911 	cur_reclist = next_reclist;
1912 	numrec = 0;
1913 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1914 		rsize = sizeof (grphdra_t) +
1915 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
1916 		if (size + rsize > ill->ill_mc_mtu) {
1917 			if (rp == cur_reclist) {
1918 				/*
1919 				 * If the first mrec we looked at is too big
1920 				 * to fit in a single packet (i.e the source
1921 				 * list is too big), we must either truncate
1922 				 * the list (if TO_EX or IS_EX), or send
1923 				 * multiple reports for the same group (all
1924 				 * other types).
1925 				 */
1926 				int srcspace, srcsperpkt;
1927 				srcspace = ill->ill_mc_mtu - (size +
1928 				    sizeof (grphdra_t));
1929 
1930 				/*
1931 				 * Skip if there's not even enough room in
1932 				 * a single packet to send something useful.
1933 				 */
1934 				if (srcspace <= sizeof (ipaddr_t))
1935 					continue;
1936 
1937 				srcsperpkt = srcspace / sizeof (ipaddr_t);
1938 				/*
1939 				 * Increment size and numrec, because we will
1940 				 * be sending a record for the mrec we're
1941 				 * looking at now.
1942 				 */
1943 				size += sizeof (grphdra_t) +
1944 				    (srcsperpkt * sizeof (ipaddr_t));
1945 				numrec++;
1946 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
1947 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
1948 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1949 					if (rp->mrec_next == NULL) {
1950 						/* no more packets to send */
1951 						break;
1952 					} else {
1953 						/*
1954 						 * more packets, but we're
1955 						 * done with this mrec.
1956 						 */
1957 						next_reclist = rp->mrec_next;
1958 					}
1959 				} else {
1960 					more_src_cnt = rp->mrec_srcs.sl_numsrc
1961 					    - srcsperpkt;
1962 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1963 					/*
1964 					 * We'll fix up this mrec (remove the
1965 					 * srcs we've already sent) before
1966 					 * returning to nextpkt above.
1967 					 */
1968 					next_reclist = rp;
1969 				}
1970 			} else {
1971 				next_reclist = rp;
1972 			}
1973 			morepkts = B_TRUE;
1974 			break;
1975 		}
1976 		size += rsize;
1977 		numrec++;
1978 	}
1979 
1980 	mp = allocb(size, BPRI_HI);
1981 	if (mp == NULL) {
1982 		goto free_reclist;
1983 	}
1984 	bzero((char *)mp->b_rptr, size);
1985 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
1986 
1987 	ipha = (ipha_t *)mp->b_rptr;
1988 	rtralert = (uint8_t *)&(ipha[1]);
1989 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
1990 	grphdr = (grphdra_t *)&(igmp3ra[1]);
1991 
1992 	rp = cur_reclist;
1993 	for (i = 0; i < numrec; i++) {
1994 		grphdr->grphdra_type = rp->mrec_type;
1995 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
1996 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
1997 		src_array = (ipaddr_t *)&(grphdr[1]);
1998 
1999 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2000 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2001 
2002 		grphdr = (grphdra_t *)&(src_array[j]);
2003 		rp = rp->mrec_next;
2004 	}
2005 
2006 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2007 	igmp3ra->igmp3ra_numrec = htons(numrec);
2008 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2009 
2010 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2011 	rtralert[1] = RTRALERT_LEN;
2012 	rtralert[2] = 0;
2013 	rtralert[3] = 0;
2014 
2015 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2016 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2017 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2018 	ipha->ipha_length = htons(size);
2019 	ipha->ipha_ttl = IGMP_TTL;
2020 	ipha->ipha_protocol = IPPROTO_IGMP;
2021 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2022 	ipha->ipha_src = INADDR_ANY;
2023 
2024 	ill_mcast_queue(ill, mp);
2025 
2026 	++ipst->ips_igmpstat.igps_snd_reports;
2027 
2028 	if (morepkts) {
2029 		if (more_src_cnt > 0) {
2030 			int index, mvsize;
2031 			slist_t *sl = &next_reclist->mrec_srcs;
2032 			index = sl->sl_numsrc;
2033 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2034 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2035 			    mvsize);
2036 			sl->sl_numsrc = more_src_cnt;
2037 		}
2038 		goto nextpkt;
2039 	}
2040 
2041 free_reclist:
2042 	while (reclist != NULL) {
2043 		rp = reclist->mrec_next;
2044 		mi_free(reclist);
2045 		reclist = rp;
2046 	}
2047 }
2048 
2049 /*
2050  * mld_input:
2051  * Return NULL for a bad packet that is discarded here.
2052  * Return mp if the message is OK and should be handed to "raw" receivers.
2053  * Callers of mld_input() may need to reinitialize variables that were copied
2054  * from the mblk as this calls pullupmsg().
2055  */
2056 mblk_t *
2057 mld_input(mblk_t *mp, ip_recv_attr_t *ira)
2058 {
2059 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2060 	mld_hdr_t	*mldh;
2061 	ilm_t		*ilm;
2062 	ipif_t		*ipif;
2063 	uint16_t	hdr_length, exthdr_length;
2064 	in6_addr_t	*v6group_ptr;
2065 	uint_t		next;
2066 	int		mldlen;
2067 	ill_t		*ill = ira->ira_ill;
2068 	ip_stack_t	*ipst = ill->ill_ipst;
2069 
2070 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2071 
2072 	/* Make sure the src address of the packet is link-local */
2073 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2074 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2075 		freemsg(mp);
2076 		return (NULL);
2077 	}
2078 
2079 	if (ip6h->ip6_hlim != 1) {
2080 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2081 		freemsg(mp);
2082 		return (NULL);
2083 	}
2084 
2085 	/* Get to the icmp header part */
2086 	hdr_length = ira->ira_ip_hdr_length;
2087 	exthdr_length = hdr_length - IPV6_HDR_LEN;
2088 
2089 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2090 
2091 	/* An MLD packet must at least be 24 octets to be valid */
2092 	if (mldlen < MLD_MINLEN) {
2093 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2094 		freemsg(mp);
2095 		return (NULL);
2096 	}
2097 
2098 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2099 
2100 	switch (mldh->mld_type) {
2101 	case MLD_LISTENER_QUERY:
2102 		/*
2103 		 * packet length differentiates between v1 and v2.  v1
2104 		 * query should be exactly 24 octets long; v2 is >= 28.
2105 		 */
2106 		if ((mldlen == MLD_MINLEN) ||
2107 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2108 			next = mld_query_in(mldh, ill);
2109 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2110 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2111 		} else {
2112 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2113 			freemsg(mp);
2114 			return (NULL);
2115 		}
2116 		if (next == 0) {
2117 			return (mp);
2118 		}
2119 
2120 		if (next != INFINITY)
2121 			mld_start_timers(next, ipst);
2122 		break;
2123 
2124 	case MLD_LISTENER_REPORT:
2125 		/*
2126 		 * For fast leave to work, we have to know that we are the
2127 		 * last person to send a report for this group.  Reports
2128 		 * generated by us are looped back since we could potentially
2129 		 * be a multicast router, so discard reports sourced by me.
2130 		 */
2131 		mutex_enter(&ill->ill_lock);
2132 		for (ipif = ill->ill_ipif; ipif != NULL;
2133 		    ipif = ipif->ipif_next) {
2134 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2135 			    &ip6h->ip6_src)) {
2136 				if (ip_debug > 1) {
2137 					char    buf1[INET6_ADDRSTRLEN];
2138 
2139 					(void) mi_strlog(ill->ill_rq,
2140 					    1,
2141 					    SL_TRACE,
2142 					    "mld_input: we are only "
2143 					    "member src %s\n",
2144 					    inet_ntop(AF_INET6, &ip6h->ip6_src,
2145 					    buf1, sizeof (buf1)));
2146 				}
2147 				mutex_exit(&ill->ill_lock);
2148 				return (mp);
2149 			}
2150 		}
2151 		mutex_exit(&ill->ill_lock);
2152 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2153 
2154 		v6group_ptr = &mldh->mld_addr;
2155 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2156 			BUMP_MIB(ill->ill_icmp6_mib,
2157 			    ipv6IfIcmpInGroupMembBadReports);
2158 			freemsg(mp);
2159 			return (NULL);
2160 		}
2161 
2162 
2163 		/*
2164 		 * If we belong to the group being reported, and we are a
2165 		 * 'Delaying member' per the RFC terminology, stop our timer
2166 		 * for that group and 'clear flag' i.e. mark ilm_state as
2167 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2168 		 * membership entries for the same group address (one per zone)
2169 		 * so we need to walk the ill_ilm list.
2170 		 */
2171 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2172 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2173 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2174 				continue;
2175 			BUMP_MIB(ill->ill_icmp6_mib,
2176 			    ipv6IfIcmpInGroupMembOurReports);
2177 
2178 			ilm->ilm_timer = INFINITY;
2179 			ilm->ilm_state = IGMP_OTHERMEMBER;
2180 		}
2181 		rw_exit(&ill->ill_mcast_lock);
2182 		/*
2183 		 * No packets have been sent above - no
2184 		 * ill_mcast_send_queued is needed.
2185 		 */
2186 		ill_mcast_timer_start(ill->ill_ipst);
2187 		break;
2188 
2189 	case MLD_LISTENER_REDUCTION:
2190 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2191 		break;
2192 	}
2193 	return (mp);
2194 }
2195 
2196 /*
2197  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2198  * (non-zero, unsigned) timer value to be set on success.
2199  */
2200 static uint_t
2201 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2202 {
2203 	ilm_t	*ilm;
2204 	int	timer;
2205 	uint_t	next, current;
2206 	in6_addr_t *v6group;
2207 
2208 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2209 
2210 	/*
2211 	 * In the MLD specification, there are 3 states and a flag.
2212 	 *
2213 	 * In Non-Listener state, we simply don't have a membership record.
2214 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2215 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2216 	 * INFINITY)
2217 	 *
2218 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2219 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2220 	 * if I sent the last report.
2221 	 */
2222 	v6group = &mldh->mld_addr;
2223 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2224 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2225 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2226 		return (0);
2227 	}
2228 
2229 	/* Need to do compatibility mode checking */
2230 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2231 	ill->ill_mcast_v1_time = 0;
2232 	ill->ill_mcast_v1_tset = 1;
2233 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2234 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2235 		    "MLD_V1_ROUTER\n", ill->ill_name));
2236 		atomic_inc_16(&ill->ill_ifptr->illif_mcast_v1);
2237 		ill->ill_mcast_type = MLD_V1_ROUTER;
2238 	}
2239 
2240 	timer = (int)ntohs(mldh->mld_maxdelay);
2241 	if (ip_debug > 1) {
2242 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2243 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2244 		    timer, (int)mldh->mld_type);
2245 	}
2246 
2247 	/*
2248 	 * -Start the timers in all of our membership records for
2249 	 * the physical interface on which the query arrived,
2250 	 * excl:
2251 	 *	1.  those that belong to the "all hosts" group,
2252 	 *	2.  those with 0 scope, or 1 node-local scope.
2253 	 *
2254 	 * -Restart any timer that is already running but has a value
2255 	 * longer that the requested timeout.
2256 	 * -Use the value specified in the query message as the
2257 	 * maximum timeout.
2258 	 */
2259 	next = INFINITY;
2260 
2261 	current = CURRENT_MSTIME;
2262 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2263 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2264 
2265 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2266 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2267 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2268 			continue;
2269 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2270 		    &ipv6_all_hosts_mcast)) &&
2271 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2272 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2273 			if (timer == 0) {
2274 				/* Respond immediately */
2275 				ilm->ilm_timer = INFINITY;
2276 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2277 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2278 				break;
2279 			}
2280 			if (ilm->ilm_timer > timer) {
2281 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2282 				if (ilm->ilm_timer < next)
2283 					next = ilm->ilm_timer;
2284 				ilm->ilm_timer += current;
2285 			}
2286 			break;
2287 		}
2288 	}
2289 	rw_exit(&ill->ill_mcast_lock);
2290 	/* Send any deferred/queued IP packets */
2291 	ill_mcast_send_queued(ill);
2292 	ill_mcast_timer_start(ill->ill_ipst);
2293 
2294 	return (next);
2295 }
2296 
2297 /*
2298  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2299  * returns the appropriate (non-zero, unsigned) timer value (which may
2300  * be INFINITY) to be set.
2301  */
2302 static uint_t
2303 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2304 {
2305 	ilm_t	*ilm;
2306 	in6_addr_t *v6group, *src_array;
2307 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2308 	uint8_t	qrv;
2309 
2310 	v6group = &mld2q->mld2q_addr;
2311 	numsrc = ntohs(mld2q->mld2q_numsrc);
2312 
2313 	/* make sure numsrc matches packet size */
2314 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2315 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2316 		return (0);
2317 	}
2318 	src_array = (in6_addr_t *)&mld2q[1];
2319 
2320 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2321 
2322 	/* extract Maximum Response Delay from code in header */
2323 	mrd = ntohs(mld2q->mld2q_mxrc);
2324 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2325 		uint_t hdrval, mant, exp;
2326 		hdrval = mrd;
2327 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2328 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2329 		mrd = (mant | 0x1000) << (exp + 3);
2330 	}
2331 	if (mrd == 0)
2332 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2333 
2334 	MCAST_RANDOM_DELAY(delay, mrd);
2335 	next = (unsigned)INFINITY;
2336 	current = CURRENT_MSTIME;
2337 
2338 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2339 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2340 	else
2341 		ill->ill_mcast_rv = qrv;
2342 
2343 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2344 		uint_t mant, exp;
2345 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2346 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2347 		qqi = (mant | 0x10) << (exp + 3);
2348 	}
2349 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2350 
2351 	/*
2352 	 * If we have a pending general query response that's scheduled
2353 	 * sooner than the delay we calculated for this response, then
2354 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2355 	 */
2356 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2357 	if (ill->ill_global_timer < (current + delay)) {
2358 		rw_exit(&ill->ill_mcast_lock);
2359 		return (next);
2360 	}
2361 
2362 	/*
2363 	 * Now take action depending on query type: general,
2364 	 * group specific, or group/source specific.
2365 	 */
2366 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2367 		/*
2368 		 * general query
2369 		 * We know global timer is either not running or is
2370 		 * greater than our calculated delay, so reset it to
2371 		 * our delay (random value in range [0, response time])
2372 		 */
2373 		ill->ill_global_timer = current + delay;
2374 		next = delay;
2375 	} else {
2376 		/* group or group/source specific query */
2377 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2378 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2379 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2380 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2381 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2382 				continue;
2383 
2384 			/*
2385 			 * If the query is group specific or we have a
2386 			 * pending group specific query, the response is
2387 			 * group specific (pending sources list should be
2388 			 * empty).  Otherwise, need to update the pending
2389 			 * sources list for the group and source specific
2390 			 * response.
2391 			 */
2392 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2393 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2394 group_query:
2395 				FREE_SLIST(ilm->ilm_pendsrcs);
2396 				ilm->ilm_pendsrcs = NULL;
2397 			} else {
2398 				boolean_t overflow;
2399 				slist_t *pktl;
2400 				if (numsrc > MAX_FILTER_SIZE ||
2401 				    (ilm->ilm_pendsrcs == NULL &&
2402 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2403 					/*
2404 					 * We've been sent more sources than
2405 					 * we can deal with; or we can't deal
2406 					 * with a source list at all. Revert
2407 					 * to a group specific query.
2408 					 */
2409 					goto group_query;
2410 				}
2411 				if ((pktl = l_alloc()) == NULL)
2412 					goto group_query;
2413 				pktl->sl_numsrc = numsrc;
2414 				for (i = 0; i < numsrc; i++)
2415 					pktl->sl_addr[i] = src_array[i];
2416 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2417 				    &overflow);
2418 				l_free(pktl);
2419 				if (overflow)
2420 					goto group_query;
2421 			}
2422 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2423 			    INFINITY : (ilm->ilm_timer - current);
2424 			/* set timer to soonest value */
2425 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2426 			if (ilm->ilm_timer < next)
2427 				next = ilm->ilm_timer;
2428 			ilm->ilm_timer += current;
2429 			break;
2430 		}
2431 	}
2432 	rw_exit(&ill->ill_mcast_lock);
2433 	/*
2434 	 * No packets have been sent above - no
2435 	 * ill_mcast_send_queued is needed.
2436 	 */
2437 	ill_mcast_timer_start(ill->ill_ipst);
2438 
2439 	return (next);
2440 }
2441 
2442 /*
2443  * Send MLDv1 response packet with hoplimit 1
2444  */
2445 static void
2446 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2447 {
2448 	mblk_t		*mp;
2449 	mld_hdr_t	*mldh;
2450 	ip6_t		*ip6h;
2451 	ip6_hbh_t	*ip6hbh;
2452 	struct ip6_opt_router	*ip6router;
2453 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2454 	ill_t		*ill = ilm->ilm_ill;
2455 
2456 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2457 
2458 	/*
2459 	 * We need to place a router alert option in this packet.  The length
2460 	 * of the options must be a multiple of 8.  The hbh option header is 2
2461 	 * bytes followed by the 4 byte router alert option.  That leaves
2462 	 * 2 bytes of pad for a total of 8 bytes.
2463 	 */
2464 	const int	router_alert_length = 8;
2465 
2466 	ASSERT(ill->ill_isv6);
2467 
2468 	size += router_alert_length;
2469 	mp = allocb(size, BPRI_HI);
2470 	if (mp == NULL)
2471 		return;
2472 	bzero(mp->b_rptr, size);
2473 	mp->b_wptr = mp->b_rptr + size;
2474 
2475 	ip6h = (ip6_t *)mp->b_rptr;
2476 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2477 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2478 	/*
2479 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2480 	 * above will pad between ip6router and mld.
2481 	 */
2482 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2483 
2484 	mldh->mld_type = type;
2485 	mldh->mld_addr = ilm->ilm_v6addr;
2486 
2487 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2488 	ip6router->ip6or_len = 2;
2489 	ip6router->ip6or_value[0] = 0;
2490 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2491 
2492 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2493 	ip6hbh->ip6h_len = 0;
2494 
2495 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2496 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2497 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2498 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2499 	if (v6addr == NULL)
2500 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2501 	else
2502 		ip6h->ip6_dst = *v6addr;
2503 
2504 	ip6h->ip6_src = ipv6_all_zeros;
2505 	/*
2506 	 * Prepare for checksum by putting icmp length in the icmp
2507 	 * checksum field. The checksum is calculated in ip_output.
2508 	 */
2509 	mldh->mld_cksum = htons(sizeof (*mldh));
2510 
2511 	ill_mcast_queue(ill, mp);
2512 }
2513 
2514 /*
2515  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2516  * report will contain one multicast address record for each element of
2517  * reclist.  If this causes packet length to exceed ill->ill_mc_mtu,
2518  * multiple reports are sent.  reclist is assumed to be made up of
2519  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2520  */
2521 static void
2522 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2523 {
2524 	mblk_t		*mp;
2525 	mld2r_t		*mld2r;
2526 	mld2mar_t	*mld2mar;
2527 	in6_addr_t	*srcarray;
2528 	ip6_t		*ip6h;
2529 	ip6_hbh_t	*ip6hbh;
2530 	struct ip6_opt_router	*ip6router;
2531 	size_t		size, optlen, padlen, icmpsize, rsize;
2532 	int		i, numrec, more_src_cnt;
2533 	mrec_t		*rp, *cur_reclist;
2534 	mrec_t		*next_reclist = reclist;
2535 	boolean_t	morepkts;
2536 
2537 	/* If there aren't any records, there's nothing to send */
2538 	if (reclist == NULL)
2539 		return;
2540 
2541 	ASSERT(ill->ill_isv6);
2542 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2543 
2544 	/*
2545 	 * Total option length (optlen + padlen) must be a multiple of
2546 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2547 	 * length will be 8.  Assert this in case anything ever changes.
2548 	 */
2549 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2550 	ASSERT(optlen <= 8);
2551 	padlen = 8 - optlen;
2552 nextpkt:
2553 	icmpsize = sizeof (mld2r_t);
2554 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2555 	morepkts = B_FALSE;
2556 	more_src_cnt = 0;
2557 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2558 	    rp = rp->mrec_next, numrec++) {
2559 		rsize = sizeof (mld2mar_t) +
2560 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2561 		if (size + rsize > ill->ill_mc_mtu) {
2562 			if (rp == cur_reclist) {
2563 				/*
2564 				 * If the first mrec we looked at is too big
2565 				 * to fit in a single packet (i.e the source
2566 				 * list is too big), we must either truncate
2567 				 * the list (if TO_EX or IS_EX), or send
2568 				 * multiple reports for the same group (all
2569 				 * other types).
2570 				 */
2571 				int srcspace, srcsperpkt;
2572 				srcspace = ill->ill_mc_mtu -
2573 				    (size + sizeof (mld2mar_t));
2574 
2575 				/*
2576 				 * Skip if there's not even enough room in
2577 				 * a single packet to send something useful.
2578 				 */
2579 				if (srcspace <= sizeof (in6_addr_t))
2580 					continue;
2581 
2582 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2583 				/*
2584 				 * Increment icmpsize and size, because we will
2585 				 * be sending a record for the mrec we're
2586 				 * looking at now.
2587 				 */
2588 				rsize = sizeof (mld2mar_t) +
2589 				    (srcsperpkt * sizeof (in6_addr_t));
2590 				icmpsize += rsize;
2591 				size += rsize;
2592 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2593 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2594 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2595 					if (rp->mrec_next == NULL) {
2596 						/* no more packets to send */
2597 						break;
2598 					} else {
2599 						/*
2600 						 * more packets, but we're
2601 						 * done with this mrec.
2602 						 */
2603 						next_reclist = rp->mrec_next;
2604 					}
2605 				} else {
2606 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2607 					    - srcsperpkt;
2608 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2609 					/*
2610 					 * We'll fix up this mrec (remove the
2611 					 * srcs we've already sent) before
2612 					 * returning to nextpkt above.
2613 					 */
2614 					next_reclist = rp;
2615 				}
2616 			} else {
2617 				next_reclist = rp;
2618 			}
2619 			morepkts = B_TRUE;
2620 			break;
2621 		}
2622 		icmpsize += rsize;
2623 		size += rsize;
2624 	}
2625 
2626 	mp = allocb(size, BPRI_HI);
2627 	if (mp == NULL)
2628 		goto free_reclist;
2629 	bzero(mp->b_rptr, size);
2630 	mp->b_wptr = mp->b_rptr + size;
2631 
2632 	ip6h = (ip6_t *)mp->b_rptr;
2633 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2634 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2635 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2636 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2637 
2638 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2639 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2640 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2641 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2642 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2643 	ip6h->ip6_src = ipv6_all_zeros;
2644 
2645 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2646 	/*
2647 	 * ip6h_len is the number of 8-byte words, not including the first
2648 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2649 	 */
2650 	ip6hbh->ip6h_len = 0;
2651 
2652 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2653 	ip6router->ip6or_len = 2;
2654 	ip6router->ip6or_value[0] = 0;
2655 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2656 
2657 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2658 	mld2r->mld2r_nummar = htons(numrec);
2659 	/*
2660 	 * Prepare for the checksum by putting icmp length in the icmp
2661 	 * checksum field. The checksum is calculated in ip_output_simple.
2662 	 */
2663 	mld2r->mld2r_cksum = htons(icmpsize);
2664 
2665 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2666 		mld2mar->mld2mar_type = rp->mrec_type;
2667 		mld2mar->mld2mar_auxlen = 0;
2668 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2669 		mld2mar->mld2mar_group = rp->mrec_group;
2670 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2671 
2672 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2673 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2674 
2675 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2676 	}
2677 
2678 	ill_mcast_queue(ill, mp);
2679 
2680 	if (morepkts) {
2681 		if (more_src_cnt > 0) {
2682 			int index, mvsize;
2683 			slist_t *sl = &next_reclist->mrec_srcs;
2684 			index = sl->sl_numsrc;
2685 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2686 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2687 			    mvsize);
2688 			sl->sl_numsrc = more_src_cnt;
2689 		}
2690 		goto nextpkt;
2691 	}
2692 
2693 free_reclist:
2694 	while (reclist != NULL) {
2695 		rp = reclist->mrec_next;
2696 		mi_free(reclist);
2697 		reclist = rp;
2698 	}
2699 }
2700 
2701 static mrec_t *
2702 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2703     mrec_t *next)
2704 {
2705 	mrec_t *rp;
2706 	int i;
2707 
2708 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2709 	    SLIST_IS_EMPTY(srclist))
2710 		return (next);
2711 
2712 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2713 	if (rp == NULL)
2714 		return (next);
2715 
2716 	rp->mrec_next = next;
2717 	rp->mrec_type = type;
2718 	rp->mrec_auxlen = 0;
2719 	rp->mrec_group = *grp;
2720 	if (srclist == NULL) {
2721 		rp->mrec_srcs.sl_numsrc = 0;
2722 	} else {
2723 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2724 		for (i = 0; i < srclist->sl_numsrc; i++)
2725 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2726 	}
2727 
2728 	return (rp);
2729 }
2730 
2731 /*
2732  * Set up initial retransmit state.  If memory cannot be allocated for
2733  * the source lists, simply create as much state as is possible; memory
2734  * allocation failures are considered one type of transient error that
2735  * the retransmissions are designed to overcome (and if they aren't
2736  * transient, there are bigger problems than failing to notify the
2737  * router about multicast group membership state changes).
2738  */
2739 static void
2740 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2741     slist_t *flist)
2742 {
2743 	/*
2744 	 * There are only three possibilities for rtype:
2745 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2746 	 *	  => rtype is ALLOW_NEW_SOURCES
2747 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2748 	 *	  => rtype is CHANGE_TO_EXCLUDE
2749 	 *	State change that involves a filter mode change
2750 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2751 	 */
2752 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2753 	    rtype == ALLOW_NEW_SOURCES);
2754 
2755 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2756 
2757 	switch (rtype) {
2758 	case CHANGE_TO_EXCLUDE:
2759 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2760 		CLEAR_SLIST(rtxp->rtx_allow);
2761 		COPY_SLIST(flist, rtxp->rtx_block);
2762 		break;
2763 	case ALLOW_NEW_SOURCES:
2764 	case CHANGE_TO_INCLUDE:
2765 		rtxp->rtx_fmode_cnt =
2766 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2767 		CLEAR_SLIST(rtxp->rtx_block);
2768 		COPY_SLIST(flist, rtxp->rtx_allow);
2769 		break;
2770 	}
2771 }
2772 
2773 /*
2774  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2775  * RFC 3376 section 5.1, covers three cases:
2776  *	* The current state change is a filter mode change
2777  *		Set filter mode retransmit counter; set retransmit allow or
2778  *		block list to new source list as appropriate, and clear the
2779  *		retransmit list that was not set; send TO_IN or TO_EX with
2780  *		new source list.
2781  *	* The current state change is a source list change, but the filter
2782  *	  mode retransmit counter is > 0
2783  *		Decrement filter mode retransmit counter; set retransmit
2784  *		allow or block list to  new source list as appropriate,
2785  *		and clear the retransmit list that was not set; send TO_IN
2786  *		or TO_EX with new source list.
2787  *	* The current state change is a source list change, and the filter
2788  *	  mode retransmit counter is 0.
2789  *		Merge existing rtx allow and block lists with new state:
2790  *		  rtx_allow = (new allow + rtx_allow) - new block
2791  *		  rtx_block = (new block + rtx_block) - new allow
2792  *		Send ALLOW and BLOCK records for new retransmit lists;
2793  *		decrement retransmit counter.
2794  *
2795  * As is the case for mcast_init_rtx(), memory allocation failures are
2796  * acceptable; we just create as much state as we can.
2797  */
2798 static mrec_t *
2799 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2800 {
	/*
	 * Fold the state change described by mreclist into the retransmit
	 * state kept in ilm->ilm_rtx, and hand back the record list that
	 * results.
	 *
	 *   ilm      - entry whose ilm_rtx state is updated; its ilm_ill,
	 *              ilm_fmode and ilm_v6addr fields are also read.
	 *   mreclist - records for the current state change; may be NULL,
	 *              in which case nothing is touched.
	 *   flist    - new source list; only used when a filter mode change
	 *              is still being retransmitted (rtx_fmode_cnt > 0).
	 *
	 * Return value:
	 *   - mreclist unchanged if it is NULL or starts with a TO_IN/TO_EX
	 *     record;
	 *   - mreclist reduced to a single rewritten TO_IN/TO_EX record if
	 *     rtx_fmode_cnt > 0;
	 *   - otherwise mreclist, or whatever mcast_bldmrec() returns when
	 *     one of the ALLOW/BLOCK records was missing from the input.
	 */
2801 	ill_t *ill;
2802 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2803 	mcast_record_t txtype;
2804 	mrec_t *rp, *rpnext, *rtnmrec;
2805 	boolean_t ovf;	/* filled in by l_union_in_a(); never read here */
2806 
2807 	ill = ilm->ilm_ill;
2808 
	/* No state change records: nothing to merge. */
2809 	if (mreclist == NULL)
2810 		return (mreclist);
2811 
2812 	/*
2813 	 * A filter mode change is indicated by a single mrec, which is
2814 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2815 	 * retransmit state as if this were an initial join.  There is
2816 	 * no change to the mrec list.
2817 	 */
2818 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2819 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2820 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2821 		    &mreclist->mrec_srcs);
2822 		return (mreclist);
2823 	}
2824 
2825 	/*
2826 	 * Only the source list has changed
2827 	 */
	/*
	 * Restart the retransmit count from the ill's ill_mcast_rv value.
	 * NOTE(review): ill_mcast_rv is presumably the robustness variable;
	 * confirm against the ill_t definition.
	 */
2828 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2829 	if (rtxp->rtx_fmode_cnt > 0) {
2830 		/* but we're still sending filter mode change reports */
2831 		rtxp->rtx_fmode_cnt--;
		/*
		 * Keep only the retransmit list that matches the current
		 * filter mode, loaded with the new source list; the other
		 * list is cleared.
		 */
2832 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
2833 			CLEAR_SLIST(rtxp->rtx_block);
2834 			COPY_SLIST(flist, rtxp->rtx_allow);
2835 			txtype = CHANGE_TO_INCLUDE;
2836 		} else {
2837 			CLEAR_SLIST(rtxp->rtx_allow);
2838 			COPY_SLIST(flist, rtxp->rtx_block);
2839 			txtype = CHANGE_TO_EXCLUDE;
2840 		}
2841 		/* overwrite first mrec with new info */
2842 		mreclist->mrec_type = txtype;
2843 		l_copy(flist, &mreclist->mrec_srcs);
2844 		/* then free any remaining mrecs */
		/* (rpnext is saved first because rp is gone after mi_free()) */
2845 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
2846 			rpnext = rp->mrec_next;
2847 			mi_free(rp);
2848 		}
2849 		mreclist->mrec_next = NULL;
2850 		rtnmrec = mreclist;
2851 	} else {
2852 		mrec_t *allow_mrec, *block_mrec;
2853 		/*
2854 		 * Just send the source change reports; but we need to
2855 		 * recalculate the ALLOW and BLOCK lists based on previous
2856 		 * state and new changes.
2857 		 */
2858 		rtnmrec = mreclist;
2859 		allow_mrec = block_mrec = NULL;
		/*
		 * Locate the ALLOW and BLOCK records in the list.  If a type
		 * appears more than once, the last record of that type is the
		 * one remembered.  Since mreclist is non-NULL, at least one
		 * of the two pointers is set when the loop finishes.
		 */
2860 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
2861 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
2862 			    rp->mrec_type == BLOCK_OLD_SOURCES);
2863 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
2864 				allow_mrec = rp;
2865 			else
2866 				block_mrec = rp;
2867 		}
2868 		/*
2869 		 * Perform calculations:
2870 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
2871 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
2872 		 *
2873 		 * Each calc requires two steps, for example:
2874 		 *   rtx_allow = rtx_allow - mrec_block;
2875 		 *   new_allow = mrec_allow + rtx_allow;
2876 		 *
2877 		 * Store results in mrec lists, and then copy into rtx lists.
2878 		 * We do it in this order in case the rtx list hasn't been
2879 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay.
2880 		 * Overflows are also okay.
2881 		 */
		/* Step 1a: rtx_allow = rtx_allow - mrec_block */
2882 		if (block_mrec != NULL) {
2883 			l_difference_in_a(rtxp->rtx_allow,
2884 			    &block_mrec->mrec_srcs);
2885 		}
		/*
		 * Step 1b: rtx_block = rtx_block - mrec_allow, then
		 * step 2a: mrec_allow = mrec_allow + rtx_allow.
		 * Both subtractions are done before the block union below,
		 * so each union sees an rtx list that has already been
		 * pruned.
		 */
2886 		if (allow_mrec != NULL) {
2887 			l_difference_in_a(rtxp->rtx_block,
2888 			    &allow_mrec->mrec_srcs);
2889 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
2890 			    &ovf);
2891 		}
		/*
		 * Step 2b: mrec_block = mrec_block + rtx_block, then save the
		 * result as the new rtx_block.  With no BLOCK record in the
		 * input (so allow_mrec must be set), mcast_bldmrec() is asked
		 * for one from the pruned rtx_block list.
		 * NOTE(review): presumably mcast_bldmrec() links the new
		 * record ahead of the list passed as its last argument and
		 * returns that list untouched when there is nothing to add;
		 * confirm against its definition.
		 */
2892 		if (block_mrec != NULL) {
2893 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
2894 			    &ovf);
2895 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
2896 		} else {
2897 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
2898 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
2899 		}
		/*
		 * Same for ALLOW: save the merged list as the new rtx_allow,
		 * or, with no ALLOW record in the input (so block_mrec must
		 * be set), ask mcast_bldmrec() for one from rtx_allow.  At
		 * most one of the two mcast_bldmrec() calls can run, because
		 * at least one record type was present in mreclist.
		 */
2900 		if (allow_mrec != NULL) {
2901 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
2902 		} else {
2903 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
2904 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
2905 		}
2906 	}
2907 
2908 	return (rtnmrec);
2909 }
2910