xref: /titanic_52/usr/src/uts/common/inet/ip/igmp.c (revision f936286c99fb83153e4bfd870eb2830a990a82c1)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Internet Group Management Protocol (IGMP) routines.
28  * Multicast Listener Discovery Protocol (MLD) routines.
29  *
30  * Written by Steve Deering, Stanford, May 1988.
31  * Modified by Rosen Sharma, Stanford, Aug 1994.
32  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
33  *
34  * MULTICAST 3.5.1.1
35  */
36 
37 #include <sys/types.h>
38 #include <sys/stream.h>
39 #include <sys/stropts.h>
40 #include <sys/strlog.h>
41 #include <sys/strsun.h>
42 #include <sys/systm.h>
43 #include <sys/ddi.h>
44 #include <sys/sunddi.h>
45 #include <sys/cmn_err.h>
46 #include <sys/atomic.h>
47 #include <sys/zone.h>
48 #include <sys/callb.h>
49 #include <sys/param.h>
50 #include <sys/socket.h>
51 #include <inet/ipclassifier.h>
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/igmp_var.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <inet/ipsec_impl.h>
59 
60 #include <inet/common.h>
61 #include <inet/mi.h>
62 #include <inet/nd.h>
63 #include <inet/tunables.h>
64 #include <inet/ip.h>
65 #include <inet/ip6.h>
66 #include <inet/ip_multi.h>
67 #include <inet/ip_listutils.h>
68 
69 #include <netinet/igmp.h>
70 #include <inet/ip_ndp.h>
71 #include <inet/ip_if.h>
72 
73 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
74 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
75 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
76 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
77 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
78 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
79 static void	igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
80 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
81 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
82 		    slist_t *srclist, mrec_t *next);
83 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
84 		    mcast_record_t rtype, slist_t *flist);
85 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
86 
87 /*
88  * Macros used to do timer len conversions.  Timer values are always
89  * stored and passed to the timer functions as milliseconds; but the
90  * default values and values from the wire may not be.
91  *
92  * And yes, it's obscure, but decisecond is easier to abbreviate than
93  * "tenths of a second".
94  */
/* Tenths of a second -> milliseconds. */
#define	DSEC_TO_MSEC(dsec)	(100 * (dsec))
/* Whole seconds -> milliseconds. */
#define	SEC_TO_MSEC(sec)	(1000 * (sec))
97 
98 /*
99  * A running timer (scheduled thru timeout) can be cancelled if another
100  * timer with a shorter timeout value is scheduled before it has timed
101  * out.  When the shorter timer expires, the original timer is updated
102  * to account for the time elapsed while the shorter timer ran; but this
103  * does not take into account the amount of time already spent in timeout
104  * state before being preempted by the shorter timer, that is the time
105  * interval between time scheduled to time cancelled.  This can cause
106  * delays in sending out multicast membership reports.  To resolve this
107  * problem, wallclock time (absolute time) is used instead of deltas
108  * (relative time) to track timers.
109  *
 * The macro below gets the lbolt value, used for proper timer scheduling
 * and firing, so that multicast membership reports are sent on time.
 * The timer does not fire at exactly the time it was scheduled to fire;
 * a difference of a few milliseconds is observed.  An offset is used
 * to take care of the difference.
115  */
116 
/* Current lbolt, converted from ticks to milliseconds, as a uint_t. */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
/* Slack for timer firing jitter; presumably in milliseconds - confirm. */
#define	CURRENT_OFFSET	999
119 
120 /*
121  * The first multicast join will trigger the igmp timers / mld timers
122  * The unit for next is milliseconds.
123  */
124 void
125 igmp_start_timers(unsigned next, ip_stack_t *ipst)
126 {
127 	int	time_left;
128 	int	ret;
129 	timeout_id_t id;
130 
131 	ASSERT(next != 0 && next != INFINITY);
132 
133 	mutex_enter(&ipst->ips_igmp_timer_lock);
134 
135 	if (ipst->ips_igmp_timer_setter_active) {
136 		/*
137 		 * Serialize timer setters, one at a time. If the
138 		 * timer is currently being set by someone,
139 		 * just record the next time when it has to be
140 		 * invoked and return. The current setter will
141 		 * take care.
142 		 */
143 		ipst->ips_igmp_time_to_next =
144 		    MIN(ipst->ips_igmp_time_to_next, next);
145 		mutex_exit(&ipst->ips_igmp_timer_lock);
146 		return;
147 	} else {
148 		ipst->ips_igmp_timer_setter_active = B_TRUE;
149 	}
150 	if (ipst->ips_igmp_timeout_id == 0) {
151 		/*
152 		 * The timer is inactive. We need to start a timer if we haven't
153 		 * been asked to quiesce.
154 		 */
155 		ipst->ips_igmp_time_to_next = next;
156 		if (ipst->ips_igmp_timer_quiesce != B_TRUE) {
157 			ipst->ips_igmp_timeout_id =
158 			    timeout(igmp_timeout_handler, (void *)ipst,
159 			    MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
160 			ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
161 		}
162 		ipst->ips_igmp_timer_setter_active = B_FALSE;
163 		mutex_exit(&ipst->ips_igmp_timer_lock);
164 		return;
165 	}
166 
167 	/*
168 	 * The timer was scheduled sometime back for firing in
169 	 * 'igmp_time_to_next' ms and is active. We need to
170 	 * reschedule the timeout if the new 'next' will happen
171 	 * earlier than the currently scheduled timeout
172 	 */
173 	time_left = ipst->ips_igmp_timer_scheduled_last +
174 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
175 	if (time_left < MSEC_TO_TICK(next)) {
176 		ipst->ips_igmp_timer_setter_active = B_FALSE;
177 		mutex_exit(&ipst->ips_igmp_timer_lock);
178 		return;
179 	}
180 	id = ipst->ips_igmp_timeout_id;
181 
182 	mutex_exit(&ipst->ips_igmp_timer_lock);
183 	ret = untimeout(id);
184 	mutex_enter(&ipst->ips_igmp_timer_lock);
185 	/*
186 	 * The timeout was cancelled, or the timeout handler
187 	 * completed, while we were blocked in the untimeout.
188 	 * No other thread could have set the timer meanwhile
189 	 * since we serialized all the timer setters. Thus
190 	 * no timer is currently active nor executing nor will
191 	 * any timer fire in the future. We start the timer now
192 	 * if needed.
193 	 */
194 	if (ret == -1) {
195 		ASSERT(ipst->ips_igmp_timeout_id == 0);
196 	} else {
197 		ASSERT(ipst->ips_igmp_timeout_id != 0);
198 		ipst->ips_igmp_timeout_id = 0;
199 	}
200 	if (ipst->ips_igmp_time_to_next != 0 &&
201 	    ipst->ips_igmp_timer_quiesce != B_TRUE) {
202 		ipst->ips_igmp_time_to_next =
203 		    MIN(ipst->ips_igmp_time_to_next, next);
204 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
205 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
206 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
207 	}
208 	ipst->ips_igmp_timer_setter_active = B_FALSE;
209 	mutex_exit(&ipst->ips_igmp_timer_lock);
210 }
211 
212 /*
213  * mld_start_timers:
214  * The unit for next is milliseconds.
215  */
216 void
217 mld_start_timers(unsigned next, ip_stack_t *ipst)
218 {
219 	int	time_left;
220 	int	ret;
221 	timeout_id_t id;
222 
223 	ASSERT(next != 0 && next != INFINITY);
224 
225 	mutex_enter(&ipst->ips_mld_timer_lock);
226 	if (ipst->ips_mld_timer_setter_active) {
227 		/*
228 		 * Serialize timer setters, one at a time. If the
229 		 * timer is currently being set by someone,
230 		 * just record the next time when it has to be
231 		 * invoked and return. The current setter will
232 		 * take care.
233 		 */
234 		ipst->ips_mld_time_to_next =
235 		    MIN(ipst->ips_mld_time_to_next, next);
236 		mutex_exit(&ipst->ips_mld_timer_lock);
237 		return;
238 	} else {
239 		ipst->ips_mld_timer_setter_active = B_TRUE;
240 	}
241 	if (ipst->ips_mld_timeout_id == 0) {
242 		/*
243 		 * The timer is inactive. We need to start a timer, if we
244 		 * haven't been asked to quiesce.
245 		 */
246 		ipst->ips_mld_time_to_next = next;
247 		if (ipst->ips_mld_timer_quiesce != B_TRUE) {
248 			ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
249 			    (void *)ipst,
250 			    MSEC_TO_TICK(ipst->ips_mld_time_to_next));
251 			ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
252 		}
253 		ipst->ips_mld_timer_setter_active = B_FALSE;
254 		mutex_exit(&ipst->ips_mld_timer_lock);
255 		return;
256 	}
257 
258 	/*
259 	 * The timer was scheduled sometime back for firing in
260 	 * 'igmp_time_to_next' ms and is active. We need to
261 	 * reschedule the timeout if the new 'next' will happen
262 	 * earlier than the currently scheduled timeout
263 	 */
264 	time_left = ipst->ips_mld_timer_scheduled_last +
265 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
266 	if (time_left < MSEC_TO_TICK(next)) {
267 		ipst->ips_mld_timer_setter_active = B_FALSE;
268 		mutex_exit(&ipst->ips_mld_timer_lock);
269 		return;
270 	}
271 	id = ipst->ips_mld_timeout_id;
272 
273 	mutex_exit(&ipst->ips_mld_timer_lock);
274 	ret = untimeout(id);
275 	mutex_enter(&ipst->ips_mld_timer_lock);
276 	/*
277 	 * The timeout was cancelled, or the timeout handler
278 	 * completed, while we were blocked in the untimeout.
279 	 * No other thread could have set the timer meanwhile
280 	 * since we serialized all the timer setters. Thus
281 	 * no timer is currently active nor executing nor will
282 	 * any timer fire in the future. We start the timer now
283 	 * if needed.
284 	 */
285 	if (ret == -1) {
286 		ASSERT(ipst->ips_mld_timeout_id == 0);
287 	} else {
288 		ASSERT(ipst->ips_mld_timeout_id != 0);
289 		ipst->ips_mld_timeout_id = 0;
290 	}
291 	if (ipst->ips_mld_time_to_next != 0 &&
292 	    ipst->ips_mld_timer_quiesce == B_FALSE) {
293 		ipst->ips_mld_time_to_next =
294 		    MIN(ipst->ips_mld_time_to_next, next);
295 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
296 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
297 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
298 	}
299 	ipst->ips_mld_timer_setter_active = B_FALSE;
300 	mutex_exit(&ipst->ips_mld_timer_lock);
301 }
302 
303 /*
304  * igmp_input:
305  * Return NULL for a bad packet that is discarded here.
306  * Return mp if the message is OK and should be handed to "raw" receivers.
307  * Callers of igmp_input() may need to reinitialize variables that were copied
308  * from the mblk as this calls pullupmsg().
309  */
310 mblk_t *
311 igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
312 {
313 	igmpa_t 	*igmpa;
314 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
315 	int		iphlen, igmplen, mblklen;
316 	ilm_t 		*ilm;
317 	uint32_t	src, dst;
318 	uint32_t 	group;
319 	in6_addr_t	v6group;
320 	uint_t		next;
321 	ipif_t 		*ipif;
322 	ill_t		*ill = ira->ira_ill;
323 	ip_stack_t	*ipst = ill->ill_ipst;
324 
325 	ASSERT(!ill->ill_isv6);
326 	++ipst->ips_igmpstat.igps_rcv_total;
327 
328 	mblklen = MBLKL(mp);
329 	iphlen = ira->ira_ip_hdr_length;
330 	if (mblklen < 1 || mblklen < iphlen) {
331 		++ipst->ips_igmpstat.igps_rcv_tooshort;
332 		goto bad_pkt;
333 	}
334 	igmplen = ira->ira_pktlen - iphlen;
335 	/*
336 	 * Since msg sizes are more variable with v3, just pullup the
337 	 * whole thing now.
338 	 */
339 	if (MBLKL(mp) < (igmplen + iphlen)) {
340 		mblk_t *mp1;
341 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
342 			++ipst->ips_igmpstat.igps_rcv_tooshort;
343 			goto bad_pkt;
344 		}
345 		freemsg(mp);
346 		mp = mp1;
347 		ipha = (ipha_t *)(mp->b_rptr);
348 	}
349 
350 	/*
351 	 * Validate lengths
352 	 */
353 	if (igmplen < IGMP_MINLEN) {
354 		++ipst->ips_igmpstat.igps_rcv_tooshort;
355 		goto bad_pkt;
356 	}
357 
358 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
359 	src = ipha->ipha_src;
360 	dst = ipha->ipha_dst;
361 	if (ip_debug > 1)
362 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
363 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
364 		    (int)ntohl(src), (int)ntohl(dst),
365 		    ill->ill_name);
366 
367 	switch (igmpa->igmpa_type) {
368 	case IGMP_MEMBERSHIP_QUERY:
369 		/*
370 		 * packet length differentiates between v1/v2 and v3
371 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
372 		 */
373 		if ((igmplen == IGMP_MINLEN) ||
374 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
375 			next = igmp_query_in(ipha, igmpa, ill);
376 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
377 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
378 			    igmplen);
379 		} else {
380 			++ipst->ips_igmpstat.igps_rcv_tooshort;
381 			goto bad_pkt;
382 		}
383 		if (next == 0)
384 			goto bad_pkt;
385 
386 		if (next != INFINITY)
387 			igmp_start_timers(next, ipst);
388 
389 		break;
390 
391 	case IGMP_V1_MEMBERSHIP_REPORT:
392 	case IGMP_V2_MEMBERSHIP_REPORT:
393 		/*
394 		 * For fast leave to work, we have to know that we are the
395 		 * last person to send a report for this group. Reports
396 		 * generated by us are looped back since we could potentially
397 		 * be a multicast router, so discard reports sourced by me.
398 		 */
399 		mutex_enter(&ill->ill_lock);
400 		for (ipif = ill->ill_ipif; ipif != NULL;
401 		    ipif = ipif->ipif_next) {
402 			if (ipif->ipif_lcl_addr == src) {
403 				if (ip_debug > 1) {
404 					(void) mi_strlog(ill->ill_rq,
405 					    1,
406 					    SL_TRACE,
407 					    "igmp_input: we are only "
408 					    "member src 0x%x\n",
409 					    (int)ntohl(src));
410 				}
411 				mutex_exit(&ill->ill_lock);
412 				return (mp);
413 			}
414 		}
415 		mutex_exit(&ill->ill_lock);
416 
417 		++ipst->ips_igmpstat.igps_rcv_reports;
418 		group = igmpa->igmpa_group;
419 		if (!CLASSD(group)) {
420 			++ipst->ips_igmpstat.igps_rcv_badreports;
421 			goto bad_pkt;
422 		}
423 
424 		/*
425 		 * KLUDGE: if the IP source address of the report has an
426 		 * unspecified (i.e., zero) subnet number, as is allowed for
427 		 * a booting host, replace it with the correct subnet number
428 		 * so that a process-level multicast routing demon can
429 		 * determine which subnet it arrived from.  This is necessary
430 		 * to compensate for the lack of any way for a process to
431 		 * determine the arrival interface of an incoming packet.
432 		 *
433 		 * Requires that a copy of *this* message it passed up
434 		 * to the raw interface which is done by our caller.
435 		 */
436 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
437 			/* Pick the first ipif on this ill */
438 			mutex_enter(&ill->ill_lock);
439 			src = ill->ill_ipif->ipif_subnet;
440 			mutex_exit(&ill->ill_lock);
441 			ip1dbg(("igmp_input: changed src to 0x%x\n",
442 			    (int)ntohl(src)));
443 			ipha->ipha_src = src;
444 		}
445 
446 		/*
447 		 * If our ill has ILMs that belong to the group being
448 		 * reported, and we are a 'Delaying Member' in the RFC
449 		 * terminology, stop our timer for that group and 'clear
450 		 * flag' i.e. mark as IGMP_OTHERMEMBER.
451 		 */
452 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
453 		IN6_IPADDR_TO_V4MAPPED(group, &v6group);
454 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
455 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
456 				continue;
457 
458 			++ipst->ips_igmpstat.igps_rcv_ourreports;
459 			ilm->ilm_timer = INFINITY;
460 			ilm->ilm_state = IGMP_OTHERMEMBER;
461 		} /* for */
462 		rw_exit(&ill->ill_mcast_lock);
463 		ill_mcast_timer_start(ill->ill_ipst);
464 		break;
465 
466 	case IGMP_V3_MEMBERSHIP_REPORT:
467 		/*
468 		 * Currently nothing to do here; IGMP router is not
469 		 * implemented in ip, and v3 hosts don't pay attention
470 		 * to membership reports.
471 		 */
472 		break;
473 	}
474 	/*
475 	 * Pass all valid IGMP packets up to any process(es) listening
476 	 * on a raw IGMP socket. Do not free the packet.
477 	 */
478 	return (mp);
479 
480 bad_pkt:
481 	freemsg(mp);
482 	return (NULL);
483 }
484 
485 static uint_t
486 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
487 {
488 	ilm_t	*ilm;
489 	int	timer;
490 	uint_t	next, current;
491 	ip_stack_t	 *ipst;
492 
493 	ipst = ill->ill_ipst;
494 	++ipst->ips_igmpstat.igps_rcv_queries;
495 
496 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
497 	/*
498 	 * In the IGMPv2 specification, there are 3 states and a flag.
499 	 *
500 	 * In Non-Member state, we simply don't have a membership record.
501 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
502 	 * < INFINITY).  In Idle Member state, our timer is not running
503 	 * (ilm->ilm_timer == INFINITY).
504 	 *
505 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
506 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
507 	 * if I sent the last report.
508 	 */
509 	if ((igmpa->igmpa_code == 0) ||
510 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
511 		/*
512 		 * Query from an old router.
513 		 * Remember that the querier on this interface is old,
514 		 * and set the timer to the value in RFC 1112.
515 		 */
516 		ill->ill_mcast_v1_time = 0;
517 		ill->ill_mcast_v1_tset = 1;
518 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
519 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
520 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
521 			atomic_inc_16(&ill->ill_ifptr->illif_mcast_v1);
522 			ill->ill_mcast_type = IGMP_V1_ROUTER;
523 		}
524 
525 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
526 
527 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
528 		    igmpa->igmpa_group != 0) {
529 			++ipst->ips_igmpstat.igps_rcv_badqueries;
530 			rw_exit(&ill->ill_mcast_lock);
531 			ill_mcast_timer_start(ill->ill_ipst);
532 			return (0);
533 		}
534 
535 	} else {
536 		in_addr_t group;
537 
538 		/*
539 		 * Query from a new router
540 		 * Simply do a validity check
541 		 */
542 		group = igmpa->igmpa_group;
543 		if (group != 0 && (!CLASSD(group))) {
544 			++ipst->ips_igmpstat.igps_rcv_badqueries;
545 			rw_exit(&ill->ill_mcast_lock);
546 			ill_mcast_timer_start(ill->ill_ipst);
547 			return (0);
548 		}
549 
550 		/*
551 		 * Switch interface state to v2 on receipt of a v2 query
552 		 * ONLY IF current state is v3.  Let things be if current
553 		 * state if v1 but do reset the v2-querier-present timer.
554 		 */
555 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
556 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
557 			    "to IGMP_V2_ROUTER", ill->ill_name));
558 			atomic_inc_16(&ill->ill_ifptr->illif_mcast_v2);
559 			ill->ill_mcast_type = IGMP_V2_ROUTER;
560 		}
561 		ill->ill_mcast_v2_time = 0;
562 		ill->ill_mcast_v2_tset = 1;
563 
564 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
565 	}
566 
567 	if (ip_debug > 1) {
568 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
569 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
570 		    (int)ntohs(igmpa->igmpa_code),
571 		    (int)ntohs(igmpa->igmpa_type));
572 	}
573 
574 	/*
575 	 * -Start the timers in all of our membership records
576 	 *  for the physical interface on which the query
577 	 *  arrived, excluding those that belong to the "all
578 	 *  hosts" group (224.0.0.1).
579 	 *
580 	 * -Restart any timer that is already running but has
581 	 *  a value longer than the requested timeout.
582 	 *
583 	 * -Use the value specified in the query message as
584 	 *  the maximum timeout.
585 	 */
586 	next = (unsigned)INFINITY;
587 
588 	current = CURRENT_MSTIME;
589 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
590 
591 		/*
592 		 * A multicast router joins INADDR_ANY address
593 		 * to enable promiscuous reception of all
594 		 * mcasts from the interface. This INADDR_ANY
595 		 * is stored in the ilm_v6addr as V6 unspec addr
596 		 */
597 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
598 			continue;
599 		if (ilm->ilm_addr == htonl(INADDR_ANY))
600 			continue;
601 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
602 		    (igmpa->igmpa_group == 0) ||
603 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
604 			if (ilm->ilm_timer > timer) {
605 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
606 				if (ilm->ilm_timer < next)
607 					next = ilm->ilm_timer;
608 				ilm->ilm_timer += current;
609 			}
610 		}
611 	}
612 	rw_exit(&ill->ill_mcast_lock);
613 	/*
614 	 * No packets have been sent above - no
615 	 * ill_mcast_send_queued is needed.
616 	 */
617 	ill_mcast_timer_start(ill->ill_ipst);
618 
619 	return (next);
620 }
621 
622 static uint_t
623 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
624 {
625 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
626 	uint_t		current;
627 	ilm_t		*ilm;
628 	ipaddr_t	*src_array;
629 	uint8_t		qrv;
630 	ip_stack_t	 *ipst;
631 
632 	ipst = ill->ill_ipst;
633 	/* make sure numsrc matches packet size */
634 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
635 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
636 		++ipst->ips_igmpstat.igps_rcv_tooshort;
637 		return (0);
638 	}
639 	src_array = (ipaddr_t *)&igmp3qa[1];
640 
641 	++ipst->ips_igmpstat.igps_rcv_queries;
642 
643 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
644 
645 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
646 		uint_t hdrval, mant, exp;
647 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
648 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
649 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
650 		mrd = (mant | 0x10) << (exp + 3);
651 	}
652 	if (mrd == 0)
653 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
654 	timer = DSEC_TO_MSEC(mrd);
655 	MCAST_RANDOM_DELAY(delay, timer);
656 	next = (unsigned)INFINITY;
657 	current = CURRENT_MSTIME;
658 
659 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
660 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
661 	else
662 		ill->ill_mcast_rv = qrv;
663 
664 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
665 		uint_t hdrval, mant, exp;
666 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
667 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
668 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
669 		qqi = (mant | 0x10) << (exp + 3);
670 	}
671 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
672 
673 	/*
674 	 * If we have a pending general query response that's scheduled
675 	 * sooner than the delay we calculated for this response, then
676 	 * no action is required (RFC3376 section 5.2 rule 1)
677 	 */
678 	if (ill->ill_global_timer < (current + delay)) {
679 		rw_exit(&ill->ill_mcast_lock);
680 		ill_mcast_timer_start(ill->ill_ipst);
681 		return (next);
682 	}
683 
684 	/*
685 	 * Now take action depending upon query type:
686 	 * general, group specific, or group/source specific.
687 	 */
688 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
689 		/*
690 		 * general query
691 		 * We know global timer is either not running or is
692 		 * greater than our calculated delay, so reset it to
693 		 * our delay (random value in range [0, response time]).
694 		 */
695 		ill->ill_global_timer =  current + delay;
696 		next = delay;
697 	} else {
698 		/* group or group/source specific query */
699 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
700 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
701 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
702 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
703 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
704 				continue;
705 			/*
706 			 * If the query is group specific or we have a
707 			 * pending group specific query, the response is
708 			 * group specific (pending sources list should be
709 			 * empty).  Otherwise, need to update the pending
710 			 * sources list for the group and source specific
711 			 * response.
712 			 */
713 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
714 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
715 group_query:
716 				FREE_SLIST(ilm->ilm_pendsrcs);
717 				ilm->ilm_pendsrcs = NULL;
718 			} else {
719 				boolean_t overflow;
720 				slist_t *pktl;
721 				if (numsrc > MAX_FILTER_SIZE ||
722 				    (ilm->ilm_pendsrcs == NULL &&
723 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
724 					/*
725 					 * We've been sent more sources than
726 					 * we can deal with; or we can't deal
727 					 * with a source list at all.  Revert
728 					 * to a group specific query.
729 					 */
730 					goto group_query;
731 				}
732 				if ((pktl = l_alloc()) == NULL)
733 					goto group_query;
734 				pktl->sl_numsrc = numsrc;
735 				for (i = 0; i < numsrc; i++)
736 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
737 					    &(pktl->sl_addr[i]));
738 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
739 				    &overflow);
740 				l_free(pktl);
741 				if (overflow)
742 					goto group_query;
743 			}
744 
745 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
746 			    INFINITY : (ilm->ilm_timer - current);
747 			/* choose soonest timer */
748 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
749 			if (ilm->ilm_timer < next)
750 				next = ilm->ilm_timer;
751 			ilm->ilm_timer += current;
752 		}
753 	}
754 	rw_exit(&ill->ill_mcast_lock);
755 	/*
756 	 * No packets have been sent above - no
757 	 * ill_mcast_send_queued is needed.
758 	 */
759 	ill_mcast_timer_start(ill->ill_ipst);
760 
761 	return (next);
762 }
763 
764 /*
765  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
766  * and it gets sent after the lock is dropped.
767  */
768 void
769 igmp_joingroup(ilm_t *ilm)
770 {
771 	uint_t	timer;
772 	ill_t	*ill;
773 	ip_stack_t	*ipst = ilm->ilm_ipst;
774 
775 	ill = ilm->ilm_ill;
776 
777 	ASSERT(!ill->ill_isv6);
778 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
779 
780 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
781 		ilm->ilm_rtx.rtx_timer = INFINITY;
782 		ilm->ilm_state = IGMP_OTHERMEMBER;
783 	} else {
784 		ip1dbg(("Querier mode %d, sending report, group %x\n",
785 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
786 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
787 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
788 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
789 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
790 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
791 			mrec_t *rp;
792 			mcast_record_t rtype;
793 			/*
794 			 * The possible state changes we need to handle here:
795 			 *   Old State	New State	Report
796 			 *
797 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
798 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
799 			 *
800 			 * No need to send the BLOCK(0) report; ALLOW(X)
801 			 * is enough.
802 			 */
803 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
804 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
805 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
806 			    ilm->ilm_filter, NULL);
807 			igmpv3_sendrpt(ill, rp);
808 			/*
809 			 * Set up retransmission state.  Timer is set below,
810 			 * for both v3 and older versions.
811 			 */
812 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
813 			    ilm->ilm_filter);
814 		}
815 
816 		/* Set the ilm timer value */
817 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
818 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
819 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
820 		timer = ilm->ilm_rtx.rtx_timer;
821 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
822 		ilm->ilm_state = IGMP_IREPORTEDLAST;
823 
824 		/*
825 		 * We are holding ill_mcast_lock here and the timeout
826 		 * handler (igmp_timeout_handler_per_ill) acquires that
827 		 * lock. Hence we can't call igmp_start_timers since it could
828 		 * deadlock in untimeout().
829 		 * Instead the thread which drops ill_mcast_lock will have
830 		 * to call ill_mcast_timer_start().
831 		 */
832 		mutex_enter(&ipst->ips_igmp_timer_lock);
833 		ipst->ips_igmp_deferred_next = MIN(timer,
834 		    ipst->ips_igmp_deferred_next);
835 		mutex_exit(&ipst->ips_igmp_timer_lock);
836 	}
837 
838 	if (ip_debug > 1) {
839 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
840 		    "igmp_joingroup: multicast_type %d timer %d",
841 		    (ilm->ilm_ill->ill_mcast_type),
842 		    (int)ntohl(timer));
843 	}
844 }
845 
846 /*
847  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
848  * and it gets sent after the lock is dropped.
849  */
850 void
851 mld_joingroup(ilm_t *ilm)
852 {
853 	uint_t	timer;
854 	ill_t	*ill;
855 	ip_stack_t	*ipst = ilm->ilm_ipst;
856 
857 	ill = ilm->ilm_ill;
858 
859 	ASSERT(ill->ill_isv6);
860 
861 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
862 
863 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
864 		ilm->ilm_rtx.rtx_timer = INFINITY;
865 		ilm->ilm_state = IGMP_OTHERMEMBER;
866 	} else {
867 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
868 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
869 		} else {
870 			mrec_t *rp;
871 			mcast_record_t rtype;
872 			/*
873 			 * The possible state changes we need to handle here:
874 			 *	Old State   New State	Report
875 			 *
876 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
877 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
878 			 *
879 			 * No need to send the BLOCK(0) report; ALLOW(X)
880 			 * is enough
881 			 */
882 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
883 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
884 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
885 			    ilm->ilm_filter, NULL);
886 			mldv2_sendrpt(ill, rp);
887 			/*
888 			 * Set up retransmission state.  Timer is set below,
889 			 * for both v2 and v1.
890 			 */
891 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
892 			    ilm->ilm_filter);
893 		}
894 
895 		/* Set the ilm timer value */
896 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
897 		    ilm->ilm_rtx.rtx_cnt > 0);
898 
899 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
900 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
901 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
902 		timer = ilm->ilm_rtx.rtx_timer;
903 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
904 		ilm->ilm_state = IGMP_IREPORTEDLAST;
905 
906 		/*
907 		 * We are holding ill_mcast_lock here and the timeout
908 		 * handler (mld_timeout_handler_per_ill) acquires that
909 		 * lock. Hence we can't call mld_start_timers since it could
910 		 * deadlock in untimeout().
911 		 * Instead the thread which drops ill_mcast_lock will have
912 		 * to call ill_mcast_timer_start().
913 		 */
914 		mutex_enter(&ipst->ips_mld_timer_lock);
915 		ipst->ips_mld_deferred_next = MIN(timer,
916 		    ipst->ips_mld_deferred_next);
917 		mutex_exit(&ipst->ips_mld_timer_lock);
918 	}
919 
920 	if (ip_debug > 1) {
921 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
922 		    "mld_joingroup: multicast_type %d timer %d",
923 		    (ilm->ilm_ill->ill_mcast_type),
924 		    (int)ntohl(timer));
925 	}
926 }
927 
928 /*
929  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
930  * and it gets sent after the lock is dropped.
931  */
932 void
933 igmp_leavegroup(ilm_t *ilm)
934 {
935 	ill_t *ill = ilm->ilm_ill;
936 
937 	ASSERT(!ill->ill_isv6);
938 
939 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
940 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
941 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
942 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
943 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
944 		    (htonl(INADDR_ALLRTRS_GROUP)));
945 		return;
946 	}
947 	if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
948 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
949 		mrec_t *rp;
950 		/*
951 		 * The possible state changes we need to handle here:
952 		 *	Old State	New State	Report
953 		 *
954 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
955 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
956 		 *
957 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
958 		 */
959 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
960 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
961 			    ilm->ilm_filter, NULL);
962 		} else {
963 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
964 			    NULL, NULL);
965 		}
966 		igmpv3_sendrpt(ill, rp);
967 		return;
968 	}
969 }
970 
971 /*
972  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
973  * and it gets sent after the lock is dropped.
974  */
975 void
976 mld_leavegroup(ilm_t *ilm)
977 {
978 	ill_t *ill = ilm->ilm_ill;
979 
980 	ASSERT(ill->ill_isv6);
981 
982 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
983 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
984 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
985 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
986 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
987 		return;
988 	}
989 	if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
990 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
991 		mrec_t *rp;
992 		/*
993 		 * The possible state changes we need to handle here:
994 		 *	Old State	New State	Report
995 		 *
996 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
997 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
998 		 *
999 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
1000 		 */
1001 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1002 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1003 			    ilm->ilm_filter, NULL);
1004 		} else {
1005 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
1006 			    NULL, NULL);
1007 		}
1008 		mldv2_sendrpt(ill, rp);
1009 		return;
1010 	}
1011 }
1012 
1013 /*
1014  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1015  * and it gets sent after the lock is dropped.
1016  */
1017 void
1018 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1019 {
1020 	ill_t *ill;
1021 	mrec_t *rp;
1022 	ip_stack_t	*ipst = ilm->ilm_ipst;
1023 
1024 	ASSERT(ilm != NULL);
1025 
1026 	/* state change reports should only be sent if the router is v3 */
1027 	if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
1028 		return;
1029 
1030 	ill = ilm->ilm_ill;
1031 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1032 
1033 	/*
1034 	 * Compare existing(old) state with the new state and prepare
1035 	 * State Change Report, according to the rules in RFC 3376:
1036 	 *
1037 	 *	Old State	New State	State Change Report
1038 	 *
1039 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1040 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1041 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1042 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1043 	 */
1044 
1045 	if (ilm->ilm_fmode == fmode) {
1046 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1047 		slist_t *allow, *block;
1048 		if (((a_minus_b = l_alloc()) == NULL) ||
1049 		    ((b_minus_a = l_alloc()) == NULL)) {
1050 			l_free(a_minus_b);
1051 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1052 				goto send_to_ex;
1053 			else
1054 				goto send_to_in;
1055 		}
1056 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1057 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1058 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1059 			allow = b_minus_a;
1060 			block = a_minus_b;
1061 		} else {
1062 			allow = a_minus_b;
1063 			block = b_minus_a;
1064 		}
1065 		rp = NULL;
1066 		if (!SLIST_IS_EMPTY(allow))
1067 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1068 			    allow, rp);
1069 		if (!SLIST_IS_EMPTY(block))
1070 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1071 			    block, rp);
1072 		l_free(a_minus_b);
1073 		l_free(b_minus_a);
1074 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1075 send_to_ex:
1076 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1077 		    NULL);
1078 	} else {
1079 send_to_in:
1080 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1081 		    NULL);
1082 	}
1083 
1084 	/*
1085 	 * Need to set up retransmission state; merge the new info with the
1086 	 * current state (which may be null).  If the timer is not currently
1087 	 * running, the caller will start it when dropping ill_mcast_lock.
1088 	 */
1089 	rp = mcast_merge_rtx(ilm, rp, flist);
1090 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1091 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1092 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1093 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1094 		mutex_enter(&ipst->ips_igmp_timer_lock);
1095 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1096 		    ilm->ilm_rtx.rtx_timer);
1097 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1098 		mutex_exit(&ipst->ips_igmp_timer_lock);
1099 	}
1100 
1101 	igmpv3_sendrpt(ill, rp);
1102 }
1103 
1104 /*
1105  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1106  * and it gets sent after the lock is dropped.
1107  */
1108 void
1109 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1110 {
1111 	ill_t *ill;
1112 	mrec_t *rp = NULL;
1113 	ip_stack_t	*ipst = ilm->ilm_ipst;
1114 
1115 	ASSERT(ilm != NULL);
1116 
1117 	ill = ilm->ilm_ill;
1118 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1119 
1120 	/* only need to send if we have an mldv2-capable router */
1121 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1122 		return;
1123 	}
1124 
1125 	/*
1126 	 * Compare existing (old) state with the new state passed in
1127 	 * and send appropriate MLDv2 State Change Report.
1128 	 *
1129 	 *	Old State	New State	State Change Report
1130 	 *
1131 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1132 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1133 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1134 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1135 	 */
1136 	if (ilm->ilm_fmode == fmode) {
1137 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1138 		slist_t *allow, *block;
1139 		if (((a_minus_b = l_alloc()) == NULL) ||
1140 		    ((b_minus_a = l_alloc()) == NULL)) {
1141 			l_free(a_minus_b);
1142 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1143 				goto send_to_ex;
1144 			else
1145 				goto send_to_in;
1146 		}
1147 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1148 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1149 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1150 			allow = b_minus_a;
1151 			block = a_minus_b;
1152 		} else {
1153 			allow = a_minus_b;
1154 			block = b_minus_a;
1155 		}
1156 		if (!SLIST_IS_EMPTY(allow))
1157 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1158 			    allow, rp);
1159 		if (!SLIST_IS_EMPTY(block))
1160 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1161 			    block, rp);
1162 		l_free(a_minus_b);
1163 		l_free(b_minus_a);
1164 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1165 send_to_ex:
1166 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1167 		    NULL);
1168 	} else {
1169 send_to_in:
1170 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1171 		    NULL);
1172 	}
1173 
1174 	/*
1175 	 * Need to set up retransmission state; merge the new info with the
1176 	 * current state (which may be null).  If the timer is not currently
1177 	 * running, the caller will start it when dropping ill_mcast_lock.
1178 	 */
1179 	rp = mcast_merge_rtx(ilm, rp, flist);
1180 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1181 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1182 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1183 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1184 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1185 		mutex_enter(&ipst->ips_mld_timer_lock);
1186 		ipst->ips_mld_deferred_next =
1187 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1188 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1189 		mutex_exit(&ipst->ips_mld_timer_lock);
1190 	}
1191 
1192 	mldv2_sendrpt(ill, rp);
1193 }
1194 
1195 uint_t
1196 igmp_timeout_handler_per_ill(ill_t *ill)
1197 {
1198 	uint_t	next = INFINITY, current;
1199 	ilm_t	*ilm;
1200 	mrec_t	*rp = NULL;
1201 	mrec_t	*rtxrp = NULL;
1202 	rtx_state_t *rtxp;
1203 	mcast_record_t	rtype;
1204 
1205 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1206 
1207 	current = CURRENT_MSTIME;
1208 	/* First check the global timer on this interface */
1209 	if (ill->ill_global_timer == INFINITY)
1210 		goto per_ilm_timer;
1211 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1212 		ill->ill_global_timer = INFINITY;
1213 		/*
1214 		 * Send report for each group on this interface.
1215 		 * Since we just set the global timer (received a v3 general
1216 		 * query), need to skip the all hosts addr (224.0.0.1), per
1217 		 * RFC 3376 section 5.
1218 		 */
1219 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1220 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1221 				continue;
1222 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1223 			    ilm->ilm_filter, rp);
1224 			/*
1225 			 * Since we're sending a report on this group, okay
1226 			 * to delete pending group-specific timers.  Note
1227 			 * that group-specific retransmit timers still need
1228 			 * to be checked in the per_ilm_timer for-loop.
1229 			 */
1230 			ilm->ilm_timer = INFINITY;
1231 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1232 			FREE_SLIST(ilm->ilm_pendsrcs);
1233 			ilm->ilm_pendsrcs = NULL;
1234 		}
1235 		igmpv3_sendrpt(ill, rp);
1236 		rp = NULL;
1237 	} else {
1238 		if ((ill->ill_global_timer - current) < next)
1239 			next = ill->ill_global_timer - current;
1240 	}
1241 
1242 per_ilm_timer:
1243 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1244 		if (ilm->ilm_timer == INFINITY)
1245 			goto per_ilm_rtxtimer;
1246 
1247 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1248 			if ((ilm->ilm_timer - current) < next)
1249 				next = ilm->ilm_timer - current;
1250 
1251 			if (ip_debug > 1) {
1252 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1253 				    "igmp_timo_hlr 2: ilm_timr %d "
1254 				    "typ %d nxt %d",
1255 				    (int)ntohl(ilm->ilm_timer - current),
1256 				    (ill->ill_mcast_type), next);
1257 			}
1258 
1259 			goto per_ilm_rtxtimer;
1260 		}
1261 
1262 		/* the timer has expired, need to take action */
1263 		ilm->ilm_timer = INFINITY;
1264 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1265 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1266 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1267 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1268 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1269 		} else {
1270 			slist_t *rsp;
1271 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1272 			    (rsp = l_alloc()) != NULL) {
1273 				/*
1274 				 * Contents of reply depend on pending
1275 				 * requested source list.
1276 				 */
1277 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1278 					l_intersection(ilm->ilm_filter,
1279 					    ilm->ilm_pendsrcs, rsp);
1280 				} else {
1281 					l_difference(ilm->ilm_pendsrcs,
1282 					    ilm->ilm_filter, rsp);
1283 				}
1284 				FREE_SLIST(ilm->ilm_pendsrcs);
1285 				ilm->ilm_pendsrcs = NULL;
1286 				if (!SLIST_IS_EMPTY(rsp))
1287 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1288 					    &ilm->ilm_v6addr, rsp, rp);
1289 				FREE_SLIST(rsp);
1290 			} else {
1291 				/*
1292 				 * Either the pending request is just group-
1293 				 * specific, or we couldn't get the resources
1294 				 * (rsp) to build a source-specific reply.
1295 				 */
1296 				rp = mcast_bldmrec(ilm->ilm_fmode,
1297 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1298 			}
1299 			igmpv3_sendrpt(ill, rp);
1300 			rp = NULL;
1301 		}
1302 
1303 per_ilm_rtxtimer:
1304 		rtxp = &ilm->ilm_rtx;
1305 
1306 		if (rtxp->rtx_timer == INFINITY)
1307 			continue;
1308 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1309 			if ((rtxp->rtx_timer - current) < next)
1310 				next = rtxp->rtx_timer - current;
1311 			continue;
1312 		}
1313 
1314 		rtxp->rtx_timer = INFINITY;
1315 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1316 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1317 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1318 			continue;
1319 		}
1320 		if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1321 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1322 			continue;
1323 		}
1324 
1325 		/*
1326 		 * The retransmit timer has popped, and our router is
1327 		 * IGMPv3.  We have to delve into the retransmit state
1328 		 * stored in the ilm.
1329 		 *
1330 		 * Decrement the retransmit count.  If the fmode rtx
1331 		 * count is active, decrement it, and send a filter
1332 		 * mode change report with the ilm's source list.
1333 		 * Otherwise, send a source list change report with
1334 		 * the current retransmit lists.
1335 		 */
1336 		ASSERT(rtxp->rtx_cnt > 0);
1337 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1338 		rtxp->rtx_cnt--;
1339 		if (rtxp->rtx_fmode_cnt > 0) {
1340 			rtxp->rtx_fmode_cnt--;
1341 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1342 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1343 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1344 			    ilm->ilm_filter, rtxrp);
1345 		} else {
1346 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1347 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1348 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1349 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1350 		}
1351 		if (rtxp->rtx_cnt > 0) {
1352 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1353 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1354 			if (rtxp->rtx_timer < next)
1355 				next = rtxp->rtx_timer;
1356 			rtxp->rtx_timer += current;
1357 		} else {
1358 			ASSERT(rtxp->rtx_timer == INFINITY);
1359 			CLEAR_SLIST(rtxp->rtx_allow);
1360 			CLEAR_SLIST(rtxp->rtx_block);
1361 		}
1362 		igmpv3_sendrpt(ill, rtxrp);
1363 		rtxrp = NULL;
1364 	}
1365 
1366 	rw_exit(&ill->ill_mcast_lock);
1367 	/* Send any deferred/queued IP packets */
1368 	ill_mcast_send_queued(ill);
1369 	/* Defer ill_mcast_timer_start() until the caller is done */
1370 
1371 	return (next);
1372 }
1373 
1374 /*
1375  * igmp_timeout_handler:
1376  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1377  * Returns number of ticks to next event (or 0 if none).
1378  *
1379  * As part of multicast join and leave igmp we may need to send out an
1380  * igmp request. The igmp related state variables in the ilm are protected
1381  * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
1382  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1383  * starts the igmp timer if needed. It serializes multiple threads trying to
1384  * simultaneously start the timer using the igmp_timer_setter_active flag.
1385  *
1386  * igmp_input() receives igmp queries and responds to the queries
1387  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1388  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1389  * performs the action exclusively after acquiring ill_mcast_lock.
1390  *
1391  * The igmp_slowtimeo() function is called thru another timer.
1392  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1393  */
1394 void
1395 igmp_timeout_handler(void *arg)
1396 {
1397 	ill_t	*ill;
1398 	uint_t  global_next = INFINITY;
1399 	uint_t  next;
1400 	ill_walk_context_t ctx;
1401 	ip_stack_t *ipst = arg;
1402 
1403 	ASSERT(arg != NULL);
1404 	mutex_enter(&ipst->ips_igmp_timer_lock);
1405 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1406 	ipst->ips_igmp_timeout_id = 0;
1407 	ipst->ips_igmp_timer_scheduled_last = 0;
1408 	ipst->ips_igmp_time_to_next = 0;
1409 	mutex_exit(&ipst->ips_igmp_timer_lock);
1410 
1411 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1412 	ill = ILL_START_WALK_V4(&ctx, ipst);
1413 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1414 		ASSERT(!ill->ill_isv6);
1415 		/* Make sure the ill isn't going away. */
1416 		if (!ill_check_and_refhold(ill))
1417 			continue;
1418 		rw_exit(&ipst->ips_ill_g_lock);
1419 		next = igmp_timeout_handler_per_ill(ill);
1420 		if (next < global_next)
1421 			global_next = next;
1422 		ill_refrele(ill);
1423 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1424 	}
1425 	rw_exit(&ipst->ips_ill_g_lock);
1426 	if (global_next != INFINITY)
1427 		igmp_start_timers(global_next, ipst);
1428 }
1429 
1430 /*
1431  * mld_timeout_handler:
1432  * Called when there are timeout events, every next (tick).
1433  * Returns number of ticks to next event (or 0 if none).
1434  */
1435 uint_t
1436 mld_timeout_handler_per_ill(ill_t *ill)
1437 {
1438 	ilm_t 	*ilm;
1439 	uint_t	next = INFINITY, current;
1440 	mrec_t	*rp, *rtxrp;
1441 	rtx_state_t *rtxp;
1442 	mcast_record_t	rtype;
1443 
1444 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1445 
1446 	current = CURRENT_MSTIME;
1447 	/*
1448 	 * First check the global timer on this interface; the global timer
1449 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1450 	 */
1451 	if (ill->ill_global_timer == INFINITY)
1452 		goto per_ilm_timer;
1453 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1454 		ill->ill_global_timer = INFINITY;
1455 		/*
1456 		 * Send report for each group on this interface.
1457 		 * Since we just set the global timer (received a v2 general
1458 		 * query), need to skip the all hosts addr (ff02::1), per
1459 		 * RFC 3810 section 6.
1460 		 */
1461 		rp = NULL;
1462 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1463 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1464 			    &ipv6_all_hosts_mcast))
1465 				continue;
1466 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1467 			    ilm->ilm_filter, rp);
1468 			/*
1469 			 * Since we're sending a report on this group, okay
1470 			 * to delete pending group-specific timers.  Note
1471 			 * that group-specific retransmit timers still need
1472 			 * to be checked in the per_ilm_timer for-loop.
1473 			 */
1474 			ilm->ilm_timer = INFINITY;
1475 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1476 			FREE_SLIST(ilm->ilm_pendsrcs);
1477 			ilm->ilm_pendsrcs = NULL;
1478 		}
1479 		mldv2_sendrpt(ill, rp);
1480 	} else {
1481 		if ((ill->ill_global_timer - current) < next)
1482 			next = ill->ill_global_timer - current;
1483 	}
1484 
1485 per_ilm_timer:
1486 	rp = rtxrp = NULL;
1487 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1488 		if (ilm->ilm_timer == INFINITY)
1489 			goto per_ilm_rtxtimer;
1490 
1491 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1492 			if ((ilm->ilm_timer - current) < next)
1493 				next = ilm->ilm_timer - current;
1494 
1495 			if (ip_debug > 1) {
1496 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1497 				    "igmp_timo_hlr 2: ilm_timr"
1498 				    " %d typ %d nxt %d",
1499 				    (int)ntohl(ilm->ilm_timer - current),
1500 				    (ill->ill_mcast_type), next);
1501 			}
1502 
1503 			goto per_ilm_rtxtimer;
1504 		}
1505 
1506 		/* the timer has expired, need to take action */
1507 		ilm->ilm_timer = INFINITY;
1508 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1509 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1510 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1511 		} else {
1512 			slist_t *rsp;
1513 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1514 			    (rsp = l_alloc()) != NULL) {
1515 				/*
1516 				 * Contents of reply depend on pending
1517 				 * requested source list.
1518 				 */
1519 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1520 					l_intersection(ilm->ilm_filter,
1521 					    ilm->ilm_pendsrcs, rsp);
1522 				} else {
1523 					l_difference(ilm->ilm_pendsrcs,
1524 					    ilm->ilm_filter, rsp);
1525 				}
1526 				FREE_SLIST(ilm->ilm_pendsrcs);
1527 				ilm->ilm_pendsrcs = NULL;
1528 				if (!SLIST_IS_EMPTY(rsp))
1529 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1530 					    &ilm->ilm_v6addr, rsp, rp);
1531 				FREE_SLIST(rsp);
1532 			} else {
1533 				rp = mcast_bldmrec(ilm->ilm_fmode,
1534 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1535 			}
1536 		}
1537 
1538 per_ilm_rtxtimer:
1539 		rtxp = &ilm->ilm_rtx;
1540 
1541 		if (rtxp->rtx_timer == INFINITY)
1542 			continue;
1543 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1544 			if ((rtxp->rtx_timer - current) < next)
1545 				next = rtxp->rtx_timer - current;
1546 			continue;
1547 		}
1548 
1549 		rtxp->rtx_timer = INFINITY;
1550 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1551 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1552 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1553 			continue;
1554 		}
1555 
1556 		/*
1557 		 * The retransmit timer has popped, and our router is
1558 		 * MLDv2.  We have to delve into the retransmit state
1559 		 * stored in the ilm.
1560 		 *
1561 		 * Decrement the retransmit count.  If the fmode rtx
1562 		 * count is active, decrement it, and send a filter
1563 		 * mode change report with the ilm's source list.
1564 		 * Otherwise, send a source list change report with
1565 		 * the current retransmit lists.
1566 		 */
1567 		ASSERT(rtxp->rtx_cnt > 0);
1568 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1569 		rtxp->rtx_cnt--;
1570 		if (rtxp->rtx_fmode_cnt > 0) {
1571 			rtxp->rtx_fmode_cnt--;
1572 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1573 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1574 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1575 			    ilm->ilm_filter, rtxrp);
1576 		} else {
1577 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1578 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1579 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1580 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1581 		}
1582 		if (rtxp->rtx_cnt > 0) {
1583 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1584 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1585 			if (rtxp->rtx_timer < next)
1586 				next = rtxp->rtx_timer;
1587 			rtxp->rtx_timer += current;
1588 		} else {
1589 			ASSERT(rtxp->rtx_timer == INFINITY);
1590 			CLEAR_SLIST(rtxp->rtx_allow);
1591 			CLEAR_SLIST(rtxp->rtx_block);
1592 		}
1593 	}
1594 
1595 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1596 		mldv2_sendrpt(ill, rp);
1597 		mldv2_sendrpt(ill, rtxrp);
1598 	}
1599 	rw_exit(&ill->ill_mcast_lock);
1600 	/* Send any deferred/queued IP packets */
1601 	ill_mcast_send_queued(ill);
1602 	/* Defer ill_mcast_timer_start() until the caller is done */
1603 
1604 	return (next);
1605 }
1606 
1607 /*
1608  * mld_timeout_handler:
1609  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1610  * Returns number of ticks to next event (or 0 if none).
1611  * MT issues are same as igmp_timeout_handler
1612  */
1613 void
1614 mld_timeout_handler(void *arg)
1615 {
1616 	ill_t	*ill;
1617 	uint_t  global_next = INFINITY;
1618 	uint_t  next;
1619 	ill_walk_context_t ctx;
1620 	ip_stack_t *ipst = arg;
1621 
1622 	ASSERT(arg != NULL);
1623 	mutex_enter(&ipst->ips_mld_timer_lock);
1624 	ASSERT(ipst->ips_mld_timeout_id != 0);
1625 	ipst->ips_mld_timeout_id = 0;
1626 	ipst->ips_mld_timer_scheduled_last = 0;
1627 	ipst->ips_mld_time_to_next = 0;
1628 	mutex_exit(&ipst->ips_mld_timer_lock);
1629 
1630 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1631 	ill = ILL_START_WALK_V6(&ctx, ipst);
1632 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1633 		ASSERT(ill->ill_isv6);
1634 		/* Make sure the ill isn't going away. */
1635 		if (!ill_check_and_refhold(ill))
1636 			continue;
1637 		rw_exit(&ipst->ips_ill_g_lock);
1638 		next = mld_timeout_handler_per_ill(ill);
1639 		if (next < global_next)
1640 			global_next = next;
1641 		ill_refrele(ill);
1642 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1643 	}
1644 	rw_exit(&ipst->ips_ill_g_lock);
1645 	if (global_next != INFINITY)
1646 		mld_start_timers(global_next, ipst);
1647 }
1648 
/*
 * Older Version Querier Present timeout for the given ill, expressed as
 * a count of slowtimo intervals: (ill_mcast_rv * ill_mcast_qi) plus
 * MCAST_QUERY_RESP_INTERVAL, scaled by 1000 and divided by
 * MCAST_SLOWTIMO_INTERVAL.
 */
#define	OVQP(ill) \
	(((((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) + \
	MCAST_QUERY_RESP_INTERVAL) * 1000) / MCAST_SLOWTIMO_INTERVAL)
1656 
1657 /*
1658  * igmp_slowtimo:
1659  * - Resets to new router if we didnt we hear from the router
1660  *   in IGMP_AGE_THRESHOLD seconds.
1661  * - Resets slowtimeout.
1662  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1663  * IGMP version than configured.
1664  */
1665 void
1666 igmp_slowtimo(void *arg)
1667 {
1668 	ill_t	*ill;
1669 	ill_if_t *ifp;
1670 	avl_tree_t *avl_tree;
1671 	ip_stack_t *ipst = (ip_stack_t *)arg;
1672 
1673 	ASSERT(arg != NULL);
1674 
1675 	/*
1676 	 * The ill_if_t list is circular, hence the odd loop parameters.
1677 	 *
1678 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1679 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1680 	 * structure (allowing us to skip if none of the instances have timers
1681 	 * running).
1682 	 */
1683 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1684 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1685 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1686 	    ifp = ifp->illif_next) {
1687 		/*
1688 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1689 		 * a V1 or V2 query now and we miss seeing the count now,
1690 		 * we will see it the next time igmp_slowtimo is called.
1691 		 */
1692 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1693 			continue;
1694 
1695 		avl_tree = &ifp->illif_avl_by_ppa;
1696 		for (ill = avl_first(avl_tree); ill != NULL;
1697 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1698 			/* Make sure the ill isn't going away. */
1699 			if (!ill_check_and_refhold(ill))
1700 				continue;
1701 			rw_exit(&ipst->ips_ill_g_lock);
1702 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1703 			if (ill->ill_mcast_v1_tset == 1)
1704 				ill->ill_mcast_v1_time++;
1705 			if (ill->ill_mcast_v2_tset == 1)
1706 				ill->ill_mcast_v2_time++;
1707 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1708 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1709 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1710 				if ((ill->ill_mcast_v2_tset > 0) ||
1711 				    (ipst->ips_igmp_max_version ==
1712 				    IGMP_V2_ROUTER)) {
1713 					ip1dbg(("V1 query timer "
1714 					    "expired on %s; switching "
1715 					    "mode to IGMP_V2\n",
1716 					    ill->ill_name));
1717 					ill->ill_mcast_type =
1718 					    IGMP_V2_ROUTER;
1719 				} else {
1720 					ip1dbg(("V1 query timer "
1721 					    "expired on %s; switching "
1722 					    "mode to IGMP_V3\n",
1723 					    ill->ill_name));
1724 					ill->ill_mcast_type =
1725 					    IGMP_V3_ROUTER;
1726 				}
1727 				ill->ill_mcast_v1_time = 0;
1728 				ill->ill_mcast_v1_tset = 0;
1729 				atomic_dec_16(&ifp->illif_mcast_v1);
1730 			}
1731 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1732 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1733 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1734 				ip1dbg(("V2 query timer expired on "
1735 				    "%s; switching mode to IGMP_V3\n",
1736 				    ill->ill_name));
1737 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1738 				ill->ill_mcast_v2_time = 0;
1739 				ill->ill_mcast_v2_tset = 0;
1740 				atomic_dec_16(&ifp->illif_mcast_v2);
1741 			}
1742 			rw_exit(&ill->ill_mcast_lock);
1743 			ill_refrele(ill);
1744 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1745 		}
1746 	}
1747 	rw_exit(&ipst->ips_ill_g_lock);
1748 	ill_mcast_timer_start(ipst);
1749 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1750 	if (ipst->ips_igmp_slowtimeout_quiesce != B_TRUE) {
1751 		ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo,
1752 		    (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1753 	} else {
1754 		ipst->ips_igmp_slowtimeout_id = 0;
1755 	}
1756 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1757 }
1758 
1759 /*
1760  * mld_slowtimo:
1761  * - Resets to newer version if we didn't hear from the older version router
1762  *   in MLD_AGE_THRESHOLD seconds.
1763  * - Restarts slowtimeout.
1764  * Check for ips_mld_max_version ensures that we don't revert to a higher
1765  * IGMP version than configured.
1766  */
1767 void
1768 mld_slowtimo(void *arg)
1769 {
1770 	ill_t *ill;
1771 	ill_if_t *ifp;
1772 	avl_tree_t *avl_tree;
1773 	ip_stack_t *ipst = (ip_stack_t *)arg;
1774 
1775 	ASSERT(arg != NULL);
1776 	/* See comments in igmp_slowtimo() above... */
1777 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1778 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1779 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1780 	    ifp = ifp->illif_next) {
1781 		if (ifp->illif_mcast_v1 == 0)
1782 			continue;
1783 
1784 		avl_tree = &ifp->illif_avl_by_ppa;
1785 		for (ill = avl_first(avl_tree); ill != NULL;
1786 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1787 			/* Make sure the ill isn't going away. */
1788 			if (!ill_check_and_refhold(ill))
1789 				continue;
1790 			rw_exit(&ipst->ips_ill_g_lock);
1791 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1792 			if (ill->ill_mcast_v1_tset == 1)
1793 				ill->ill_mcast_v1_time++;
1794 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1795 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1796 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1797 				ip1dbg(("MLD query timer expired on"
1798 				    " %s; switching mode to MLD_V2\n",
1799 				    ill->ill_name));
1800 				ill->ill_mcast_type = MLD_V2_ROUTER;
1801 				ill->ill_mcast_v1_time = 0;
1802 				ill->ill_mcast_v1_tset = 0;
1803 				atomic_dec_16(&ifp->illif_mcast_v1);
1804 			}
1805 			rw_exit(&ill->ill_mcast_lock);
1806 			ill_refrele(ill);
1807 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1808 		}
1809 	}
1810 	rw_exit(&ipst->ips_ill_g_lock);
1811 	ill_mcast_timer_start(ipst);
1812 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1813 	if (ipst->ips_mld_slowtimeout_quiesce != B_TRUE) {
1814 		ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo,
1815 		    (void *)ipst, MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1816 	} else {
1817 		ipst->ips_mld_slowtimeout_id = 0;
1818 	}
1819 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1820 }
1821 
1822 /*
1823  * igmp_sendpkt:
1824  * This will send to ip_output_simple just like icmp_inbound.
1825  */
1826 static void
1827 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1828 {
1829 	mblk_t	*mp;
1830 	igmpa_t	*igmpa;
1831 	uint8_t *rtralert;
1832 	ipha_t	*ipha;
1833 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1834 	size_t	size  = hdrlen + sizeof (igmpa_t);
1835 	ill_t 	*ill  = ilm->ilm_ill;
1836 	ip_stack_t *ipst = ill->ill_ipst;
1837 
1838 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1839 
1840 	mp = allocb(size, BPRI_HI);
1841 	if (mp == NULL) {
1842 		return;
1843 	}
1844 	mp->b_wptr = mp->b_rptr + size;
1845 
1846 	ipha = (ipha_t *)mp->b_rptr;
1847 	rtralert = (uint8_t *)&(ipha[1]);
1848 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1849 	igmpa->igmpa_type   = type;
1850 	igmpa->igmpa_code   = 0;
1851 	igmpa->igmpa_group  = ilm->ilm_addr;
1852 	igmpa->igmpa_cksum  = 0;
1853 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1854 
1855 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1856 	rtralert[1] = RTRALERT_LEN;
1857 	rtralert[2] = 0;
1858 	rtralert[3] = 0;
1859 
1860 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1861 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1862 	ipha->ipha_type_of_service 	= 0;
1863 	ipha->ipha_length = htons(size);
1864 	ipha->ipha_ident = 0;
1865 	ipha->ipha_fragment_offset_and_flags = 0;
1866 	ipha->ipha_ttl 		= IGMP_TTL;
1867 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1868 	ipha->ipha_hdr_checksum 	= 0;
1869 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1870 	ipha->ipha_src 		= INADDR_ANY;
1871 
1872 	ill_mcast_queue(ill, mp);
1873 
1874 	++ipst->ips_igmpstat.igps_snd_reports;
1875 }
1876 
1877 /*
1878  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
1879  * The report will contain one group record
1880  * for each element of reclist.  If this causes packet length to
1881  * exceed ill->ill_mc_mtu, multiple reports are sent.
1882  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1883  * and those buffers are freed here.
1884  */
1885 static void
1886 igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
1887 {
1888 	igmp3ra_t *igmp3ra;
1889 	grphdra_t *grphdr;
1890 	mblk_t *mp;
1891 	ipha_t *ipha;
1892 	uint8_t *rtralert;
1893 	ipaddr_t *src_array;
1894 	int i, j, numrec, more_src_cnt;
1895 	size_t hdrsize, size, rsize;
1896 	mrec_t *rp, *cur_reclist;
1897 	mrec_t *next_reclist = reclist;
1898 	boolean_t morepkts;
1899 	ip_stack_t	 *ipst = ill->ill_ipst;
1900 
1901 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1902 
1903 	/* if there aren't any records, there's nothing to send */
1904 	if (reclist == NULL)
1905 		return;
1906 
1907 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1908 nextpkt:
1909 	size = hdrsize + sizeof (igmp3ra_t);
1910 	morepkts = B_FALSE;
1911 	more_src_cnt = 0;
1912 	cur_reclist = next_reclist;
1913 	numrec = 0;
1914 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1915 		rsize = sizeof (grphdra_t) +
1916 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
1917 		if (size + rsize > ill->ill_mc_mtu) {
1918 			if (rp == cur_reclist) {
1919 				/*
1920 				 * If the first mrec we looked at is too big
1921 				 * to fit in a single packet (i.e the source
1922 				 * list is too big), we must either truncate
1923 				 * the list (if TO_EX or IS_EX), or send
1924 				 * multiple reports for the same group (all
1925 				 * other types).
1926 				 */
1927 				int srcspace, srcsperpkt;
1928 				srcspace = ill->ill_mc_mtu - (size +
1929 				    sizeof (grphdra_t));
1930 
1931 				/*
1932 				 * Skip if there's not even enough room in
1933 				 * a single packet to send something useful.
1934 				 */
1935 				if (srcspace <= sizeof (ipaddr_t))
1936 					continue;
1937 
1938 				srcsperpkt = srcspace / sizeof (ipaddr_t);
1939 				/*
1940 				 * Increment size and numrec, because we will
1941 				 * be sending a record for the mrec we're
1942 				 * looking at now.
1943 				 */
1944 				size += sizeof (grphdra_t) +
1945 				    (srcsperpkt * sizeof (ipaddr_t));
1946 				numrec++;
1947 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
1948 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
1949 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1950 					if (rp->mrec_next == NULL) {
1951 						/* no more packets to send */
1952 						break;
1953 					} else {
1954 						/*
1955 						 * more packets, but we're
1956 						 * done with this mrec.
1957 						 */
1958 						next_reclist = rp->mrec_next;
1959 					}
1960 				} else {
1961 					more_src_cnt = rp->mrec_srcs.sl_numsrc
1962 					    - srcsperpkt;
1963 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1964 					/*
1965 					 * We'll fix up this mrec (remove the
1966 					 * srcs we've already sent) before
1967 					 * returning to nextpkt above.
1968 					 */
1969 					next_reclist = rp;
1970 				}
1971 			} else {
1972 				next_reclist = rp;
1973 			}
1974 			morepkts = B_TRUE;
1975 			break;
1976 		}
1977 		size += rsize;
1978 		numrec++;
1979 	}
1980 
1981 	mp = allocb(size, BPRI_HI);
1982 	if (mp == NULL) {
1983 		goto free_reclist;
1984 	}
1985 	bzero((char *)mp->b_rptr, size);
1986 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
1987 
1988 	ipha = (ipha_t *)mp->b_rptr;
1989 	rtralert = (uint8_t *)&(ipha[1]);
1990 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
1991 	grphdr = (grphdra_t *)&(igmp3ra[1]);
1992 
1993 	rp = cur_reclist;
1994 	for (i = 0; i < numrec; i++) {
1995 		grphdr->grphdra_type = rp->mrec_type;
1996 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
1997 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
1998 		src_array = (ipaddr_t *)&(grphdr[1]);
1999 
2000 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2001 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2002 
2003 		grphdr = (grphdra_t *)&(src_array[j]);
2004 		rp = rp->mrec_next;
2005 	}
2006 
2007 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2008 	igmp3ra->igmp3ra_numrec = htons(numrec);
2009 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2010 
2011 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2012 	rtralert[1] = RTRALERT_LEN;
2013 	rtralert[2] = 0;
2014 	rtralert[3] = 0;
2015 
2016 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2017 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2018 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2019 	ipha->ipha_length = htons(size);
2020 	ipha->ipha_ttl = IGMP_TTL;
2021 	ipha->ipha_protocol = IPPROTO_IGMP;
2022 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2023 	ipha->ipha_src = INADDR_ANY;
2024 
2025 	ill_mcast_queue(ill, mp);
2026 
2027 	++ipst->ips_igmpstat.igps_snd_reports;
2028 
2029 	if (morepkts) {
2030 		if (more_src_cnt > 0) {
2031 			int index, mvsize;
2032 			slist_t *sl = &next_reclist->mrec_srcs;
2033 			index = sl->sl_numsrc;
2034 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2035 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2036 			    mvsize);
2037 			sl->sl_numsrc = more_src_cnt;
2038 		}
2039 		goto nextpkt;
2040 	}
2041 
2042 free_reclist:
2043 	while (reclist != NULL) {
2044 		rp = reclist->mrec_next;
2045 		mi_free(reclist);
2046 		reclist = rp;
2047 	}
2048 }
2049 
2050 /*
2051  * mld_input:
2052  * Return NULL for a bad packet that is discarded here.
2053  * Return mp if the message is OK and should be handed to "raw" receivers.
2054  * Callers of mld_input() may need to reinitialize variables that were copied
2055  * from the mblk as this calls pullupmsg().
2056  */
2057 mblk_t *
2058 mld_input(mblk_t *mp, ip_recv_attr_t *ira)
2059 {
2060 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2061 	mld_hdr_t	*mldh;
2062 	ilm_t		*ilm;
2063 	ipif_t		*ipif;
2064 	uint16_t	hdr_length, exthdr_length;
2065 	in6_addr_t	*v6group_ptr;
2066 	uint_t		next;
2067 	int		mldlen;
2068 	ill_t		*ill = ira->ira_ill;
2069 	ip_stack_t	*ipst = ill->ill_ipst;
2070 
2071 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2072 
2073 	/* Make sure the src address of the packet is link-local */
2074 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2075 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2076 		freemsg(mp);
2077 		return (NULL);
2078 	}
2079 
2080 	if (ip6h->ip6_hlim != 1) {
2081 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2082 		freemsg(mp);
2083 		return (NULL);
2084 	}
2085 
2086 	/* Get to the icmp header part */
2087 	hdr_length = ira->ira_ip_hdr_length;
2088 	exthdr_length = hdr_length - IPV6_HDR_LEN;
2089 
2090 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2091 
2092 	/* An MLD packet must at least be 24 octets to be valid */
2093 	if (mldlen < MLD_MINLEN) {
2094 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2095 		freemsg(mp);
2096 		return (NULL);
2097 	}
2098 
2099 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2100 
2101 	switch (mldh->mld_type) {
2102 	case MLD_LISTENER_QUERY:
2103 		/*
2104 		 * packet length differentiates between v1 and v2.  v1
2105 		 * query should be exactly 24 octets long; v2 is >= 28.
2106 		 */
2107 		if ((mldlen == MLD_MINLEN) ||
2108 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2109 			next = mld_query_in(mldh, ill);
2110 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2111 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2112 		} else {
2113 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2114 			freemsg(mp);
2115 			return (NULL);
2116 		}
2117 		if (next == 0) {
2118 			return (mp);
2119 		}
2120 
2121 		if (next != INFINITY)
2122 			mld_start_timers(next, ipst);
2123 		break;
2124 
2125 	case MLD_LISTENER_REPORT:
2126 		/*
2127 		 * For fast leave to work, we have to know that we are the
2128 		 * last person to send a report for this group.  Reports
2129 		 * generated by us are looped back since we could potentially
2130 		 * be a multicast router, so discard reports sourced by me.
2131 		 */
2132 		mutex_enter(&ill->ill_lock);
2133 		for (ipif = ill->ill_ipif; ipif != NULL;
2134 		    ipif = ipif->ipif_next) {
2135 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2136 			    &ip6h->ip6_src)) {
2137 				if (ip_debug > 1) {
2138 					char    buf1[INET6_ADDRSTRLEN];
2139 
2140 					(void) mi_strlog(ill->ill_rq,
2141 					    1,
2142 					    SL_TRACE,
2143 					    "mld_input: we are only "
2144 					    "member src %s\n",
2145 					    inet_ntop(AF_INET6, &ip6h->ip6_src,
2146 					    buf1, sizeof (buf1)));
2147 				}
2148 				mutex_exit(&ill->ill_lock);
2149 				return (mp);
2150 			}
2151 		}
2152 		mutex_exit(&ill->ill_lock);
2153 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2154 
2155 		v6group_ptr = &mldh->mld_addr;
2156 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2157 			BUMP_MIB(ill->ill_icmp6_mib,
2158 			    ipv6IfIcmpInGroupMembBadReports);
2159 			freemsg(mp);
2160 			return (NULL);
2161 		}
2162 
2163 
2164 		/*
2165 		 * If we belong to the group being reported, and we are a
2166 		 * 'Delaying member' per the RFC terminology, stop our timer
2167 		 * for that group and 'clear flag' i.e. mark ilm_state as
2168 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2169 		 * membership entries for the same group address (one per zone)
2170 		 * so we need to walk the ill_ilm list.
2171 		 */
2172 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2173 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2174 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2175 				continue;
2176 			BUMP_MIB(ill->ill_icmp6_mib,
2177 			    ipv6IfIcmpInGroupMembOurReports);
2178 
2179 			ilm->ilm_timer = INFINITY;
2180 			ilm->ilm_state = IGMP_OTHERMEMBER;
2181 		}
2182 		rw_exit(&ill->ill_mcast_lock);
2183 		/*
2184 		 * No packets have been sent above - no
2185 		 * ill_mcast_send_queued is needed.
2186 		 */
2187 		ill_mcast_timer_start(ill->ill_ipst);
2188 		break;
2189 
2190 	case MLD_LISTENER_REDUCTION:
2191 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2192 		break;
2193 	}
2194 	return (mp);
2195 }
2196 
2197 /*
2198  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2199  * (non-zero, unsigned) timer value to be set on success.
2200  */
2201 static uint_t
2202 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2203 {
2204 	ilm_t	*ilm;
2205 	int	timer;
2206 	uint_t	next, current;
2207 	in6_addr_t *v6group;
2208 
2209 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2210 
2211 	/*
2212 	 * In the MLD specification, there are 3 states and a flag.
2213 	 *
2214 	 * In Non-Listener state, we simply don't have a membership record.
2215 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2216 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2217 	 * INFINITY)
2218 	 *
2219 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2220 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2221 	 * if I sent the last report.
2222 	 */
2223 	v6group = &mldh->mld_addr;
2224 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2225 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2226 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2227 		return (0);
2228 	}
2229 
2230 	/* Need to do compatibility mode checking */
2231 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2232 	ill->ill_mcast_v1_time = 0;
2233 	ill->ill_mcast_v1_tset = 1;
2234 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2235 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2236 		    "MLD_V1_ROUTER\n", ill->ill_name));
2237 		atomic_inc_16(&ill->ill_ifptr->illif_mcast_v1);
2238 		ill->ill_mcast_type = MLD_V1_ROUTER;
2239 	}
2240 
2241 	timer = (int)ntohs(mldh->mld_maxdelay);
2242 	if (ip_debug > 1) {
2243 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2244 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2245 		    timer, (int)mldh->mld_type);
2246 	}
2247 
2248 	/*
2249 	 * -Start the timers in all of our membership records for
2250 	 * the physical interface on which the query arrived,
2251 	 * excl:
2252 	 *	1.  those that belong to the "all hosts" group,
2253 	 *	2.  those with 0 scope, or 1 node-local scope.
2254 	 *
2255 	 * -Restart any timer that is already running but has a value
2256 	 * longer that the requested timeout.
2257 	 * -Use the value specified in the query message as the
2258 	 * maximum timeout.
2259 	 */
2260 	next = INFINITY;
2261 
2262 	current = CURRENT_MSTIME;
2263 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2264 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2265 
2266 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2267 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2268 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2269 			continue;
2270 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2271 		    &ipv6_all_hosts_mcast)) &&
2272 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2273 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2274 			if (timer == 0) {
2275 				/* Respond immediately */
2276 				ilm->ilm_timer = INFINITY;
2277 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2278 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2279 				break;
2280 			}
2281 			if (ilm->ilm_timer > timer) {
2282 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2283 				if (ilm->ilm_timer < next)
2284 					next = ilm->ilm_timer;
2285 				ilm->ilm_timer += current;
2286 			}
2287 			break;
2288 		}
2289 	}
2290 	rw_exit(&ill->ill_mcast_lock);
2291 	/* Send any deferred/queued IP packets */
2292 	ill_mcast_send_queued(ill);
2293 	ill_mcast_timer_start(ill->ill_ipst);
2294 
2295 	return (next);
2296 }
2297 
2298 /*
2299  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2300  * returns the appropriate (non-zero, unsigned) timer value (which may
2301  * be INFINITY) to be set.
2302  */
2303 static uint_t
2304 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2305 {
2306 	ilm_t	*ilm;
2307 	in6_addr_t *v6group, *src_array;
2308 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2309 	uint8_t	qrv;
2310 
2311 	v6group = &mld2q->mld2q_addr;
2312 	numsrc = ntohs(mld2q->mld2q_numsrc);
2313 
2314 	/* make sure numsrc matches packet size */
2315 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2316 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2317 		return (0);
2318 	}
2319 	src_array = (in6_addr_t *)&mld2q[1];
2320 
2321 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2322 
2323 	/* extract Maximum Response Delay from code in header */
2324 	mrd = ntohs(mld2q->mld2q_mxrc);
2325 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2326 		uint_t hdrval, mant, exp;
2327 		hdrval = mrd;
2328 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2329 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2330 		mrd = (mant | 0x1000) << (exp + 3);
2331 	}
2332 	if (mrd == 0)
2333 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2334 
2335 	MCAST_RANDOM_DELAY(delay, mrd);
2336 	next = (unsigned)INFINITY;
2337 	current = CURRENT_MSTIME;
2338 
2339 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2340 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2341 	else
2342 		ill->ill_mcast_rv = qrv;
2343 
2344 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2345 		uint_t mant, exp;
2346 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2347 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2348 		qqi = (mant | 0x10) << (exp + 3);
2349 	}
2350 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2351 
2352 	/*
2353 	 * If we have a pending general query response that's scheduled
2354 	 * sooner than the delay we calculated for this response, then
2355 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2356 	 */
2357 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2358 	if (ill->ill_global_timer < (current + delay)) {
2359 		rw_exit(&ill->ill_mcast_lock);
2360 		return (next);
2361 	}
2362 
2363 	/*
2364 	 * Now take action depending on query type: general,
2365 	 * group specific, or group/source specific.
2366 	 */
2367 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2368 		/*
2369 		 * general query
2370 		 * We know global timer is either not running or is
2371 		 * greater than our calculated delay, so reset it to
2372 		 * our delay (random value in range [0, response time])
2373 		 */
2374 		ill->ill_global_timer = current + delay;
2375 		next = delay;
2376 	} else {
2377 		/* group or group/source specific query */
2378 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2379 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2380 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2381 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2382 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2383 				continue;
2384 
2385 			/*
2386 			 * If the query is group specific or we have a
2387 			 * pending group specific query, the response is
2388 			 * group specific (pending sources list should be
2389 			 * empty).  Otherwise, need to update the pending
2390 			 * sources list for the group and source specific
2391 			 * response.
2392 			 */
2393 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2394 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2395 group_query:
2396 				FREE_SLIST(ilm->ilm_pendsrcs);
2397 				ilm->ilm_pendsrcs = NULL;
2398 			} else {
2399 				boolean_t overflow;
2400 				slist_t *pktl;
2401 				if (numsrc > MAX_FILTER_SIZE ||
2402 				    (ilm->ilm_pendsrcs == NULL &&
2403 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2404 					/*
2405 					 * We've been sent more sources than
2406 					 * we can deal with; or we can't deal
2407 					 * with a source list at all. Revert
2408 					 * to a group specific query.
2409 					 */
2410 					goto group_query;
2411 				}
2412 				if ((pktl = l_alloc()) == NULL)
2413 					goto group_query;
2414 				pktl->sl_numsrc = numsrc;
2415 				for (i = 0; i < numsrc; i++)
2416 					pktl->sl_addr[i] = src_array[i];
2417 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2418 				    &overflow);
2419 				l_free(pktl);
2420 				if (overflow)
2421 					goto group_query;
2422 			}
2423 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2424 			    INFINITY : (ilm->ilm_timer - current);
2425 			/* set timer to soonest value */
2426 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2427 			if (ilm->ilm_timer < next)
2428 				next = ilm->ilm_timer;
2429 			ilm->ilm_timer += current;
2430 			break;
2431 		}
2432 	}
2433 	rw_exit(&ill->ill_mcast_lock);
2434 	/*
2435 	 * No packets have been sent above - no
2436 	 * ill_mcast_send_queued is needed.
2437 	 */
2438 	ill_mcast_timer_start(ill->ill_ipst);
2439 
2440 	return (next);
2441 }
2442 
2443 /*
2444  * Send MLDv1 response packet with hoplimit 1
2445  */
2446 static void
2447 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2448 {
2449 	mblk_t		*mp;
2450 	mld_hdr_t	*mldh;
2451 	ip6_t 		*ip6h;
2452 	ip6_hbh_t	*ip6hbh;
2453 	struct ip6_opt_router	*ip6router;
2454 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2455 	ill_t		*ill = ilm->ilm_ill;
2456 
2457 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2458 
2459 	/*
2460 	 * We need to place a router alert option in this packet.  The length
2461 	 * of the options must be a multiple of 8.  The hbh option header is 2
2462 	 * bytes followed by the 4 byte router alert option.  That leaves
2463 	 * 2 bytes of pad for a total of 8 bytes.
2464 	 */
2465 	const int	router_alert_length = 8;
2466 
2467 	ASSERT(ill->ill_isv6);
2468 
2469 	size += router_alert_length;
2470 	mp = allocb(size, BPRI_HI);
2471 	if (mp == NULL)
2472 		return;
2473 	bzero(mp->b_rptr, size);
2474 	mp->b_wptr = mp->b_rptr + size;
2475 
2476 	ip6h = (ip6_t *)mp->b_rptr;
2477 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2478 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2479 	/*
2480 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2481 	 * above will pad between ip6router and mld.
2482 	 */
2483 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2484 
2485 	mldh->mld_type = type;
2486 	mldh->mld_addr = ilm->ilm_v6addr;
2487 
2488 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2489 	ip6router->ip6or_len = 2;
2490 	ip6router->ip6or_value[0] = 0;
2491 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2492 
2493 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2494 	ip6hbh->ip6h_len = 0;
2495 
2496 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2497 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2498 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2499 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2500 	if (v6addr == NULL)
2501 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2502 	else
2503 		ip6h->ip6_dst = *v6addr;
2504 
2505 	ip6h->ip6_src = ipv6_all_zeros;
2506 	/*
2507 	 * Prepare for checksum by putting icmp length in the icmp
2508 	 * checksum field. The checksum is calculated in ip_output.
2509 	 */
2510 	mldh->mld_cksum = htons(sizeof (*mldh));
2511 
2512 	ill_mcast_queue(ill, mp);
2513 }
2514 
2515 /*
2516  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2517  * report will contain one multicast address record for each element of
2518  * reclist.  If this causes packet length to exceed ill->ill_mc_mtu,
2519  * multiple reports are sent.  reclist is assumed to be made up of
2520  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2521  */
2522 static void
2523 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2524 {
2525 	mblk_t		*mp;
2526 	mld2r_t		*mld2r;
2527 	mld2mar_t	*mld2mar;
2528 	in6_addr_t	*srcarray;
2529 	ip6_t		*ip6h;
2530 	ip6_hbh_t	*ip6hbh;
2531 	struct ip6_opt_router	*ip6router;
2532 	size_t		size, optlen, padlen, icmpsize, rsize;
2533 	int		i, numrec, more_src_cnt;
2534 	mrec_t		*rp, *cur_reclist;
2535 	mrec_t		*next_reclist = reclist;
2536 	boolean_t	morepkts;
2537 
2538 	/* If there aren't any records, there's nothing to send */
2539 	if (reclist == NULL)
2540 		return;
2541 
2542 	ASSERT(ill->ill_isv6);
2543 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2544 
2545 	/*
2546 	 * Total option length (optlen + padlen) must be a multiple of
2547 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2548 	 * length will be 8.  Assert this in case anything ever changes.
2549 	 */
2550 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2551 	ASSERT(optlen <= 8);
2552 	padlen = 8 - optlen;
2553 nextpkt:
2554 	icmpsize = sizeof (mld2r_t);
2555 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2556 	morepkts = B_FALSE;
2557 	more_src_cnt = 0;
2558 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2559 	    rp = rp->mrec_next, numrec++) {
2560 		rsize = sizeof (mld2mar_t) +
2561 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2562 		if (size + rsize > ill->ill_mc_mtu) {
2563 			if (rp == cur_reclist) {
2564 				/*
2565 				 * If the first mrec we looked at is too big
2566 				 * to fit in a single packet (i.e the source
2567 				 * list is too big), we must either truncate
2568 				 * the list (if TO_EX or IS_EX), or send
2569 				 * multiple reports for the same group (all
2570 				 * other types).
2571 				 */
2572 				int srcspace, srcsperpkt;
2573 				srcspace = ill->ill_mc_mtu -
2574 				    (size + sizeof (mld2mar_t));
2575 
2576 				/*
2577 				 * Skip if there's not even enough room in
2578 				 * a single packet to send something useful.
2579 				 */
2580 				if (srcspace <= sizeof (in6_addr_t))
2581 					continue;
2582 
2583 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2584 				/*
2585 				 * Increment icmpsize and size, because we will
2586 				 * be sending a record for the mrec we're
2587 				 * looking at now.
2588 				 */
2589 				rsize = sizeof (mld2mar_t) +
2590 				    (srcsperpkt * sizeof (in6_addr_t));
2591 				icmpsize += rsize;
2592 				size += rsize;
2593 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2594 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2595 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2596 					if (rp->mrec_next == NULL) {
2597 						/* no more packets to send */
2598 						break;
2599 					} else {
2600 						/*
2601 						 * more packets, but we're
2602 						 * done with this mrec.
2603 						 */
2604 						next_reclist = rp->mrec_next;
2605 					}
2606 				} else {
2607 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2608 					    - srcsperpkt;
2609 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2610 					/*
2611 					 * We'll fix up this mrec (remove the
2612 					 * srcs we've already sent) before
2613 					 * returning to nextpkt above.
2614 					 */
2615 					next_reclist = rp;
2616 				}
2617 			} else {
2618 				next_reclist = rp;
2619 			}
2620 			morepkts = B_TRUE;
2621 			break;
2622 		}
2623 		icmpsize += rsize;
2624 		size += rsize;
2625 	}
2626 
2627 	mp = allocb(size, BPRI_HI);
2628 	if (mp == NULL)
2629 		goto free_reclist;
2630 	bzero(mp->b_rptr, size);
2631 	mp->b_wptr = mp->b_rptr + size;
2632 
2633 	ip6h = (ip6_t *)mp->b_rptr;
2634 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2635 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2636 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2637 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2638 
2639 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2640 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2641 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2642 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2643 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2644 	ip6h->ip6_src = ipv6_all_zeros;
2645 
2646 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2647 	/*
2648 	 * ip6h_len is the number of 8-byte words, not including the first
2649 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2650 	 */
2651 	ip6hbh->ip6h_len = 0;
2652 
2653 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2654 	ip6router->ip6or_len = 2;
2655 	ip6router->ip6or_value[0] = 0;
2656 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2657 
2658 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2659 	mld2r->mld2r_nummar = htons(numrec);
2660 	/*
2661 	 * Prepare for the checksum by putting icmp length in the icmp
2662 	 * checksum field. The checksum is calculated in ip_output_simple.
2663 	 */
2664 	mld2r->mld2r_cksum = htons(icmpsize);
2665 
2666 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2667 		mld2mar->mld2mar_type = rp->mrec_type;
2668 		mld2mar->mld2mar_auxlen = 0;
2669 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2670 		mld2mar->mld2mar_group = rp->mrec_group;
2671 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2672 
2673 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2674 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2675 
2676 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2677 	}
2678 
2679 	ill_mcast_queue(ill, mp);
2680 
2681 	if (morepkts) {
2682 		if (more_src_cnt > 0) {
2683 			int index, mvsize;
2684 			slist_t *sl = &next_reclist->mrec_srcs;
2685 			index = sl->sl_numsrc;
2686 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2687 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2688 			    mvsize);
2689 			sl->sl_numsrc = more_src_cnt;
2690 		}
2691 		goto nextpkt;
2692 	}
2693 
2694 free_reclist:
2695 	while (reclist != NULL) {
2696 		rp = reclist->mrec_next;
2697 		mi_free(reclist);
2698 		reclist = rp;
2699 	}
2700 }
2701 
2702 static mrec_t *
2703 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2704     mrec_t *next)
2705 {
2706 	mrec_t *rp;
2707 	int i;
2708 
2709 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2710 	    SLIST_IS_EMPTY(srclist))
2711 		return (next);
2712 
2713 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2714 	if (rp == NULL)
2715 		return (next);
2716 
2717 	rp->mrec_next = next;
2718 	rp->mrec_type = type;
2719 	rp->mrec_auxlen = 0;
2720 	rp->mrec_group = *grp;
2721 	if (srclist == NULL) {
2722 		rp->mrec_srcs.sl_numsrc = 0;
2723 	} else {
2724 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2725 		for (i = 0; i < srclist->sl_numsrc; i++)
2726 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2727 	}
2728 
2729 	return (rp);
2730 }
2731 
2732 /*
2733  * Set up initial retransmit state.  If memory cannot be allocated for
2734  * the source lists, simply create as much state as is possible; memory
2735  * allocation failures are considered one type of transient error that
2736  * the retransmissions are designed to overcome (and if they aren't
2737  * transient, there are bigger problems than failing to notify the
2738  * router about multicast group membership state changes).
2739  */
2740 static void
2741 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2742     slist_t *flist)
2743 {
2744 	/*
2745 	 * There are only three possibilities for rtype:
2746 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2747 	 *	  => rtype is ALLOW_NEW_SOURCES
2748 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2749 	 *	  => rtype is CHANGE_TO_EXCLUDE
2750 	 *	State change that involves a filter mode change
2751 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2752 	 */
2753 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2754 	    rtype == ALLOW_NEW_SOURCES);
2755 
2756 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2757 
2758 	switch (rtype) {
2759 	case CHANGE_TO_EXCLUDE:
2760 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2761 		CLEAR_SLIST(rtxp->rtx_allow);
2762 		COPY_SLIST(flist, rtxp->rtx_block);
2763 		break;
2764 	case ALLOW_NEW_SOURCES:
2765 	case CHANGE_TO_INCLUDE:
2766 		rtxp->rtx_fmode_cnt =
2767 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2768 		CLEAR_SLIST(rtxp->rtx_block);
2769 		COPY_SLIST(flist, rtxp->rtx_allow);
2770 		break;
2771 	}
2772 }
2773 
2774 /*
2775  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2776  * RFC 3376 section 5.1, covers three cases:
2777  *	* The current state change is a filter mode change
2778  *		Set filter mode retransmit counter; set retransmit allow or
2779  *		block list to new source list as appropriate, and clear the
2780  *		retransmit list that was not set; send TO_IN or TO_EX with
2781  *		new source list.
2782  *	* The current state change is a source list change, but the filter
2783  *	  mode retransmit counter is > 0
2784  *		Decrement filter mode retransmit counter; set retransmit
2785  *		allow or block list to  new source list as appropriate,
2786  *		and clear the retransmit list that was not set; send TO_IN
2787  *		or TO_EX with new source list.
2788  *	* The current state change is a source list change, and the filter
2789  *	  mode retransmit counter is 0.
2790  *		Merge existing rtx allow and block lists with new state:
2791  *		  rtx_allow = (new allow + rtx_allow) - new block
2792  *		  rtx_block = (new block + rtx_block) - new allow
2793  *		Send ALLOW and BLOCK records for new retransmit lists;
2794  *		decrement retransmit counter.
2795  *
2796  * As is the case for mcast_init_rtx(), memory allocation failures are
2797  * acceptable; we just create as much state as we can.
2798  */
2799 static mrec_t *
2800 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2801 {
2802 	ill_t *ill;
2803 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2804 	mcast_record_t txtype;
2805 	mrec_t *rp, *rpnext, *rtnmrec;
2806 	boolean_t ovf;
2807 
2808 	ill = ilm->ilm_ill;
2809 
2810 	if (mreclist == NULL)
2811 		return (mreclist);
2812 
2813 	/*
2814 	 * A filter mode change is indicated by a single mrec, which is
2815 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2816 	 * retransmit state as if this were an initial join.  There is
2817 	 * no change to the mrec list.
2818 	 */
2819 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2820 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2821 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2822 		    &mreclist->mrec_srcs);
2823 		return (mreclist);
2824 	}
2825 
2826 	/*
2827 	 * Only the source list has changed
2828 	 */
2829 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2830 	if (rtxp->rtx_fmode_cnt > 0) {
2831 		/* but we're still sending filter mode change reports */
2832 		rtxp->rtx_fmode_cnt--;
2833 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
2834 			CLEAR_SLIST(rtxp->rtx_block);
2835 			COPY_SLIST(flist, rtxp->rtx_allow);
2836 			txtype = CHANGE_TO_INCLUDE;
2837 		} else {
2838 			CLEAR_SLIST(rtxp->rtx_allow);
2839 			COPY_SLIST(flist, rtxp->rtx_block);
2840 			txtype = CHANGE_TO_EXCLUDE;
2841 		}
2842 		/* overwrite first mrec with new info */
2843 		mreclist->mrec_type = txtype;
2844 		l_copy(flist, &mreclist->mrec_srcs);
2845 		/* then free any remaining mrecs */
2846 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
2847 			rpnext = rp->mrec_next;
2848 			mi_free(rp);
2849 		}
2850 		mreclist->mrec_next = NULL;
2851 		rtnmrec = mreclist;
2852 	} else {
2853 		mrec_t *allow_mrec, *block_mrec;
2854 		/*
2855 		 * Just send the source change reports; but we need to
2856 		 * recalculate the ALLOW and BLOCK lists based on previous
2857 		 * state and new changes.
2858 		 */
2859 		rtnmrec = mreclist;
2860 		allow_mrec = block_mrec = NULL;
2861 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
2862 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
2863 			    rp->mrec_type == BLOCK_OLD_SOURCES);
2864 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
2865 				allow_mrec = rp;
2866 			else
2867 				block_mrec = rp;
2868 		}
2869 		/*
2870 		 * Perform calculations:
2871 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
2872 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
2873 		 *
2874 		 * Each calc requires two steps, for example:
2875 		 *   rtx_allow = rtx_allow - mrec_block;
2876 		 *   new_allow = mrec_allow + rtx_allow;
2877 		 *
2878 		 * Store results in mrec lists, and then copy into rtx lists.
2879 		 * We do it in this order in case the rtx list hasn't been
2880 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
2881 		 * Overflows are also okay.
2882 		 */
2883 		if (block_mrec != NULL) {
2884 			l_difference_in_a(rtxp->rtx_allow,
2885 			    &block_mrec->mrec_srcs);
2886 		}
2887 		if (allow_mrec != NULL) {
2888 			l_difference_in_a(rtxp->rtx_block,
2889 			    &allow_mrec->mrec_srcs);
2890 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
2891 			    &ovf);
2892 		}
2893 		if (block_mrec != NULL) {
2894 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
2895 			    &ovf);
2896 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
2897 		} else {
2898 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
2899 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
2900 		}
2901 		if (allow_mrec != NULL) {
2902 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
2903 		} else {
2904 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
2905 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
2906 		}
2907 	}
2908 
2909 	return (rtnmrec);
2910 }
2911