xref: /titanic_50/usr/src/uts/common/inet/ip/igmp.c (revision 538aa54d819fa7751ca82bcc30d4ed8c57ec2ef2)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright (c) 1991, 2010, Oracle and/or its affiliates. All rights reserved.
23  */
24 /* Copyright (c) 1990 Mentat Inc. */
25 
26 /*
27  * Internet Group Management Protocol (IGMP) routines.
28  * Multicast Listener Discovery Protocol (MLD) routines.
29  *
30  * Written by Steve Deering, Stanford, May 1988.
31  * Modified by Rosen Sharma, Stanford, Aug 1994.
32  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
33  *
34  * MULTICAST 3.5.1.1
35  */
36 
37 #include <sys/types.h>
38 #include <sys/stream.h>
39 #include <sys/stropts.h>
40 #include <sys/strlog.h>
41 #include <sys/strsun.h>
42 #include <sys/systm.h>
43 #include <sys/ddi.h>
44 #include <sys/sunddi.h>
45 #include <sys/cmn_err.h>
46 #include <sys/atomic.h>
47 #include <sys/zone.h>
48 #include <sys/callb.h>
49 #include <sys/param.h>
50 #include <sys/socket.h>
51 #include <inet/ipclassifier.h>
52 #include <net/if.h>
53 #include <net/route.h>
54 #include <netinet/in.h>
55 #include <netinet/igmp_var.h>
56 #include <netinet/ip6.h>
57 #include <netinet/icmp6.h>
58 #include <inet/ipsec_impl.h>
59 
60 #include <inet/common.h>
61 #include <inet/mi.h>
62 #include <inet/nd.h>
63 #include <inet/tunables.h>
64 #include <inet/ip.h>
65 #include <inet/ip6.h>
66 #include <inet/ip_multi.h>
67 #include <inet/ip_listutils.h>
68 
69 #include <netinet/igmp.h>
70 #include <inet/ip_ndp.h>
71 #include <inet/ip_if.h>
72 
/*
 * Forward declarations for the static (file-local) functions used in
 * this file.
 */
static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
static void	igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
		    slist_t *srclist, mrec_t *next);
static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
		    mcast_record_t rtype, slist_t *flist);
static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
86 
87 /*
88  * Macros used to do timer len conversions.  Timer values are always
89  * stored and passed to the timer functions as milliseconds; but the
90  * default values and values from the wire may not be.
91  *
92  * And yes, it's obscure, but decisecond is easier to abbreviate than
93  * "tenths of a second".
94  */
#define	DSEC_TO_MSEC(dsec)	((dsec) * 100)	/* tenths of a second -> ms */
#define	SEC_TO_MSEC(sec)	((sec) * 1000)	/* seconds -> ms */
97 
98 /*
99  * A running timer (scheduled thru timeout) can be cancelled if another
100  * timer with a shorter timeout value is scheduled before it has timed
101  * out.  When the shorter timer expires, the original timer is updated
102  * to account for the time elapsed while the shorter timer ran; but this
103  * does not take into account the amount of time already spent in timeout
104  * state before being preempted by the shorter timer, that is the time
105  * interval between time scheduled to time cancelled.  This can cause
106  * delays in sending out multicast membership reports.  To resolve this
107  * problem, wallclock time (absolute time) is used instead of deltas
108  * (relative time) to track timers.
109  *
 * The macro below reads the lbolt value, which is used for proper timer
 * scheduling and firing, so that multicast membership reports are sent on
 * time.  The timer does not fire at exactly the time it was scheduled to
 * fire; a difference of a few milliseconds is observed.  An offset is used
 * to take care of the difference.
115  */
116 
/* Current lbolt tick count converted to milliseconds, truncated to uint_t. */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
/*
 * NOTE(review): presumably the firing-slack offset, in milliseconds,
 * mentioned in the comment above -- confirm the unit at the use site.
 */
#define	CURRENT_OFFSET	(999)
119 
120 /*
121  * The first multicast join will trigger the igmp timers / mld timers
122  * The unit for next is milliseconds.
123  */
124 void
125 igmp_start_timers(unsigned next, ip_stack_t *ipst)
126 {
127 	int	time_left;
128 	int	ret;
129 	timeout_id_t id;
130 
131 	ASSERT(next != 0 && next != INFINITY);
132 
133 	mutex_enter(&ipst->ips_igmp_timer_lock);
134 
135 	if (ipst->ips_igmp_timer_setter_active) {
136 		/*
137 		 * Serialize timer setters, one at a time. If the
138 		 * timer is currently being set by someone,
139 		 * just record the next time when it has to be
140 		 * invoked and return. The current setter will
141 		 * take care.
142 		 */
143 		ipst->ips_igmp_time_to_next =
144 		    MIN(ipst->ips_igmp_time_to_next, next);
145 		mutex_exit(&ipst->ips_igmp_timer_lock);
146 		return;
147 	} else {
148 		ipst->ips_igmp_timer_setter_active = B_TRUE;
149 	}
150 	if (ipst->ips_igmp_timeout_id == 0) {
151 		/*
152 		 * The timer is inactive. We need to start a timer
153 		 */
154 		ipst->ips_igmp_time_to_next = next;
155 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
156 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
157 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
158 		ipst->ips_igmp_timer_setter_active = B_FALSE;
159 		mutex_exit(&ipst->ips_igmp_timer_lock);
160 		return;
161 	}
162 
163 	/*
164 	 * The timer was scheduled sometime back for firing in
165 	 * 'igmp_time_to_next' ms and is active. We need to
166 	 * reschedule the timeout if the new 'next' will happen
167 	 * earlier than the currently scheduled timeout
168 	 */
169 	time_left = ipst->ips_igmp_timer_scheduled_last +
170 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
171 	if (time_left < MSEC_TO_TICK(next)) {
172 		ipst->ips_igmp_timer_setter_active = B_FALSE;
173 		mutex_exit(&ipst->ips_igmp_timer_lock);
174 		return;
175 	}
176 	id = ipst->ips_igmp_timeout_id;
177 
178 	mutex_exit(&ipst->ips_igmp_timer_lock);
179 	ret = untimeout(id);
180 	mutex_enter(&ipst->ips_igmp_timer_lock);
181 	/*
182 	 * The timeout was cancelled, or the timeout handler
183 	 * completed, while we were blocked in the untimeout.
184 	 * No other thread could have set the timer meanwhile
185 	 * since we serialized all the timer setters. Thus
186 	 * no timer is currently active nor executing nor will
187 	 * any timer fire in the future. We start the timer now
188 	 * if needed.
189 	 */
190 	if (ret == -1) {
191 		ASSERT(ipst->ips_igmp_timeout_id == 0);
192 	} else {
193 		ASSERT(ipst->ips_igmp_timeout_id != 0);
194 		ipst->ips_igmp_timeout_id = 0;
195 	}
196 	if (ipst->ips_igmp_time_to_next != 0) {
197 		ipst->ips_igmp_time_to_next =
198 		    MIN(ipst->ips_igmp_time_to_next, next);
199 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
200 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
201 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
202 	}
203 	ipst->ips_igmp_timer_setter_active = B_FALSE;
204 	mutex_exit(&ipst->ips_igmp_timer_lock);
205 }
206 
207 /*
208  * mld_start_timers:
209  * The unit for next is milliseconds.
210  */
211 void
212 mld_start_timers(unsigned next, ip_stack_t *ipst)
213 {
214 	int	time_left;
215 	int	ret;
216 	timeout_id_t id;
217 
218 	ASSERT(next != 0 && next != INFINITY);
219 
220 	mutex_enter(&ipst->ips_mld_timer_lock);
221 	if (ipst->ips_mld_timer_setter_active) {
222 		/*
223 		 * Serialize timer setters, one at a time. If the
224 		 * timer is currently being set by someone,
225 		 * just record the next time when it has to be
226 		 * invoked and return. The current setter will
227 		 * take care.
228 		 */
229 		ipst->ips_mld_time_to_next =
230 		    MIN(ipst->ips_mld_time_to_next, next);
231 		mutex_exit(&ipst->ips_mld_timer_lock);
232 		return;
233 	} else {
234 		ipst->ips_mld_timer_setter_active = B_TRUE;
235 	}
236 	if (ipst->ips_mld_timeout_id == 0) {
237 		/*
238 		 * The timer is inactive. We need to start a timer
239 		 */
240 		ipst->ips_mld_time_to_next = next;
241 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
242 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
243 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
244 		ipst->ips_mld_timer_setter_active = B_FALSE;
245 		mutex_exit(&ipst->ips_mld_timer_lock);
246 		return;
247 	}
248 
249 	/*
250 	 * The timer was scheduled sometime back for firing in
251 	 * 'igmp_time_to_next' ms and is active. We need to
252 	 * reschedule the timeout if the new 'next' will happen
253 	 * earlier than the currently scheduled timeout
254 	 */
255 	time_left = ipst->ips_mld_timer_scheduled_last +
256 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
257 	if (time_left < MSEC_TO_TICK(next)) {
258 		ipst->ips_mld_timer_setter_active = B_FALSE;
259 		mutex_exit(&ipst->ips_mld_timer_lock);
260 		return;
261 	}
262 	id = ipst->ips_mld_timeout_id;
263 
264 	mutex_exit(&ipst->ips_mld_timer_lock);
265 	ret = untimeout(id);
266 	mutex_enter(&ipst->ips_mld_timer_lock);
267 	/*
268 	 * The timeout was cancelled, or the timeout handler
269 	 * completed, while we were blocked in the untimeout.
270 	 * No other thread could have set the timer meanwhile
271 	 * since we serialized all the timer setters. Thus
272 	 * no timer is currently active nor executing nor will
273 	 * any timer fire in the future. We start the timer now
274 	 * if needed.
275 	 */
276 	if (ret == -1) {
277 		ASSERT(ipst->ips_mld_timeout_id == 0);
278 	} else {
279 		ASSERT(ipst->ips_mld_timeout_id != 0);
280 		ipst->ips_mld_timeout_id = 0;
281 	}
282 	if (ipst->ips_mld_time_to_next != 0) {
283 		ipst->ips_mld_time_to_next =
284 		    MIN(ipst->ips_mld_time_to_next, next);
285 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
286 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
287 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
288 	}
289 	ipst->ips_mld_timer_setter_active = B_FALSE;
290 	mutex_exit(&ipst->ips_mld_timer_lock);
291 }
292 
293 /*
294  * igmp_input:
295  * Return NULL for a bad packet that is discarded here.
296  * Return mp if the message is OK and should be handed to "raw" receivers.
297  * Callers of igmp_input() may need to reinitialize variables that were copied
298  * from the mblk as this calls pullupmsg().
299  */
300 mblk_t *
301 igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
302 {
303 	igmpa_t 	*igmpa;
304 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
305 	int		iphlen, igmplen, mblklen;
306 	ilm_t 		*ilm;
307 	uint32_t	src, dst;
308 	uint32_t 	group;
309 	in6_addr_t	v6group;
310 	uint_t		next;
311 	ipif_t 		*ipif;
312 	ill_t		*ill = ira->ira_ill;
313 	ip_stack_t	*ipst = ill->ill_ipst;
314 
315 	ASSERT(!ill->ill_isv6);
316 	++ipst->ips_igmpstat.igps_rcv_total;
317 
318 	mblklen = MBLKL(mp);
319 	iphlen = ira->ira_ip_hdr_length;
320 	if (mblklen < 1 || mblklen < iphlen) {
321 		++ipst->ips_igmpstat.igps_rcv_tooshort;
322 		goto bad_pkt;
323 	}
324 	igmplen = ira->ira_pktlen - iphlen;
325 	/*
326 	 * Since msg sizes are more variable with v3, just pullup the
327 	 * whole thing now.
328 	 */
329 	if (MBLKL(mp) < (igmplen + iphlen)) {
330 		mblk_t *mp1;
331 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
332 			++ipst->ips_igmpstat.igps_rcv_tooshort;
333 			goto bad_pkt;
334 		}
335 		freemsg(mp);
336 		mp = mp1;
337 		ipha = (ipha_t *)(mp->b_rptr);
338 	}
339 
340 	/*
341 	 * Validate lengths
342 	 */
343 	if (igmplen < IGMP_MINLEN) {
344 		++ipst->ips_igmpstat.igps_rcv_tooshort;
345 		goto bad_pkt;
346 	}
347 
348 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
349 	src = ipha->ipha_src;
350 	dst = ipha->ipha_dst;
351 	if (ip_debug > 1)
352 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
353 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
354 		    (int)ntohl(src), (int)ntohl(dst),
355 		    ill->ill_name);
356 
357 	switch (igmpa->igmpa_type) {
358 	case IGMP_MEMBERSHIP_QUERY:
359 		/*
360 		 * packet length differentiates between v1/v2 and v3
361 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
362 		 */
363 		if ((igmplen == IGMP_MINLEN) ||
364 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
365 			next = igmp_query_in(ipha, igmpa, ill);
366 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
367 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
368 			    igmplen);
369 		} else {
370 			++ipst->ips_igmpstat.igps_rcv_tooshort;
371 			goto bad_pkt;
372 		}
373 		if (next == 0)
374 			goto bad_pkt;
375 
376 		if (next != INFINITY)
377 			igmp_start_timers(next, ipst);
378 
379 		break;
380 
381 	case IGMP_V1_MEMBERSHIP_REPORT:
382 	case IGMP_V2_MEMBERSHIP_REPORT:
383 		/*
384 		 * For fast leave to work, we have to know that we are the
385 		 * last person to send a report for this group. Reports
386 		 * generated by us are looped back since we could potentially
387 		 * be a multicast router, so discard reports sourced by me.
388 		 */
389 		mutex_enter(&ill->ill_lock);
390 		for (ipif = ill->ill_ipif; ipif != NULL;
391 		    ipif = ipif->ipif_next) {
392 			if (ipif->ipif_lcl_addr == src) {
393 				if (ip_debug > 1) {
394 					(void) mi_strlog(ill->ill_rq,
395 					    1,
396 					    SL_TRACE,
397 					    "igmp_input: we are only "
398 					    "member src 0x%x\n",
399 					    (int)ntohl(src));
400 				}
401 				mutex_exit(&ill->ill_lock);
402 				return (mp);
403 			}
404 		}
405 		mutex_exit(&ill->ill_lock);
406 
407 		++ipst->ips_igmpstat.igps_rcv_reports;
408 		group = igmpa->igmpa_group;
409 		if (!CLASSD(group)) {
410 			++ipst->ips_igmpstat.igps_rcv_badreports;
411 			goto bad_pkt;
412 		}
413 
414 		/*
415 		 * KLUDGE: if the IP source address of the report has an
416 		 * unspecified (i.e., zero) subnet number, as is allowed for
417 		 * a booting host, replace it with the correct subnet number
418 		 * so that a process-level multicast routing demon can
419 		 * determine which subnet it arrived from.  This is necessary
420 		 * to compensate for the lack of any way for a process to
421 		 * determine the arrival interface of an incoming packet.
422 		 *
423 		 * Requires that a copy of *this* message it passed up
424 		 * to the raw interface which is done by our caller.
425 		 */
426 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
427 			/* Pick the first ipif on this ill */
428 			mutex_enter(&ill->ill_lock);
429 			src = ill->ill_ipif->ipif_subnet;
430 			mutex_exit(&ill->ill_lock);
431 			ip1dbg(("igmp_input: changed src to 0x%x\n",
432 			    (int)ntohl(src)));
433 			ipha->ipha_src = src;
434 		}
435 
436 		/*
437 		 * If our ill has ILMs that belong to the group being
438 		 * reported, and we are a 'Delaying Member' in the RFC
439 		 * terminology, stop our timer for that group and 'clear
440 		 * flag' i.e. mark as IGMP_OTHERMEMBER.
441 		 */
442 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
443 		IN6_IPADDR_TO_V4MAPPED(group, &v6group);
444 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
445 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
446 				continue;
447 
448 			++ipst->ips_igmpstat.igps_rcv_ourreports;
449 			ilm->ilm_timer = INFINITY;
450 			ilm->ilm_state = IGMP_OTHERMEMBER;
451 		} /* for */
452 		rw_exit(&ill->ill_mcast_lock);
453 		ill_mcast_timer_start(ill->ill_ipst);
454 		break;
455 
456 	case IGMP_V3_MEMBERSHIP_REPORT:
457 		/*
458 		 * Currently nothing to do here; IGMP router is not
459 		 * implemented in ip, and v3 hosts don't pay attention
460 		 * to membership reports.
461 		 */
462 		break;
463 	}
464 	/*
465 	 * Pass all valid IGMP packets up to any process(es) listening
466 	 * on a raw IGMP socket. Do not free the packet.
467 	 */
468 	return (mp);
469 
470 bad_pkt:
471 	freemsg(mp);
472 	return (NULL);
473 }
474 
475 static uint_t
476 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
477 {
478 	ilm_t	*ilm;
479 	int	timer;
480 	uint_t	next, current;
481 	ip_stack_t	 *ipst;
482 
483 	ipst = ill->ill_ipst;
484 	++ipst->ips_igmpstat.igps_rcv_queries;
485 
486 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
487 	/*
488 	 * In the IGMPv2 specification, there are 3 states and a flag.
489 	 *
490 	 * In Non-Member state, we simply don't have a membership record.
491 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
492 	 * < INFINITY).  In Idle Member state, our timer is not running
493 	 * (ilm->ilm_timer == INFINITY).
494 	 *
495 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
496 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
497 	 * if I sent the last report.
498 	 */
499 	if ((igmpa->igmpa_code == 0) ||
500 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
501 		/*
502 		 * Query from an old router.
503 		 * Remember that the querier on this interface is old,
504 		 * and set the timer to the value in RFC 1112.
505 		 */
506 		ill->ill_mcast_v1_time = 0;
507 		ill->ill_mcast_v1_tset = 1;
508 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
509 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
510 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
511 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
512 			ill->ill_mcast_type = IGMP_V1_ROUTER;
513 		}
514 
515 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
516 
517 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
518 		    igmpa->igmpa_group != 0) {
519 			++ipst->ips_igmpstat.igps_rcv_badqueries;
520 			rw_exit(&ill->ill_mcast_lock);
521 			ill_mcast_timer_start(ill->ill_ipst);
522 			return (0);
523 		}
524 
525 	} else {
526 		in_addr_t group;
527 
528 		/*
529 		 * Query from a new router
530 		 * Simply do a validity check
531 		 */
532 		group = igmpa->igmpa_group;
533 		if (group != 0 && (!CLASSD(group))) {
534 			++ipst->ips_igmpstat.igps_rcv_badqueries;
535 			rw_exit(&ill->ill_mcast_lock);
536 			ill_mcast_timer_start(ill->ill_ipst);
537 			return (0);
538 		}
539 
540 		/*
541 		 * Switch interface state to v2 on receipt of a v2 query
542 		 * ONLY IF current state is v3.  Let things be if current
543 		 * state if v1 but do reset the v2-querier-present timer.
544 		 */
545 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
546 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
547 			    "to IGMP_V2_ROUTER", ill->ill_name));
548 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
549 			ill->ill_mcast_type = IGMP_V2_ROUTER;
550 		}
551 		ill->ill_mcast_v2_time = 0;
552 		ill->ill_mcast_v2_tset = 1;
553 
554 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
555 	}
556 
557 	if (ip_debug > 1) {
558 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
559 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
560 		    (int)ntohs(igmpa->igmpa_code),
561 		    (int)ntohs(igmpa->igmpa_type));
562 	}
563 
564 	/*
565 	 * -Start the timers in all of our membership records
566 	 *  for the physical interface on which the query
567 	 *  arrived, excluding those that belong to the "all
568 	 *  hosts" group (224.0.0.1).
569 	 *
570 	 * -Restart any timer that is already running but has
571 	 *  a value longer than the requested timeout.
572 	 *
573 	 * -Use the value specified in the query message as
574 	 *  the maximum timeout.
575 	 */
576 	next = (unsigned)INFINITY;
577 
578 	current = CURRENT_MSTIME;
579 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
580 
581 		/*
582 		 * A multicast router joins INADDR_ANY address
583 		 * to enable promiscuous reception of all
584 		 * mcasts from the interface. This INADDR_ANY
585 		 * is stored in the ilm_v6addr as V6 unspec addr
586 		 */
587 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
588 			continue;
589 		if (ilm->ilm_addr == htonl(INADDR_ANY))
590 			continue;
591 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
592 		    (igmpa->igmpa_group == 0) ||
593 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
594 			if (ilm->ilm_timer > timer) {
595 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
596 				if (ilm->ilm_timer < next)
597 					next = ilm->ilm_timer;
598 				ilm->ilm_timer += current;
599 			}
600 		}
601 	}
602 	rw_exit(&ill->ill_mcast_lock);
603 	/*
604 	 * No packets have been sent above - no
605 	 * ill_mcast_send_queued is needed.
606 	 */
607 	ill_mcast_timer_start(ill->ill_ipst);
608 
609 	return (next);
610 }
611 
/*
 * igmpv3_query_in:
 * Process an IGMPv3 membership query received on 'ill'.
 *
 * igmp3qa - start of the IGMPv3 query; the source address array follows it
 * ill     - interface the query arrived on
 * igmplen - length of the IGMP message in bytes
 *
 * Returns 0 if the message is too short for the number of sources it
 * claims to carry.  Otherwise returns the delay, in milliseconds, until
 * the soonest response scheduled here, or INFINITY if nothing was
 * scheduled.
 *
 * Takes ill_mcast_lock as writer while updating state and calls
 * ill_mcast_timer_start() after dropping it.
 */
static uint_t
igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
{
	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
	uint_t		current;
	ilm_t		*ilm;
	ipaddr_t	*src_array;
	uint8_t		qrv;
	ip_stack_t	 *ipst;

	ipst = ill->ill_ipst;
	/* make sure numsrc matches packet size */
	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
		++ipst->ips_igmpstat.igps_rcv_tooshort;
		return (0);
	}
	/* The source addresses immediately follow the fixed query header. */
	src_array = (ipaddr_t *)&igmp3qa[1];

	++ipst->ips_igmpstat.igps_rcv_queries;

	rw_enter(&ill->ill_mcast_lock, RW_WRITER);

	/*
	 * Max response code: values below IGMP_V3_MAXRT_FPMIN are used as
	 * is; larger values are decoded as a mantissa/exponent pair.
	 */
	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
		mrd = (mant | 0x10) << (exp + 3);
	}
	if (mrd == 0)
		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
	/* mrd is in tenths of a second; pick a random delay up to it. */
	timer = DSEC_TO_MSEC(mrd);
	MCAST_RANDOM_DELAY(delay, timer);
	next = (unsigned)INFINITY;
	current = CURRENT_MSTIME;

	/* Adopt the querier's robustness variable, or the default if 0. */
	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
	else
		ill->ill_mcast_rv = qrv;

	/*
	 * Querier's query interval code, decoded the same way as the max
	 * response code above; 0 selects the default interval.
	 */
	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
		uint_t hdrval, mant, exp;
		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
		qqi = (mant | 0x10) << (exp + 3);
	}
	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;

	/*
	 * If we have a pending general query response that's scheduled
	 * sooner than the delay we calculated for this response, then
	 * no action is required (RFC3376 section 5.2 rule 1)
	 */
	if (ill->ill_global_timer < (current + delay)) {
		rw_exit(&ill->ill_mcast_lock);
		ill_mcast_timer_start(ill->ill_ipst);
		/* 'next' is still INFINITY here: nothing new to schedule. */
		return (next);
	}

	/*
	 * Now take action depending upon query type:
	 * general, group specific, or group/source specific.
	 */
	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
		/*
		 * general query
		 * We know global timer is either not running or is
		 * greater than our calculated delay, so reset it to
		 * our delay (random value in range [0, response time]).
		 */
		ill->ill_global_timer =  current + delay;
		next = delay;
	} else {
		/* group or group/source specific query */
		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
			/*
			 * Skip records that are not IPv4, the INADDR_ANY
			 * and all-hosts records, and any group other than
			 * the one being queried.
			 */
			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
				continue;
			/*
			 * If the query is group specific or we have a
			 * pending group specific query, the response is
			 * group specific (pending sources list should be
			 * empty).  Otherwise, need to update the pending
			 * sources list for the group and source specific
			 * response.
			 */
			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
				/*
				 * Also the fallback target for the else
				 * branch below: drop any pending source
				 * list so the response is group specific.
				 */
group_query:
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
			} else {
				boolean_t overflow;
				slist_t *pktl;
				if (numsrc > MAX_FILTER_SIZE ||
				    (ilm->ilm_pendsrcs == NULL &&
				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
					/*
					 * We've been sent more sources than
					 * we can deal with; or we can't deal
					 * with a source list at all.  Revert
					 * to a group specific query.
					 */
					goto group_query;
				}
				if ((pktl = l_alloc()) == NULL)
					goto group_query;
				/*
				 * Copy the queried sources into a temporary
				 * list as v4-mapped addresses and merge it
				 * into the pending sources list.
				 */
				pktl->sl_numsrc = numsrc;
				for (i = 0; i < numsrc; i++)
					IN6_IPADDR_TO_V4MAPPED(src_array[i],
					    &(pktl->sl_addr[i]));
				l_union_in_a(ilm->ilm_pendsrcs, pktl,
				    &overflow);
				l_free(pktl);
				if (overflow)
					goto group_query;
			}

			/*
			 * ilm_timer is kept as an absolute time; convert
			 * it to a relative value for the comparison and
			 * back to absolute afterwards.
			 */
			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
			    INFINITY : (ilm->ilm_timer - current);
			/* choose soonest timer */
			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
			if (ilm->ilm_timer < next)
				next = ilm->ilm_timer;
			ilm->ilm_timer += current;
		}
	}
	rw_exit(&ill->ill_mcast_lock);
	/*
	 * No packets have been sent above - no
	 * ill_mcast_send_queued is needed.
	 */
	ill_mcast_timer_start(ill->ill_ipst);

	return (next);
}
753 
754 /*
755  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
756  * and it gets sent after the lock is dropped.
757  */
758 void
759 igmp_joingroup(ilm_t *ilm)
760 {
761 	uint_t	timer;
762 	ill_t	*ill;
763 	ip_stack_t	*ipst = ilm->ilm_ipst;
764 
765 	ill = ilm->ilm_ill;
766 
767 	ASSERT(!ill->ill_isv6);
768 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
769 
770 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
771 		ilm->ilm_rtx.rtx_timer = INFINITY;
772 		ilm->ilm_state = IGMP_OTHERMEMBER;
773 	} else {
774 		ip1dbg(("Querier mode %d, sending report, group %x\n",
775 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
776 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
777 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
778 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
779 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
780 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
781 			mrec_t *rp;
782 			mcast_record_t rtype;
783 			/*
784 			 * The possible state changes we need to handle here:
785 			 *   Old State	New State	Report
786 			 *
787 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
788 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
789 			 *
790 			 * No need to send the BLOCK(0) report; ALLOW(X)
791 			 * is enough.
792 			 */
793 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
794 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
795 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
796 			    ilm->ilm_filter, NULL);
797 			igmpv3_sendrpt(ill, rp);
798 			/*
799 			 * Set up retransmission state.  Timer is set below,
800 			 * for both v3 and older versions.
801 			 */
802 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
803 			    ilm->ilm_filter);
804 		}
805 
806 		/* Set the ilm timer value */
807 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
808 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
809 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
810 		timer = ilm->ilm_rtx.rtx_timer;
811 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
812 		ilm->ilm_state = IGMP_IREPORTEDLAST;
813 
814 		/*
815 		 * We are holding ill_mcast_lock here and the timeout
816 		 * handler (igmp_timeout_handler_per_ill) acquires that
817 		 * lock. Hence we can't call igmp_start_timers since it could
818 		 * deadlock in untimeout().
819 		 * Instead the thread which drops ill_mcast_lock will have
820 		 * to call ill_mcast_timer_start().
821 		 */
822 		mutex_enter(&ipst->ips_igmp_timer_lock);
823 		ipst->ips_igmp_deferred_next = MIN(timer,
824 		    ipst->ips_igmp_deferred_next);
825 		mutex_exit(&ipst->ips_igmp_timer_lock);
826 	}
827 
828 	if (ip_debug > 1) {
829 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
830 		    "igmp_joingroup: multicast_type %d timer %d",
831 		    (ilm->ilm_ill->ill_mcast_type),
832 		    (int)ntohl(timer));
833 	}
834 }
835 
836 /*
837  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
838  * and it gets sent after the lock is dropped.
839  */
840 void
841 mld_joingroup(ilm_t *ilm)
842 {
843 	uint_t	timer;
844 	ill_t	*ill;
845 	ip_stack_t	*ipst = ilm->ilm_ipst;
846 
847 	ill = ilm->ilm_ill;
848 
849 	ASSERT(ill->ill_isv6);
850 
851 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
852 
853 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
854 		ilm->ilm_rtx.rtx_timer = INFINITY;
855 		ilm->ilm_state = IGMP_OTHERMEMBER;
856 	} else {
857 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
858 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
859 		} else {
860 			mrec_t *rp;
861 			mcast_record_t rtype;
862 			/*
863 			 * The possible state changes we need to handle here:
864 			 *	Old State   New State	Report
865 			 *
866 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
867 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
868 			 *
869 			 * No need to send the BLOCK(0) report; ALLOW(X)
870 			 * is enough
871 			 */
872 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
873 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
874 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
875 			    ilm->ilm_filter, NULL);
876 			mldv2_sendrpt(ill, rp);
877 			/*
878 			 * Set up retransmission state.  Timer is set below,
879 			 * for both v2 and v1.
880 			 */
881 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
882 			    ilm->ilm_filter);
883 		}
884 
885 		/* Set the ilm timer value */
886 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
887 		    ilm->ilm_rtx.rtx_cnt > 0);
888 
889 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
890 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
891 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
892 		timer = ilm->ilm_rtx.rtx_timer;
893 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
894 		ilm->ilm_state = IGMP_IREPORTEDLAST;
895 
896 		/*
897 		 * We are holding ill_mcast_lock here and the timeout
898 		 * handler (mld_timeout_handler_per_ill) acquires that
899 		 * lock. Hence we can't call mld_start_timers since it could
900 		 * deadlock in untimeout().
901 		 * Instead the thread which drops ill_mcast_lock will have
902 		 * to call ill_mcast_timer_start().
903 		 */
904 		mutex_enter(&ipst->ips_mld_timer_lock);
905 		ipst->ips_mld_deferred_next = MIN(timer,
906 		    ipst->ips_mld_deferred_next);
907 		mutex_exit(&ipst->ips_mld_timer_lock);
908 	}
909 
910 	if (ip_debug > 1) {
911 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
912 		    "mld_joingroup: multicast_type %d timer %d",
913 		    (ilm->ilm_ill->ill_mcast_type),
914 		    (int)ntohl(timer));
915 	}
916 }
917 
918 /*
919  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
920  * and it gets sent after the lock is dropped.
921  */
922 void
923 igmp_leavegroup(ilm_t *ilm)
924 {
925 	ill_t *ill = ilm->ilm_ill;
926 
927 	ASSERT(!ill->ill_isv6);
928 
929 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
930 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
931 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
932 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
933 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
934 		    (htonl(INADDR_ALLRTRS_GROUP)));
935 		return;
936 	}
937 	if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
938 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
939 		mrec_t *rp;
940 		/*
941 		 * The possible state changes we need to handle here:
942 		 *	Old State	New State	Report
943 		 *
944 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
945 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
946 		 *
947 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
948 		 */
949 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
950 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
951 			    ilm->ilm_filter, NULL);
952 		} else {
953 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
954 			    NULL, NULL);
955 		}
956 		igmpv3_sendrpt(ill, rp);
957 		return;
958 	}
959 }
960 
961 /*
962  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
963  * and it gets sent after the lock is dropped.
964  */
965 void
966 mld_leavegroup(ilm_t *ilm)
967 {
968 	ill_t *ill = ilm->ilm_ill;
969 
970 	ASSERT(ill->ill_isv6);
971 
972 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
973 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
974 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
975 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
976 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
977 		return;
978 	}
979 	if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
980 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
981 		mrec_t *rp;
982 		/*
983 		 * The possible state changes we need to handle here:
984 		 *	Old State	New State	Report
985 		 *
986 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
987 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
988 		 *
989 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
990 		 */
991 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
992 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
993 			    ilm->ilm_filter, NULL);
994 		} else {
995 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
996 			    NULL, NULL);
997 		}
998 		mldv2_sendrpt(ill, rp);
999 		return;
1000 	}
1001 }
1002 
1003 /*
1004  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1005  * and it gets sent after the lock is dropped.
1006  */
1007 void
1008 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1009 {
1010 	ill_t *ill;
1011 	mrec_t *rp;
1012 	ip_stack_t	*ipst = ilm->ilm_ipst;
1013 
1014 	ASSERT(ilm != NULL);
1015 
1016 	/* state change reports should only be sent if the router is v3 */
1017 	if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
1018 		return;
1019 
1020 	ill = ilm->ilm_ill;
1021 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1022 
1023 	/*
1024 	 * Compare existing(old) state with the new state and prepare
1025 	 * State Change Report, according to the rules in RFC 3376:
1026 	 *
1027 	 *	Old State	New State	State Change Report
1028 	 *
1029 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1030 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1031 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1032 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1033 	 */
1034 
1035 	if (ilm->ilm_fmode == fmode) {
1036 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1037 		slist_t *allow, *block;
1038 		if (((a_minus_b = l_alloc()) == NULL) ||
1039 		    ((b_minus_a = l_alloc()) == NULL)) {
1040 			l_free(a_minus_b);
1041 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1042 				goto send_to_ex;
1043 			else
1044 				goto send_to_in;
1045 		}
1046 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1047 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1048 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1049 			allow = b_minus_a;
1050 			block = a_minus_b;
1051 		} else {
1052 			allow = a_minus_b;
1053 			block = b_minus_a;
1054 		}
1055 		rp = NULL;
1056 		if (!SLIST_IS_EMPTY(allow))
1057 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1058 			    allow, rp);
1059 		if (!SLIST_IS_EMPTY(block))
1060 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1061 			    block, rp);
1062 		l_free(a_minus_b);
1063 		l_free(b_minus_a);
1064 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1065 send_to_ex:
1066 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1067 		    NULL);
1068 	} else {
1069 send_to_in:
1070 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1071 		    NULL);
1072 	}
1073 
1074 	/*
1075 	 * Need to set up retransmission state; merge the new info with the
1076 	 * current state (which may be null).  If the timer is not currently
1077 	 * running, the caller will start it when dropping ill_mcast_lock.
1078 	 */
1079 	rp = mcast_merge_rtx(ilm, rp, flist);
1080 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1081 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1082 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1083 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1084 		mutex_enter(&ipst->ips_igmp_timer_lock);
1085 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1086 		    ilm->ilm_rtx.rtx_timer);
1087 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1088 		mutex_exit(&ipst->ips_igmp_timer_lock);
1089 	}
1090 
1091 	igmpv3_sendrpt(ill, rp);
1092 }
1093 
1094 /*
1095  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1096  * and it gets sent after the lock is dropped.
1097  */
1098 void
1099 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1100 {
1101 	ill_t *ill;
1102 	mrec_t *rp = NULL;
1103 	ip_stack_t	*ipst = ilm->ilm_ipst;
1104 
1105 	ASSERT(ilm != NULL);
1106 
1107 	ill = ilm->ilm_ill;
1108 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1109 
1110 	/* only need to send if we have an mldv2-capable router */
1111 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1112 		return;
1113 	}
1114 
1115 	/*
1116 	 * Compare existing (old) state with the new state passed in
1117 	 * and send appropriate MLDv2 State Change Report.
1118 	 *
1119 	 *	Old State	New State	State Change Report
1120 	 *
1121 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1122 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1123 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1124 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1125 	 */
1126 	if (ilm->ilm_fmode == fmode) {
1127 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1128 		slist_t *allow, *block;
1129 		if (((a_minus_b = l_alloc()) == NULL) ||
1130 		    ((b_minus_a = l_alloc()) == NULL)) {
1131 			l_free(a_minus_b);
1132 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1133 				goto send_to_ex;
1134 			else
1135 				goto send_to_in;
1136 		}
1137 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1138 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1139 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1140 			allow = b_minus_a;
1141 			block = a_minus_b;
1142 		} else {
1143 			allow = a_minus_b;
1144 			block = b_minus_a;
1145 		}
1146 		if (!SLIST_IS_EMPTY(allow))
1147 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1148 			    allow, rp);
1149 		if (!SLIST_IS_EMPTY(block))
1150 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1151 			    block, rp);
1152 		l_free(a_minus_b);
1153 		l_free(b_minus_a);
1154 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1155 send_to_ex:
1156 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1157 		    NULL);
1158 	} else {
1159 send_to_in:
1160 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1161 		    NULL);
1162 	}
1163 
1164 	/*
1165 	 * Need to set up retransmission state; merge the new info with the
1166 	 * current state (which may be null).  If the timer is not currently
1167 	 * running, the caller will start it when dropping ill_mcast_lock.
1168 	 */
1169 	rp = mcast_merge_rtx(ilm, rp, flist);
1170 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1171 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1172 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1173 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1174 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1175 		mutex_enter(&ipst->ips_mld_timer_lock);
1176 		ipst->ips_mld_deferred_next =
1177 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1178 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1179 		mutex_exit(&ipst->ips_mld_timer_lock);
1180 	}
1181 
1182 	mldv2_sendrpt(ill, rp);
1183 }
1184 
1185 uint_t
1186 igmp_timeout_handler_per_ill(ill_t *ill)
1187 {
1188 	uint_t	next = INFINITY, current;
1189 	ilm_t	*ilm;
1190 	mrec_t	*rp = NULL;
1191 	mrec_t	*rtxrp = NULL;
1192 	rtx_state_t *rtxp;
1193 	mcast_record_t	rtype;
1194 
1195 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1196 
1197 	current = CURRENT_MSTIME;
1198 	/* First check the global timer on this interface */
1199 	if (ill->ill_global_timer == INFINITY)
1200 		goto per_ilm_timer;
1201 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1202 		ill->ill_global_timer = INFINITY;
1203 		/*
1204 		 * Send report for each group on this interface.
1205 		 * Since we just set the global timer (received a v3 general
1206 		 * query), need to skip the all hosts addr (224.0.0.1), per
1207 		 * RFC 3376 section 5.
1208 		 */
1209 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1210 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1211 				continue;
1212 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1213 			    ilm->ilm_filter, rp);
1214 			/*
1215 			 * Since we're sending a report on this group, okay
1216 			 * to delete pending group-specific timers.  Note
1217 			 * that group-specific retransmit timers still need
1218 			 * to be checked in the per_ilm_timer for-loop.
1219 			 */
1220 			ilm->ilm_timer = INFINITY;
1221 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1222 			FREE_SLIST(ilm->ilm_pendsrcs);
1223 			ilm->ilm_pendsrcs = NULL;
1224 		}
1225 		igmpv3_sendrpt(ill, rp);
1226 		rp = NULL;
1227 	} else {
1228 		if ((ill->ill_global_timer - current) < next)
1229 			next = ill->ill_global_timer - current;
1230 	}
1231 
1232 per_ilm_timer:
1233 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1234 		if (ilm->ilm_timer == INFINITY)
1235 			goto per_ilm_rtxtimer;
1236 
1237 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1238 			if ((ilm->ilm_timer - current) < next)
1239 				next = ilm->ilm_timer - current;
1240 
1241 			if (ip_debug > 1) {
1242 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1243 				    "igmp_timo_hlr 2: ilm_timr %d "
1244 				    "typ %d nxt %d",
1245 				    (int)ntohl(ilm->ilm_timer - current),
1246 				    (ill->ill_mcast_type), next);
1247 			}
1248 
1249 			goto per_ilm_rtxtimer;
1250 		}
1251 
1252 		/* the timer has expired, need to take action */
1253 		ilm->ilm_timer = INFINITY;
1254 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1255 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1256 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1257 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1258 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1259 		} else {
1260 			slist_t *rsp;
1261 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1262 			    (rsp = l_alloc()) != NULL) {
1263 				/*
1264 				 * Contents of reply depend on pending
1265 				 * requested source list.
1266 				 */
1267 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1268 					l_intersection(ilm->ilm_filter,
1269 					    ilm->ilm_pendsrcs, rsp);
1270 				} else {
1271 					l_difference(ilm->ilm_pendsrcs,
1272 					    ilm->ilm_filter, rsp);
1273 				}
1274 				FREE_SLIST(ilm->ilm_pendsrcs);
1275 				ilm->ilm_pendsrcs = NULL;
1276 				if (!SLIST_IS_EMPTY(rsp))
1277 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1278 					    &ilm->ilm_v6addr, rsp, rp);
1279 				FREE_SLIST(rsp);
1280 			} else {
1281 				/*
1282 				 * Either the pending request is just group-
1283 				 * specific, or we couldn't get the resources
1284 				 * (rsp) to build a source-specific reply.
1285 				 */
1286 				rp = mcast_bldmrec(ilm->ilm_fmode,
1287 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1288 			}
1289 			igmpv3_sendrpt(ill, rp);
1290 			rp = NULL;
1291 		}
1292 
1293 per_ilm_rtxtimer:
1294 		rtxp = &ilm->ilm_rtx;
1295 
1296 		if (rtxp->rtx_timer == INFINITY)
1297 			continue;
1298 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1299 			if ((rtxp->rtx_timer - current) < next)
1300 				next = rtxp->rtx_timer - current;
1301 			continue;
1302 		}
1303 
1304 		rtxp->rtx_timer = INFINITY;
1305 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1306 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1307 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1308 			continue;
1309 		}
1310 		if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1311 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1312 			continue;
1313 		}
1314 
1315 		/*
1316 		 * The retransmit timer has popped, and our router is
1317 		 * IGMPv3.  We have to delve into the retransmit state
1318 		 * stored in the ilm.
1319 		 *
1320 		 * Decrement the retransmit count.  If the fmode rtx
1321 		 * count is active, decrement it, and send a filter
1322 		 * mode change report with the ilm's source list.
1323 		 * Otherwise, send a source list change report with
1324 		 * the current retransmit lists.
1325 		 */
1326 		ASSERT(rtxp->rtx_cnt > 0);
1327 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1328 		rtxp->rtx_cnt--;
1329 		if (rtxp->rtx_fmode_cnt > 0) {
1330 			rtxp->rtx_fmode_cnt--;
1331 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1332 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1333 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1334 			    ilm->ilm_filter, rtxrp);
1335 		} else {
1336 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1337 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1338 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1339 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1340 		}
1341 		if (rtxp->rtx_cnt > 0) {
1342 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1343 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1344 			if (rtxp->rtx_timer < next)
1345 				next = rtxp->rtx_timer;
1346 			rtxp->rtx_timer += current;
1347 		} else {
1348 			ASSERT(rtxp->rtx_timer == INFINITY);
1349 			CLEAR_SLIST(rtxp->rtx_allow);
1350 			CLEAR_SLIST(rtxp->rtx_block);
1351 		}
1352 		igmpv3_sendrpt(ill, rtxrp);
1353 		rtxrp = NULL;
1354 	}
1355 
1356 	rw_exit(&ill->ill_mcast_lock);
1357 	/* Send any deferred/queued IP packets */
1358 	ill_mcast_send_queued(ill);
1359 	/* Defer ill_mcast_timer_start() until the caller is done */
1360 
1361 	return (next);
1362 }
1363 
1364 /*
1365  * igmp_timeout_handler:
1366  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1367  * Returns number of ticks to next event (or 0 if none).
1368  *
1369  * As part of multicast join and leave igmp we may need to send out an
1370  * igmp request. The igmp related state variables in the ilm are protected
1371  * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
1372  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1373  * starts the igmp timer if needed. It serializes multiple threads trying to
1374  * simultaneously start the timer using the igmp_timer_setter_active flag.
1375  *
1376  * igmp_input() receives igmp queries and responds to the queries
1377  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1378  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1379  * performs the action exclusively after acquiring ill_mcast_lock.
1380  *
1381  * The igmp_slowtimeo() function is called thru another timer.
1382  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1383  */
1384 void
1385 igmp_timeout_handler(void *arg)
1386 {
1387 	ill_t	*ill;
1388 	uint_t  global_next = INFINITY;
1389 	uint_t  next;
1390 	ill_walk_context_t ctx;
1391 	ip_stack_t *ipst = arg;
1392 
1393 	ASSERT(arg != NULL);
1394 	mutex_enter(&ipst->ips_igmp_timer_lock);
1395 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1396 	ipst->ips_igmp_timeout_id = 0;
1397 	ipst->ips_igmp_timer_scheduled_last = 0;
1398 	ipst->ips_igmp_time_to_next = 0;
1399 	mutex_exit(&ipst->ips_igmp_timer_lock);
1400 
1401 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1402 	ill = ILL_START_WALK_V4(&ctx, ipst);
1403 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1404 		ASSERT(!ill->ill_isv6);
1405 		/* Make sure the ill isn't going away. */
1406 		if (!ill_check_and_refhold(ill))
1407 			continue;
1408 		rw_exit(&ipst->ips_ill_g_lock);
1409 		next = igmp_timeout_handler_per_ill(ill);
1410 		if (next < global_next)
1411 			global_next = next;
1412 		ill_refrele(ill);
1413 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1414 	}
1415 	rw_exit(&ipst->ips_ill_g_lock);
1416 	if (global_next != INFINITY)
1417 		igmp_start_timers(global_next, ipst);
1418 }
1419 
1420 /*
1421  * mld_timeout_handler:
1422  * Called when there are timeout events, every next (tick).
1423  * Returns number of ticks to next event (or 0 if none).
1424  */
1425 uint_t
1426 mld_timeout_handler_per_ill(ill_t *ill)
1427 {
1428 	ilm_t 	*ilm;
1429 	uint_t	next = INFINITY, current;
1430 	mrec_t	*rp, *rtxrp;
1431 	rtx_state_t *rtxp;
1432 	mcast_record_t	rtype;
1433 
1434 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1435 
1436 	current = CURRENT_MSTIME;
1437 	/*
1438 	 * First check the global timer on this interface; the global timer
1439 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1440 	 */
1441 	if (ill->ill_global_timer == INFINITY)
1442 		goto per_ilm_timer;
1443 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1444 		ill->ill_global_timer = INFINITY;
1445 		/*
1446 		 * Send report for each group on this interface.
1447 		 * Since we just set the global timer (received a v2 general
1448 		 * query), need to skip the all hosts addr (ff02::1), per
1449 		 * RFC 3810 section 6.
1450 		 */
1451 		rp = NULL;
1452 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1453 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1454 			    &ipv6_all_hosts_mcast))
1455 				continue;
1456 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1457 			    ilm->ilm_filter, rp);
1458 			/*
1459 			 * Since we're sending a report on this group, okay
1460 			 * to delete pending group-specific timers.  Note
1461 			 * that group-specific retransmit timers still need
1462 			 * to be checked in the per_ilm_timer for-loop.
1463 			 */
1464 			ilm->ilm_timer = INFINITY;
1465 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1466 			FREE_SLIST(ilm->ilm_pendsrcs);
1467 			ilm->ilm_pendsrcs = NULL;
1468 		}
1469 		mldv2_sendrpt(ill, rp);
1470 	} else {
1471 		if ((ill->ill_global_timer - current) < next)
1472 			next = ill->ill_global_timer - current;
1473 	}
1474 
1475 per_ilm_timer:
1476 	rp = rtxrp = NULL;
1477 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1478 		if (ilm->ilm_timer == INFINITY)
1479 			goto per_ilm_rtxtimer;
1480 
1481 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1482 			if ((ilm->ilm_timer - current) < next)
1483 				next = ilm->ilm_timer - current;
1484 
1485 			if (ip_debug > 1) {
1486 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1487 				    "igmp_timo_hlr 2: ilm_timr"
1488 				    " %d typ %d nxt %d",
1489 				    (int)ntohl(ilm->ilm_timer - current),
1490 				    (ill->ill_mcast_type), next);
1491 			}
1492 
1493 			goto per_ilm_rtxtimer;
1494 		}
1495 
1496 		/* the timer has expired, need to take action */
1497 		ilm->ilm_timer = INFINITY;
1498 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1499 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1500 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1501 		} else {
1502 			slist_t *rsp;
1503 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1504 			    (rsp = l_alloc()) != NULL) {
1505 				/*
1506 				 * Contents of reply depend on pending
1507 				 * requested source list.
1508 				 */
1509 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1510 					l_intersection(ilm->ilm_filter,
1511 					    ilm->ilm_pendsrcs, rsp);
1512 				} else {
1513 					l_difference(ilm->ilm_pendsrcs,
1514 					    ilm->ilm_filter, rsp);
1515 				}
1516 				FREE_SLIST(ilm->ilm_pendsrcs);
1517 				ilm->ilm_pendsrcs = NULL;
1518 				if (!SLIST_IS_EMPTY(rsp))
1519 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1520 					    &ilm->ilm_v6addr, rsp, rp);
1521 				FREE_SLIST(rsp);
1522 			} else {
1523 				rp = mcast_bldmrec(ilm->ilm_fmode,
1524 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1525 			}
1526 		}
1527 
1528 per_ilm_rtxtimer:
1529 		rtxp = &ilm->ilm_rtx;
1530 
1531 		if (rtxp->rtx_timer == INFINITY)
1532 			continue;
1533 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1534 			if ((rtxp->rtx_timer - current) < next)
1535 				next = rtxp->rtx_timer - current;
1536 			continue;
1537 		}
1538 
1539 		rtxp->rtx_timer = INFINITY;
1540 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1541 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1542 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1543 			continue;
1544 		}
1545 
1546 		/*
1547 		 * The retransmit timer has popped, and our router is
1548 		 * MLDv2.  We have to delve into the retransmit state
1549 		 * stored in the ilm.
1550 		 *
1551 		 * Decrement the retransmit count.  If the fmode rtx
1552 		 * count is active, decrement it, and send a filter
1553 		 * mode change report with the ilm's source list.
1554 		 * Otherwise, send a source list change report with
1555 		 * the current retransmit lists.
1556 		 */
1557 		ASSERT(rtxp->rtx_cnt > 0);
1558 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1559 		rtxp->rtx_cnt--;
1560 		if (rtxp->rtx_fmode_cnt > 0) {
1561 			rtxp->rtx_fmode_cnt--;
1562 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1563 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1564 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1565 			    ilm->ilm_filter, rtxrp);
1566 		} else {
1567 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1568 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1569 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1570 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1571 		}
1572 		if (rtxp->rtx_cnt > 0) {
1573 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1574 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1575 			if (rtxp->rtx_timer < next)
1576 				next = rtxp->rtx_timer;
1577 			rtxp->rtx_timer += current;
1578 		} else {
1579 			ASSERT(rtxp->rtx_timer == INFINITY);
1580 			CLEAR_SLIST(rtxp->rtx_allow);
1581 			CLEAR_SLIST(rtxp->rtx_block);
1582 		}
1583 	}
1584 
1585 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1586 		mldv2_sendrpt(ill, rp);
1587 		mldv2_sendrpt(ill, rtxrp);
1588 	}
1589 	rw_exit(&ill->ill_mcast_lock);
1590 	/* Send any deferred/queued IP packets */
1591 	ill_mcast_send_queued(ill);
1592 	/* Defer ill_mcast_timer_start() until the caller is done */
1593 
1594 	return (next);
1595 }
1596 
1597 /*
1598  * mld_timeout_handler:
1599  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1600  * Returns number of ticks to next event (or 0 if none).
1601  * MT issues are same as igmp_timeout_handler
1602  */
1603 void
1604 mld_timeout_handler(void *arg)
1605 {
1606 	ill_t	*ill;
1607 	uint_t  global_next = INFINITY;
1608 	uint_t  next;
1609 	ill_walk_context_t ctx;
1610 	ip_stack_t *ipst = arg;
1611 
1612 	ASSERT(arg != NULL);
1613 	mutex_enter(&ipst->ips_mld_timer_lock);
1614 	ASSERT(ipst->ips_mld_timeout_id != 0);
1615 	ipst->ips_mld_timeout_id = 0;
1616 	ipst->ips_mld_timer_scheduled_last = 0;
1617 	ipst->ips_mld_time_to_next = 0;
1618 	mutex_exit(&ipst->ips_mld_timer_lock);
1619 
1620 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1621 	ill = ILL_START_WALK_V6(&ctx, ipst);
1622 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1623 		ASSERT(ill->ill_isv6);
1624 		/* Make sure the ill isn't going away. */
1625 		if (!ill_check_and_refhold(ill))
1626 			continue;
1627 		rw_exit(&ipst->ips_ill_g_lock);
1628 		next = mld_timeout_handler_per_ill(ill);
1629 		if (next < global_next)
1630 			global_next = next;
1631 		ill_refrele(ill);
1632 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1633 	}
1634 	rw_exit(&ipst->ips_ill_g_lock);
1635 	if (global_next != INFINITY)
1636 		mld_start_timers(global_next, ipst);
1637 }
1638 
/*
 * Older Version Querier Present timeout for the given ill, expressed as a
 * count of slowtimo intervals:
 *
 *	1000 * (ill_mcast_rv * ill_mcast_qi + MCAST_QUERY_RESP_INTERVAL)
 *	    / MCAST_SLOWTIMO_INTERVAL
 */
#define	OVQP(ill) \
	((1000 * (MCAST_QUERY_RESP_INTERVAL + \
	((ill)->ill_mcast_rv * (ill)->ill_mcast_qi))) / MCAST_SLOWTIMO_INTERVAL)
1646 
1647 /*
1648  * igmp_slowtimo:
1649  * - Resets to new router if we didnt we hear from the router
1650  *   in IGMP_AGE_THRESHOLD seconds.
1651  * - Resets slowtimeout.
1652  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1653  * IGMP version than configured.
1654  */
1655 void
1656 igmp_slowtimo(void *arg)
1657 {
1658 	ill_t	*ill;
1659 	ill_if_t *ifp;
1660 	avl_tree_t *avl_tree;
1661 	ip_stack_t *ipst = (ip_stack_t *)arg;
1662 
1663 	ASSERT(arg != NULL);
1664 
1665 	/*
1666 	 * The ill_if_t list is circular, hence the odd loop parameters.
1667 	 *
1668 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1669 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1670 	 * structure (allowing us to skip if none of the instances have timers
1671 	 * running).
1672 	 */
1673 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1674 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1675 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1676 	    ifp = ifp->illif_next) {
1677 		/*
1678 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1679 		 * a V1 or V2 query now and we miss seeing the count now,
1680 		 * we will see it the next time igmp_slowtimo is called.
1681 		 */
1682 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1683 			continue;
1684 
1685 		avl_tree = &ifp->illif_avl_by_ppa;
1686 		for (ill = avl_first(avl_tree); ill != NULL;
1687 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1688 			/* Make sure the ill isn't going away. */
1689 			if (!ill_check_and_refhold(ill))
1690 				continue;
1691 			rw_exit(&ipst->ips_ill_g_lock);
1692 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1693 			if (ill->ill_mcast_v1_tset == 1)
1694 				ill->ill_mcast_v1_time++;
1695 			if (ill->ill_mcast_v2_tset == 1)
1696 				ill->ill_mcast_v2_time++;
1697 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1698 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1699 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1700 				if ((ill->ill_mcast_v2_tset > 0) ||
1701 				    (ipst->ips_igmp_max_version ==
1702 				    IGMP_V2_ROUTER)) {
1703 					ip1dbg(("V1 query timer "
1704 					    "expired on %s; switching "
1705 					    "mode to IGMP_V2\n",
1706 					    ill->ill_name));
1707 					ill->ill_mcast_type =
1708 					    IGMP_V2_ROUTER;
1709 				} else {
1710 					ip1dbg(("V1 query timer "
1711 					    "expired on %s; switching "
1712 					    "mode to IGMP_V3\n",
1713 					    ill->ill_name));
1714 					ill->ill_mcast_type =
1715 					    IGMP_V3_ROUTER;
1716 				}
1717 				ill->ill_mcast_v1_time = 0;
1718 				ill->ill_mcast_v1_tset = 0;
1719 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1720 			}
1721 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1722 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1723 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1724 				ip1dbg(("V2 query timer expired on "
1725 				    "%s; switching mode to IGMP_V3\n",
1726 				    ill->ill_name));
1727 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1728 				ill->ill_mcast_v2_time = 0;
1729 				ill->ill_mcast_v2_tset = 0;
1730 				atomic_add_16(&ifp->illif_mcast_v2, -1);
1731 			}
1732 			rw_exit(&ill->ill_mcast_lock);
1733 			ill_refrele(ill);
1734 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1735 		}
1736 	}
1737 	rw_exit(&ipst->ips_ill_g_lock);
1738 	ill_mcast_timer_start(ipst);
1739 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1740 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1741 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1742 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1743 }
1744 
1745 /*
1746  * mld_slowtimo:
1747  * - Resets to newer version if we didn't hear from the older version router
1748  *   in MLD_AGE_THRESHOLD seconds.
1749  * - Restarts slowtimeout.
1750  * Check for ips_mld_max_version ensures that we don't revert to a higher
1751  * IGMP version than configured.
1752  */
1753 void
1754 mld_slowtimo(void *arg)
1755 {
1756 	ill_t *ill;
1757 	ill_if_t *ifp;
1758 	avl_tree_t *avl_tree;
1759 	ip_stack_t *ipst = (ip_stack_t *)arg;
1760 
1761 	ASSERT(arg != NULL);
1762 	/* See comments in igmp_slowtimo() above... */
1763 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1764 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1765 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1766 	    ifp = ifp->illif_next) {
1767 		if (ifp->illif_mcast_v1 == 0)
1768 			continue;
1769 
1770 		avl_tree = &ifp->illif_avl_by_ppa;
1771 		for (ill = avl_first(avl_tree); ill != NULL;
1772 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1773 			/* Make sure the ill isn't going away. */
1774 			if (!ill_check_and_refhold(ill))
1775 				continue;
1776 			rw_exit(&ipst->ips_ill_g_lock);
1777 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1778 			if (ill->ill_mcast_v1_tset == 1)
1779 				ill->ill_mcast_v1_time++;
1780 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1781 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1782 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1783 				ip1dbg(("MLD query timer expired on"
1784 				    " %s; switching mode to MLD_V2\n",
1785 				    ill->ill_name));
1786 				ill->ill_mcast_type = MLD_V2_ROUTER;
1787 				ill->ill_mcast_v1_time = 0;
1788 				ill->ill_mcast_v1_tset = 0;
1789 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1790 			}
1791 			rw_exit(&ill->ill_mcast_lock);
1792 			ill_refrele(ill);
1793 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1794 		}
1795 	}
1796 	rw_exit(&ipst->ips_ill_g_lock);
1797 	ill_mcast_timer_start(ipst);
1798 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1799 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1800 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1801 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1802 }
1803 
1804 /*
1805  * igmp_sendpkt:
1806  * This will send to ip_output_simple just like icmp_inbound.
1807  */
1808 static void
1809 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1810 {
1811 	mblk_t	*mp;
1812 	igmpa_t	*igmpa;
1813 	uint8_t *rtralert;
1814 	ipha_t	*ipha;
1815 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1816 	size_t	size  = hdrlen + sizeof (igmpa_t);
1817 	ill_t 	*ill  = ilm->ilm_ill;
1818 	ip_stack_t *ipst = ill->ill_ipst;
1819 
1820 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1821 
1822 	mp = allocb(size, BPRI_HI);
1823 	if (mp == NULL) {
1824 		return;
1825 	}
1826 	mp->b_wptr = mp->b_rptr + size;
1827 
1828 	ipha = (ipha_t *)mp->b_rptr;
1829 	rtralert = (uint8_t *)&(ipha[1]);
1830 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1831 	igmpa->igmpa_type   = type;
1832 	igmpa->igmpa_code   = 0;
1833 	igmpa->igmpa_group  = ilm->ilm_addr;
1834 	igmpa->igmpa_cksum  = 0;
1835 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1836 
1837 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1838 	rtralert[1] = RTRALERT_LEN;
1839 	rtralert[2] = 0;
1840 	rtralert[3] = 0;
1841 
1842 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1843 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1844 	ipha->ipha_type_of_service 	= 0;
1845 	ipha->ipha_length = htons(size);
1846 	ipha->ipha_ident = 0;
1847 	ipha->ipha_fragment_offset_and_flags = 0;
1848 	ipha->ipha_ttl 		= IGMP_TTL;
1849 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1850 	ipha->ipha_hdr_checksum 	= 0;
1851 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1852 	ipha->ipha_src 		= INADDR_ANY;
1853 
1854 	ill_mcast_queue(ill, mp);
1855 
1856 	++ipst->ips_igmpstat.igps_snd_reports;
1857 }
1858 
1859 /*
1860  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
1861  * The report will contain one group record
1862  * for each element of reclist.  If this causes packet length to
1863  * exceed ill->ill_mc_mtu, multiple reports are sent.
1864  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1865  * and those buffers are freed here.
1866  */
1867 static void
1868 igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
1869 {
1870 	igmp3ra_t *igmp3ra;
1871 	grphdra_t *grphdr;
1872 	mblk_t *mp;
1873 	ipha_t *ipha;
1874 	uint8_t *rtralert;
1875 	ipaddr_t *src_array;
1876 	int i, j, numrec, more_src_cnt;
1877 	size_t hdrsize, size, rsize;
1878 	mrec_t *rp, *cur_reclist;
1879 	mrec_t *next_reclist = reclist;
1880 	boolean_t morepkts;
1881 	ip_stack_t	 *ipst = ill->ill_ipst;
1882 
1883 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1884 
1885 	/* if there aren't any records, there's nothing to send */
1886 	if (reclist == NULL)
1887 		return;
1888 
1889 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1890 nextpkt:
1891 	size = hdrsize + sizeof (igmp3ra_t);
1892 	morepkts = B_FALSE;
1893 	more_src_cnt = 0;
1894 	cur_reclist = next_reclist;
1895 	numrec = 0;
1896 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1897 		rsize = sizeof (grphdra_t) +
1898 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
1899 		if (size + rsize > ill->ill_mc_mtu) {
1900 			if (rp == cur_reclist) {
1901 				/*
1902 				 * If the first mrec we looked at is too big
1903 				 * to fit in a single packet (i.e the source
1904 				 * list is too big), we must either truncate
1905 				 * the list (if TO_EX or IS_EX), or send
1906 				 * multiple reports for the same group (all
1907 				 * other types).
1908 				 */
1909 				int srcspace, srcsperpkt;
1910 				srcspace = ill->ill_mc_mtu - (size +
1911 				    sizeof (grphdra_t));
1912 
1913 				/*
1914 				 * Skip if there's not even enough room in
1915 				 * a single packet to send something useful.
1916 				 */
1917 				if (srcspace <= sizeof (ipaddr_t))
1918 					continue;
1919 
1920 				srcsperpkt = srcspace / sizeof (ipaddr_t);
1921 				/*
1922 				 * Increment size and numrec, because we will
1923 				 * be sending a record for the mrec we're
1924 				 * looking at now.
1925 				 */
1926 				size += sizeof (grphdra_t) +
1927 				    (srcsperpkt * sizeof (ipaddr_t));
1928 				numrec++;
1929 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
1930 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
1931 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1932 					if (rp->mrec_next == NULL) {
1933 						/* no more packets to send */
1934 						break;
1935 					} else {
1936 						/*
1937 						 * more packets, but we're
1938 						 * done with this mrec.
1939 						 */
1940 						next_reclist = rp->mrec_next;
1941 					}
1942 				} else {
1943 					more_src_cnt = rp->mrec_srcs.sl_numsrc
1944 					    - srcsperpkt;
1945 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1946 					/*
1947 					 * We'll fix up this mrec (remove the
1948 					 * srcs we've already sent) before
1949 					 * returning to nextpkt above.
1950 					 */
1951 					next_reclist = rp;
1952 				}
1953 			} else {
1954 				next_reclist = rp;
1955 			}
1956 			morepkts = B_TRUE;
1957 			break;
1958 		}
1959 		size += rsize;
1960 		numrec++;
1961 	}
1962 
1963 	mp = allocb(size, BPRI_HI);
1964 	if (mp == NULL) {
1965 		goto free_reclist;
1966 	}
1967 	bzero((char *)mp->b_rptr, size);
1968 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
1969 
1970 	ipha = (ipha_t *)mp->b_rptr;
1971 	rtralert = (uint8_t *)&(ipha[1]);
1972 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
1973 	grphdr = (grphdra_t *)&(igmp3ra[1]);
1974 
1975 	rp = cur_reclist;
1976 	for (i = 0; i < numrec; i++) {
1977 		grphdr->grphdra_type = rp->mrec_type;
1978 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
1979 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
1980 		src_array = (ipaddr_t *)&(grphdr[1]);
1981 
1982 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
1983 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
1984 
1985 		grphdr = (grphdra_t *)&(src_array[j]);
1986 		rp = rp->mrec_next;
1987 	}
1988 
1989 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
1990 	igmp3ra->igmp3ra_numrec = htons(numrec);
1991 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
1992 
1993 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1994 	rtralert[1] = RTRALERT_LEN;
1995 	rtralert[2] = 0;
1996 	rtralert[3] = 0;
1997 
1998 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
1999 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2000 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2001 	ipha->ipha_length = htons(size);
2002 	ipha->ipha_ttl = IGMP_TTL;
2003 	ipha->ipha_protocol = IPPROTO_IGMP;
2004 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2005 	ipha->ipha_src = INADDR_ANY;
2006 
2007 	ill_mcast_queue(ill, mp);
2008 
2009 	++ipst->ips_igmpstat.igps_snd_reports;
2010 
2011 	if (morepkts) {
2012 		if (more_src_cnt > 0) {
2013 			int index, mvsize;
2014 			slist_t *sl = &next_reclist->mrec_srcs;
2015 			index = sl->sl_numsrc;
2016 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2017 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2018 			    mvsize);
2019 			sl->sl_numsrc = more_src_cnt;
2020 		}
2021 		goto nextpkt;
2022 	}
2023 
2024 free_reclist:
2025 	while (reclist != NULL) {
2026 		rp = reclist->mrec_next;
2027 		mi_free(reclist);
2028 		reclist = rp;
2029 	}
2030 }
2031 
2032 /*
2033  * mld_input:
2034  * Return NULL for a bad packet that is discarded here.
2035  * Return mp if the message is OK and should be handed to "raw" receivers.
2036  * Callers of mld_input() may need to reinitialize variables that were copied
2037  * from the mblk as this calls pullupmsg().
2038  */
2039 mblk_t *
2040 mld_input(mblk_t *mp, ip_recv_attr_t *ira)
2041 {
2042 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2043 	mld_hdr_t	*mldh;
2044 	ilm_t		*ilm;
2045 	ipif_t		*ipif;
2046 	uint16_t	hdr_length, exthdr_length;
2047 	in6_addr_t	*v6group_ptr;
2048 	uint_t		next;
2049 	int		mldlen;
2050 	ill_t		*ill = ira->ira_ill;
2051 	ip_stack_t	*ipst = ill->ill_ipst;
2052 
2053 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2054 
2055 	/* Make sure the src address of the packet is link-local */
2056 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2057 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2058 		freemsg(mp);
2059 		return (NULL);
2060 	}
2061 
2062 	if (ip6h->ip6_hlim != 1) {
2063 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2064 		freemsg(mp);
2065 		return (NULL);
2066 	}
2067 
2068 	/* Get to the icmp header part */
2069 	hdr_length = ira->ira_ip_hdr_length;
2070 	exthdr_length = hdr_length - IPV6_HDR_LEN;
2071 
2072 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2073 
2074 	/* An MLD packet must at least be 24 octets to be valid */
2075 	if (mldlen < MLD_MINLEN) {
2076 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2077 		freemsg(mp);
2078 		return (NULL);
2079 	}
2080 
2081 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2082 
2083 	switch (mldh->mld_type) {
2084 	case MLD_LISTENER_QUERY:
2085 		/*
2086 		 * packet length differentiates between v1 and v2.  v1
2087 		 * query should be exactly 24 octets long; v2 is >= 28.
2088 		 */
2089 		if ((mldlen == MLD_MINLEN) ||
2090 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2091 			next = mld_query_in(mldh, ill);
2092 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2093 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2094 		} else {
2095 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2096 			freemsg(mp);
2097 			return (NULL);
2098 		}
2099 		if (next == 0) {
2100 			return (mp);
2101 		}
2102 
2103 		if (next != INFINITY)
2104 			mld_start_timers(next, ipst);
2105 		break;
2106 
2107 	case MLD_LISTENER_REPORT:
2108 		/*
2109 		 * For fast leave to work, we have to know that we are the
2110 		 * last person to send a report for this group.  Reports
2111 		 * generated by us are looped back since we could potentially
2112 		 * be a multicast router, so discard reports sourced by me.
2113 		 */
2114 		mutex_enter(&ill->ill_lock);
2115 		for (ipif = ill->ill_ipif; ipif != NULL;
2116 		    ipif = ipif->ipif_next) {
2117 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2118 			    &ip6h->ip6_src)) {
2119 				if (ip_debug > 1) {
2120 					char    buf1[INET6_ADDRSTRLEN];
2121 
2122 					(void) mi_strlog(ill->ill_rq,
2123 					    1,
2124 					    SL_TRACE,
2125 					    "mld_input: we are only "
2126 					    "member src %s\n",
2127 					    inet_ntop(AF_INET6, &ip6h->ip6_src,
2128 					    buf1, sizeof (buf1)));
2129 				}
2130 				mutex_exit(&ill->ill_lock);
2131 				return (mp);
2132 			}
2133 		}
2134 		mutex_exit(&ill->ill_lock);
2135 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2136 
2137 		v6group_ptr = &mldh->mld_addr;
2138 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2139 			BUMP_MIB(ill->ill_icmp6_mib,
2140 			    ipv6IfIcmpInGroupMembBadReports);
2141 			freemsg(mp);
2142 			return (NULL);
2143 		}
2144 
2145 
2146 		/*
2147 		 * If we belong to the group being reported, and we are a
2148 		 * 'Delaying member' per the RFC terminology, stop our timer
2149 		 * for that group and 'clear flag' i.e. mark ilm_state as
2150 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2151 		 * membership entries for the same group address (one per zone)
2152 		 * so we need to walk the ill_ilm list.
2153 		 */
2154 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2155 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2156 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2157 				continue;
2158 			BUMP_MIB(ill->ill_icmp6_mib,
2159 			    ipv6IfIcmpInGroupMembOurReports);
2160 
2161 			ilm->ilm_timer = INFINITY;
2162 			ilm->ilm_state = IGMP_OTHERMEMBER;
2163 		}
2164 		rw_exit(&ill->ill_mcast_lock);
2165 		/*
2166 		 * No packets have been sent above - no
2167 		 * ill_mcast_send_queued is needed.
2168 		 */
2169 		ill_mcast_timer_start(ill->ill_ipst);
2170 		break;
2171 
2172 	case MLD_LISTENER_REDUCTION:
2173 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2174 		break;
2175 	}
2176 	return (mp);
2177 }
2178 
2179 /*
2180  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2181  * (non-zero, unsigned) timer value to be set on success.
2182  */
2183 static uint_t
2184 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2185 {
2186 	ilm_t	*ilm;
2187 	int	timer;
2188 	uint_t	next, current;
2189 	in6_addr_t *v6group;
2190 
2191 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2192 
2193 	/*
2194 	 * In the MLD specification, there are 3 states and a flag.
2195 	 *
2196 	 * In Non-Listener state, we simply don't have a membership record.
2197 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2198 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2199 	 * INFINITY)
2200 	 *
2201 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2202 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2203 	 * if I sent the last report.
2204 	 */
2205 	v6group = &mldh->mld_addr;
2206 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2207 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2208 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2209 		return (0);
2210 	}
2211 
2212 	/* Need to do compatibility mode checking */
2213 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2214 	ill->ill_mcast_v1_time = 0;
2215 	ill->ill_mcast_v1_tset = 1;
2216 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2217 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2218 		    "MLD_V1_ROUTER\n", ill->ill_name));
2219 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2220 		ill->ill_mcast_type = MLD_V1_ROUTER;
2221 	}
2222 
2223 	timer = (int)ntohs(mldh->mld_maxdelay);
2224 	if (ip_debug > 1) {
2225 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2226 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2227 		    timer, (int)mldh->mld_type);
2228 	}
2229 
2230 	/*
2231 	 * -Start the timers in all of our membership records for
2232 	 * the physical interface on which the query arrived,
2233 	 * excl:
2234 	 *	1.  those that belong to the "all hosts" group,
2235 	 *	2.  those with 0 scope, or 1 node-local scope.
2236 	 *
2237 	 * -Restart any timer that is already running but has a value
2238 	 * longer that the requested timeout.
2239 	 * -Use the value specified in the query message as the
2240 	 * maximum timeout.
2241 	 */
2242 	next = INFINITY;
2243 
2244 	current = CURRENT_MSTIME;
2245 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2246 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2247 
2248 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2249 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2250 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2251 			continue;
2252 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2253 		    &ipv6_all_hosts_mcast)) &&
2254 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2255 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2256 			if (timer == 0) {
2257 				/* Respond immediately */
2258 				ilm->ilm_timer = INFINITY;
2259 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2260 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2261 				break;
2262 			}
2263 			if (ilm->ilm_timer > timer) {
2264 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2265 				if (ilm->ilm_timer < next)
2266 					next = ilm->ilm_timer;
2267 				ilm->ilm_timer += current;
2268 			}
2269 			break;
2270 		}
2271 	}
2272 	rw_exit(&ill->ill_mcast_lock);
2273 	/* Send any deferred/queued IP packets */
2274 	ill_mcast_send_queued(ill);
2275 	ill_mcast_timer_start(ill->ill_ipst);
2276 
2277 	return (next);
2278 }
2279 
2280 /*
2281  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2282  * returns the appropriate (non-zero, unsigned) timer value (which may
2283  * be INFINITY) to be set.
2284  */
2285 static uint_t
2286 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2287 {
2288 	ilm_t	*ilm;
2289 	in6_addr_t *v6group, *src_array;
2290 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2291 	uint8_t	qrv;
2292 
2293 	v6group = &mld2q->mld2q_addr;
2294 	numsrc = ntohs(mld2q->mld2q_numsrc);
2295 
2296 	/* make sure numsrc matches packet size */
2297 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2298 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2299 		return (0);
2300 	}
2301 	src_array = (in6_addr_t *)&mld2q[1];
2302 
2303 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2304 
2305 	/* extract Maximum Response Delay from code in header */
2306 	mrd = ntohs(mld2q->mld2q_mxrc);
2307 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2308 		uint_t hdrval, mant, exp;
2309 		hdrval = mrd;
2310 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2311 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2312 		mrd = (mant | 0x1000) << (exp + 3);
2313 	}
2314 	if (mrd == 0)
2315 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2316 
2317 	MCAST_RANDOM_DELAY(delay, mrd);
2318 	next = (unsigned)INFINITY;
2319 	current = CURRENT_MSTIME;
2320 
2321 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2322 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2323 	else
2324 		ill->ill_mcast_rv = qrv;
2325 
2326 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2327 		uint_t mant, exp;
2328 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2329 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2330 		qqi = (mant | 0x10) << (exp + 3);
2331 	}
2332 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2333 
2334 	/*
2335 	 * If we have a pending general query response that's scheduled
2336 	 * sooner than the delay we calculated for this response, then
2337 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2338 	 */
2339 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2340 	if (ill->ill_global_timer < (current + delay)) {
2341 		rw_exit(&ill->ill_mcast_lock);
2342 		return (next);
2343 	}
2344 
2345 	/*
2346 	 * Now take action depending on query type: general,
2347 	 * group specific, or group/source specific.
2348 	 */
2349 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2350 		/*
2351 		 * general query
2352 		 * We know global timer is either not running or is
2353 		 * greater than our calculated delay, so reset it to
2354 		 * our delay (random value in range [0, response time])
2355 		 */
2356 		ill->ill_global_timer = current + delay;
2357 		next = delay;
2358 	} else {
2359 		/* group or group/source specific query */
2360 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2361 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2362 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2363 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2364 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2365 				continue;
2366 
2367 			/*
2368 			 * If the query is group specific or we have a
2369 			 * pending group specific query, the response is
2370 			 * group specific (pending sources list should be
2371 			 * empty).  Otherwise, need to update the pending
2372 			 * sources list for the group and source specific
2373 			 * response.
2374 			 */
2375 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2376 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2377 group_query:
2378 				FREE_SLIST(ilm->ilm_pendsrcs);
2379 				ilm->ilm_pendsrcs = NULL;
2380 			} else {
2381 				boolean_t overflow;
2382 				slist_t *pktl;
2383 				if (numsrc > MAX_FILTER_SIZE ||
2384 				    (ilm->ilm_pendsrcs == NULL &&
2385 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2386 					/*
2387 					 * We've been sent more sources than
2388 					 * we can deal with; or we can't deal
2389 					 * with a source list at all. Revert
2390 					 * to a group specific query.
2391 					 */
2392 					goto group_query;
2393 				}
2394 				if ((pktl = l_alloc()) == NULL)
2395 					goto group_query;
2396 				pktl->sl_numsrc = numsrc;
2397 				for (i = 0; i < numsrc; i++)
2398 					pktl->sl_addr[i] = src_array[i];
2399 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2400 				    &overflow);
2401 				l_free(pktl);
2402 				if (overflow)
2403 					goto group_query;
2404 			}
2405 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2406 			    INFINITY : (ilm->ilm_timer - current);
2407 			/* set timer to soonest value */
2408 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2409 			if (ilm->ilm_timer < next)
2410 				next = ilm->ilm_timer;
2411 			ilm->ilm_timer += current;
2412 			break;
2413 		}
2414 	}
2415 	rw_exit(&ill->ill_mcast_lock);
2416 	/*
2417 	 * No packets have been sent above - no
2418 	 * ill_mcast_send_queued is needed.
2419 	 */
2420 	ill_mcast_timer_start(ill->ill_ipst);
2421 
2422 	return (next);
2423 }
2424 
2425 /*
2426  * Send MLDv1 response packet with hoplimit 1
2427  */
2428 static void
2429 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2430 {
2431 	mblk_t		*mp;
2432 	mld_hdr_t	*mldh;
2433 	ip6_t 		*ip6h;
2434 	ip6_hbh_t	*ip6hbh;
2435 	struct ip6_opt_router	*ip6router;
2436 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2437 	ill_t		*ill = ilm->ilm_ill;
2438 
2439 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2440 
2441 	/*
2442 	 * We need to place a router alert option in this packet.  The length
2443 	 * of the options must be a multiple of 8.  The hbh option header is 2
2444 	 * bytes followed by the 4 byte router alert option.  That leaves
2445 	 * 2 bytes of pad for a total of 8 bytes.
2446 	 */
2447 	const int	router_alert_length = 8;
2448 
2449 	ASSERT(ill->ill_isv6);
2450 
2451 	size += router_alert_length;
2452 	mp = allocb(size, BPRI_HI);
2453 	if (mp == NULL)
2454 		return;
2455 	bzero(mp->b_rptr, size);
2456 	mp->b_wptr = mp->b_rptr + size;
2457 
2458 	ip6h = (ip6_t *)mp->b_rptr;
2459 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2460 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2461 	/*
2462 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2463 	 * above will pad between ip6router and mld.
2464 	 */
2465 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2466 
2467 	mldh->mld_type = type;
2468 	mldh->mld_addr = ilm->ilm_v6addr;
2469 
2470 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2471 	ip6router->ip6or_len = 2;
2472 	ip6router->ip6or_value[0] = 0;
2473 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2474 
2475 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2476 	ip6hbh->ip6h_len = 0;
2477 
2478 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2479 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2480 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2481 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2482 	if (v6addr == NULL)
2483 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2484 	else
2485 		ip6h->ip6_dst = *v6addr;
2486 
2487 	ip6h->ip6_src = ipv6_all_zeros;
2488 	/*
2489 	 * Prepare for checksum by putting icmp length in the icmp
2490 	 * checksum field. The checksum is calculated in ip_output.
2491 	 */
2492 	mldh->mld_cksum = htons(sizeof (*mldh));
2493 
2494 	ill_mcast_queue(ill, mp);
2495 }
2496 
2497 /*
2498  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2499  * report will contain one multicast address record for each element of
2500  * reclist.  If this causes packet length to exceed ill->ill_mc_mtu,
2501  * multiple reports are sent.  reclist is assumed to be made up of
2502  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2503  */
2504 static void
2505 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2506 {
2507 	mblk_t		*mp;
2508 	mld2r_t		*mld2r;
2509 	mld2mar_t	*mld2mar;
2510 	in6_addr_t	*srcarray;
2511 	ip6_t		*ip6h;
2512 	ip6_hbh_t	*ip6hbh;
2513 	struct ip6_opt_router	*ip6router;
2514 	size_t		size, optlen, padlen, icmpsize, rsize;
2515 	int		i, numrec, more_src_cnt;
2516 	mrec_t		*rp, *cur_reclist;
2517 	mrec_t		*next_reclist = reclist;
2518 	boolean_t	morepkts;
2519 
2520 	/* If there aren't any records, there's nothing to send */
2521 	if (reclist == NULL)
2522 		return;
2523 
2524 	ASSERT(ill->ill_isv6);
2525 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2526 
2527 	/*
2528 	 * Total option length (optlen + padlen) must be a multiple of
2529 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2530 	 * length will be 8.  Assert this in case anything ever changes.
2531 	 */
2532 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2533 	ASSERT(optlen <= 8);
2534 	padlen = 8 - optlen;
2535 nextpkt:
2536 	icmpsize = sizeof (mld2r_t);
2537 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2538 	morepkts = B_FALSE;
2539 	more_src_cnt = 0;
2540 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2541 	    rp = rp->mrec_next, numrec++) {
2542 		rsize = sizeof (mld2mar_t) +
2543 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2544 		if (size + rsize > ill->ill_mc_mtu) {
2545 			if (rp == cur_reclist) {
2546 				/*
2547 				 * If the first mrec we looked at is too big
2548 				 * to fit in a single packet (i.e the source
2549 				 * list is too big), we must either truncate
2550 				 * the list (if TO_EX or IS_EX), or send
2551 				 * multiple reports for the same group (all
2552 				 * other types).
2553 				 */
2554 				int srcspace, srcsperpkt;
2555 				srcspace = ill->ill_mc_mtu -
2556 				    (size + sizeof (mld2mar_t));
2557 
2558 				/*
2559 				 * Skip if there's not even enough room in
2560 				 * a single packet to send something useful.
2561 				 */
2562 				if (srcspace <= sizeof (in6_addr_t))
2563 					continue;
2564 
2565 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2566 				/*
2567 				 * Increment icmpsize and size, because we will
2568 				 * be sending a record for the mrec we're
2569 				 * looking at now.
2570 				 */
2571 				rsize = sizeof (mld2mar_t) +
2572 				    (srcsperpkt * sizeof (in6_addr_t));
2573 				icmpsize += rsize;
2574 				size += rsize;
2575 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2576 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2577 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2578 					if (rp->mrec_next == NULL) {
2579 						/* no more packets to send */
2580 						break;
2581 					} else {
2582 						/*
2583 						 * more packets, but we're
2584 						 * done with this mrec.
2585 						 */
2586 						next_reclist = rp->mrec_next;
2587 					}
2588 				} else {
2589 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2590 					    - srcsperpkt;
2591 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2592 					/*
2593 					 * We'll fix up this mrec (remove the
2594 					 * srcs we've already sent) before
2595 					 * returning to nextpkt above.
2596 					 */
2597 					next_reclist = rp;
2598 				}
2599 			} else {
2600 				next_reclist = rp;
2601 			}
2602 			morepkts = B_TRUE;
2603 			break;
2604 		}
2605 		icmpsize += rsize;
2606 		size += rsize;
2607 	}
2608 
2609 	mp = allocb(size, BPRI_HI);
2610 	if (mp == NULL)
2611 		goto free_reclist;
2612 	bzero(mp->b_rptr, size);
2613 	mp->b_wptr = mp->b_rptr + size;
2614 
2615 	ip6h = (ip6_t *)mp->b_rptr;
2616 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2617 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2618 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2619 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2620 
2621 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2622 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2623 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2624 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2625 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2626 	ip6h->ip6_src = ipv6_all_zeros;
2627 
2628 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2629 	/*
2630 	 * ip6h_len is the number of 8-byte words, not including the first
2631 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2632 	 */
2633 	ip6hbh->ip6h_len = 0;
2634 
2635 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2636 	ip6router->ip6or_len = 2;
2637 	ip6router->ip6or_value[0] = 0;
2638 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2639 
2640 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2641 	mld2r->mld2r_nummar = htons(numrec);
2642 	/*
2643 	 * Prepare for the checksum by putting icmp length in the icmp
2644 	 * checksum field. The checksum is calculated in ip_output_simple.
2645 	 */
2646 	mld2r->mld2r_cksum = htons(icmpsize);
2647 
2648 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2649 		mld2mar->mld2mar_type = rp->mrec_type;
2650 		mld2mar->mld2mar_auxlen = 0;
2651 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2652 		mld2mar->mld2mar_group = rp->mrec_group;
2653 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2654 
2655 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2656 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2657 
2658 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2659 	}
2660 
2661 	ill_mcast_queue(ill, mp);
2662 
2663 	if (morepkts) {
2664 		if (more_src_cnt > 0) {
2665 			int index, mvsize;
2666 			slist_t *sl = &next_reclist->mrec_srcs;
2667 			index = sl->sl_numsrc;
2668 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2669 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2670 			    mvsize);
2671 			sl->sl_numsrc = more_src_cnt;
2672 		}
2673 		goto nextpkt;
2674 	}
2675 
2676 free_reclist:
2677 	while (reclist != NULL) {
2678 		rp = reclist->mrec_next;
2679 		mi_free(reclist);
2680 		reclist = rp;
2681 	}
2682 }
2683 
2684 static mrec_t *
2685 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2686     mrec_t *next)
2687 {
2688 	mrec_t *rp;
2689 	int i;
2690 
2691 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2692 	    SLIST_IS_EMPTY(srclist))
2693 		return (next);
2694 
2695 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2696 	if (rp == NULL)
2697 		return (next);
2698 
2699 	rp->mrec_next = next;
2700 	rp->mrec_type = type;
2701 	rp->mrec_auxlen = 0;
2702 	rp->mrec_group = *grp;
2703 	if (srclist == NULL) {
2704 		rp->mrec_srcs.sl_numsrc = 0;
2705 	} else {
2706 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2707 		for (i = 0; i < srclist->sl_numsrc; i++)
2708 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2709 	}
2710 
2711 	return (rp);
2712 }
2713 
2714 /*
2715  * Set up initial retransmit state.  If memory cannot be allocated for
2716  * the source lists, simply create as much state as is possible; memory
2717  * allocation failures are considered one type of transient error that
2718  * the retransmissions are designed to overcome (and if they aren't
2719  * transient, there are bigger problems than failing to notify the
2720  * router about multicast group membership state changes).
2721  */
2722 static void
2723 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2724     slist_t *flist)
2725 {
2726 	/*
2727 	 * There are only three possibilities for rtype:
2728 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2729 	 *	  => rtype is ALLOW_NEW_SOURCES
2730 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2731 	 *	  => rtype is CHANGE_TO_EXCLUDE
2732 	 *	State change that involves a filter mode change
2733 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2734 	 */
2735 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2736 	    rtype == ALLOW_NEW_SOURCES);
2737 
2738 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2739 
2740 	switch (rtype) {
2741 	case CHANGE_TO_EXCLUDE:
2742 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2743 		CLEAR_SLIST(rtxp->rtx_allow);
2744 		COPY_SLIST(flist, rtxp->rtx_block);
2745 		break;
2746 	case ALLOW_NEW_SOURCES:
2747 	case CHANGE_TO_INCLUDE:
2748 		rtxp->rtx_fmode_cnt =
2749 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2750 		CLEAR_SLIST(rtxp->rtx_block);
2751 		COPY_SLIST(flist, rtxp->rtx_allow);
2752 		break;
2753 	}
2754 }
2755 
2756 /*
2757  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2758  * RFC 3376 section 5.1, covers three cases:
2759  *	* The current state change is a filter mode change
2760  *		Set filter mode retransmit counter; set retransmit allow or
2761  *		block list to new source list as appropriate, and clear the
2762  *		retransmit list that was not set; send TO_IN or TO_EX with
2763  *		new source list.
2764  *	* The current state change is a source list change, but the filter
2765  *	  mode retransmit counter is > 0
2766  *		Decrement filter mode retransmit counter; set retransmit
2767  *		allow or block list to  new source list as appropriate,
2768  *		and clear the retransmit list that was not set; send TO_IN
2769  *		or TO_EX with new source list.
2770  *	* The current state change is a source list change, and the filter
2771  *	  mode retransmit counter is 0.
2772  *		Merge existing rtx allow and block lists with new state:
2773  *		  rtx_allow = (new allow + rtx_allow) - new block
2774  *		  rtx_block = (new block + rtx_block) - new allow
2775  *		Send ALLOW and BLOCK records for new retransmit lists;
2776  *		decrement retransmit counter.
2777  *
2778  * As is the case for mcast_init_rtx(), memory allocation failures are
2779  * acceptable; we just create as much state as we can.
2780  */
2781 static mrec_t *
2782 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2783 {
2784 	ill_t *ill;
2785 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2786 	mcast_record_t txtype;
2787 	mrec_t *rp, *rpnext, *rtnmrec;
2788 	boolean_t ovf;
2789 
2790 	ill = ilm->ilm_ill;
2791 
2792 	if (mreclist == NULL)
2793 		return (mreclist);
2794 
2795 	/*
2796 	 * A filter mode change is indicated by a single mrec, which is
2797 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2798 	 * retransmit state as if this were an initial join.  There is
2799 	 * no change to the mrec list.
2800 	 */
2801 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2802 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2803 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2804 		    &mreclist->mrec_srcs);
2805 		return (mreclist);
2806 	}
2807 
2808 	/*
2809 	 * Only the source list has changed
2810 	 */
2811 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2812 	if (rtxp->rtx_fmode_cnt > 0) {
2813 		/* but we're still sending filter mode change reports */
2814 		rtxp->rtx_fmode_cnt--;
2815 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
2816 			CLEAR_SLIST(rtxp->rtx_block);
2817 			COPY_SLIST(flist, rtxp->rtx_allow);
2818 			txtype = CHANGE_TO_INCLUDE;
2819 		} else {
2820 			CLEAR_SLIST(rtxp->rtx_allow);
2821 			COPY_SLIST(flist, rtxp->rtx_block);
2822 			txtype = CHANGE_TO_EXCLUDE;
2823 		}
2824 		/* overwrite first mrec with new info */
2825 		mreclist->mrec_type = txtype;
2826 		l_copy(flist, &mreclist->mrec_srcs);
2827 		/* then free any remaining mrecs */
2828 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
2829 			rpnext = rp->mrec_next;
2830 			mi_free(rp);
2831 		}
2832 		mreclist->mrec_next = NULL;
2833 		rtnmrec = mreclist;
2834 	} else {
2835 		mrec_t *allow_mrec, *block_mrec;
2836 		/*
2837 		 * Just send the source change reports; but we need to
2838 		 * recalculate the ALLOW and BLOCK lists based on previous
2839 		 * state and new changes.
2840 		 */
2841 		rtnmrec = mreclist;
2842 		allow_mrec = block_mrec = NULL;
2843 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
2844 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
2845 			    rp->mrec_type == BLOCK_OLD_SOURCES);
2846 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
2847 				allow_mrec = rp;
2848 			else
2849 				block_mrec = rp;
2850 		}
2851 		/*
2852 		 * Perform calculations:
2853 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
2854 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
2855 		 *
2856 		 * Each calc requires two steps, for example:
2857 		 *   rtx_allow = rtx_allow - mrec_block;
2858 		 *   new_allow = mrec_allow + rtx_allow;
2859 		 *
2860 		 * Store results in mrec lists, and then copy into rtx lists.
2861 		 * We do it in this order in case the rtx list hasn't been
2862 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
2863 		 * Overflows are also okay.
2864 		 */
2865 		if (block_mrec != NULL) {
2866 			l_difference_in_a(rtxp->rtx_allow,
2867 			    &block_mrec->mrec_srcs);
2868 		}
2869 		if (allow_mrec != NULL) {
2870 			l_difference_in_a(rtxp->rtx_block,
2871 			    &allow_mrec->mrec_srcs);
2872 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
2873 			    &ovf);
2874 		}
2875 		if (block_mrec != NULL) {
2876 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
2877 			    &ovf);
2878 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
2879 		} else {
2880 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
2881 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
2882 		}
2883 		if (allow_mrec != NULL) {
2884 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
2885 		} else {
2886 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
2887 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
2888 		}
2889 	}
2890 
2891 	return (rtnmrec);
2892 }
2893