xref: /titanic_52/usr/src/uts/common/inet/ip/igmp.c (revision 9b4e3ac25d882519cad3fc11f0c53b07f4e60536)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * Internet Group Management Protocol (IGMP) routines.
29  * Multicast Listener Discovery Protocol (MLD) routines.
30  *
31  * Written by Steve Deering, Stanford, May 1988.
32  * Modified by Rosen Sharma, Stanford, Aug 1994.
33  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
34  *
35  * MULTICAST 3.5.1.1
36  */
37 
38 #include <sys/types.h>
39 #include <sys/stream.h>
40 #include <sys/stropts.h>
41 #include <sys/strlog.h>
42 #include <sys/strsun.h>
43 #include <sys/systm.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/cmn_err.h>
47 #include <sys/atomic.h>
48 #include <sys/zone.h>
49 
50 #include <sys/param.h>
51 #include <sys/socket.h>
52 #include <inet/ipclassifier.h>
53 #include <net/if.h>
54 #include <net/route.h>
55 #include <netinet/in.h>
56 #include <netinet/igmp_var.h>
57 #include <netinet/ip6.h>
58 #include <netinet/icmp6.h>
59 
60 #include <inet/common.h>
61 #include <inet/mi.h>
62 #include <inet/nd.h>
63 #include <inet/ip.h>
64 #include <inet/ip6.h>
65 #include <inet/ip_multi.h>
66 #include <inet/ip_listutils.h>
67 
68 #include <netinet/igmp.h>
69 #include <inet/ip_if.h>
70 #include <net/pfkeyv2.h>
71 #include <inet/ipsec_info.h>
72 
73 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
74 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
75 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
76 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
77 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
78 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
79 static void	igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist);
80 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
81 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
82 		    slist_t *srclist, mrec_t *next);
83 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
84 		    mcast_record_t rtype, slist_t *flist);
85 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
86 
87 
/*
 * Timer length conversion helpers.  Every timer value that is stored or
 * handed to the timer functions is expressed in milliseconds, whereas
 * protocol defaults and on-the-wire values may use coarser units.
 *
 * "dsec" stands for decisecond, i.e. one tenth of a second; it is simply
 * shorter to write than "tenths of a second".
 */
#define	DSEC_TO_MSEC(dsec)	(100 * (dsec))
#define	SEC_TO_MSEC(sec)	(1000 * (sec))
98 
99 /*
100  * A running timer (scheduled thru timeout) can be cancelled if another
101  * timer with a shorter timeout value is scheduled before it has timed
102  * out.  When the shorter timer expires, the original timer is updated
103  * to account for the time elapsed while the shorter timer ran; but this
104  * does not take into account the amount of time already spent in timeout
105  * state before being preempted by the shorter timer, that is the time
106  * interval between time scheduled to time cancelled.  This can cause
107  * delays in sending out multicast membership reports.  To resolve this
108  * problem, wallclock time (absolute time) is used instead of deltas
109  * (relative time) to track timers.
110  *
111  * The MACRO below gets the lbolt value, used for proper timer scheduling
112  * and firing. Therefore multicast membership reports are sent on time.
 * The timer does not exactly fire at the time it was scheduled to fire,
114  * there is a difference of a few milliseconds observed. An offset is used
115  * to take care of the difference.
116  */
117 
118 #define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
119 #define	CURRENT_OFFSET	(999)
120 
121 /*
122  * The first multicast join will trigger the igmp timers / mld timers
123  * The unit for next is milliseconds.
124  */
125 void
126 igmp_start_timers(unsigned next, ip_stack_t *ipst)
127 {
128 	int	time_left;
129 	int	ret;
130 
131 	ASSERT(next != 0 && next != INFINITY);
132 
133 	mutex_enter(&ipst->ips_igmp_timer_lock);
134 
135 	if (ipst->ips_igmp_timer_setter_active) {
136 		/*
137 		 * Serialize timer setters, one at a time. If the
138 		 * timer is currently being set by someone,
139 		 * just record the next time when it has to be
140 		 * invoked and return. The current setter will
141 		 * take care.
142 		 */
143 		ipst->ips_igmp_time_to_next =
144 		    MIN(ipst->ips_igmp_time_to_next, next);
145 		mutex_exit(&ipst->ips_igmp_timer_lock);
146 		return;
147 	} else {
148 		ipst->ips_igmp_timer_setter_active = B_TRUE;
149 	}
150 	if (ipst->ips_igmp_timeout_id == 0) {
151 		/*
152 		 * The timer is inactive. We need to start a timer
153 		 */
154 		ipst->ips_igmp_time_to_next = next;
155 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
156 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
157 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
158 		ipst->ips_igmp_timer_setter_active = B_FALSE;
159 		mutex_exit(&ipst->ips_igmp_timer_lock);
160 		return;
161 	}
162 
163 	/*
164 	 * The timer was scheduled sometime back for firing in
165 	 * 'igmp_time_to_next' ms and is active. We need to
166 	 * reschedule the timeout if the new 'next' will happen
167 	 * earlier than the currently scheduled timeout
168 	 */
169 	time_left = ipst->ips_igmp_timer_scheduled_last +
170 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
171 	if (time_left < MSEC_TO_TICK(next)) {
172 		ipst->ips_igmp_timer_setter_active = B_FALSE;
173 		mutex_exit(&ipst->ips_igmp_timer_lock);
174 		return;
175 	}
176 
177 	mutex_exit(&ipst->ips_igmp_timer_lock);
178 	ret = untimeout(ipst->ips_igmp_timeout_id);
179 	mutex_enter(&ipst->ips_igmp_timer_lock);
180 	/*
181 	 * The timeout was cancelled, or the timeout handler
182 	 * completed, while we were blocked in the untimeout.
183 	 * No other thread could have set the timer meanwhile
184 	 * since we serialized all the timer setters. Thus
185 	 * no timer is currently active nor executing nor will
186 	 * any timer fire in the future. We start the timer now
187 	 * if needed.
188 	 */
189 	if (ret == -1) {
190 		ASSERT(ipst->ips_igmp_timeout_id == 0);
191 	} else {
192 		ASSERT(ipst->ips_igmp_timeout_id != 0);
193 		ipst->ips_igmp_timeout_id = 0;
194 	}
195 	if (ipst->ips_igmp_time_to_next != 0) {
196 		ipst->ips_igmp_time_to_next =
197 		    MIN(ipst->ips_igmp_time_to_next, next);
198 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
199 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
200 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
201 	}
202 	ipst->ips_igmp_timer_setter_active = B_FALSE;
203 	mutex_exit(&ipst->ips_igmp_timer_lock);
204 }
205 
206 /*
207  * mld_start_timers:
208  * The unit for next is milliseconds.
209  */
210 void
211 mld_start_timers(unsigned next, ip_stack_t *ipst)
212 {
213 	int	time_left;
214 	int	ret;
215 
216 	ASSERT(next != 0 && next != INFINITY);
217 
218 	mutex_enter(&ipst->ips_mld_timer_lock);
219 	if (ipst->ips_mld_timer_setter_active) {
220 		/*
221 		 * Serialize timer setters, one at a time. If the
222 		 * timer is currently being set by someone,
223 		 * just record the next time when it has to be
224 		 * invoked and return. The current setter will
225 		 * take care.
226 		 */
227 		ipst->ips_mld_time_to_next =
228 		    MIN(ipst->ips_mld_time_to_next, next);
229 		mutex_exit(&ipst->ips_mld_timer_lock);
230 		return;
231 	} else {
232 		ipst->ips_mld_timer_setter_active = B_TRUE;
233 	}
234 	if (ipst->ips_mld_timeout_id == 0) {
235 		/*
236 		 * The timer is inactive. We need to start a timer
237 		 */
238 		ipst->ips_mld_time_to_next = next;
239 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
240 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
241 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
242 		ipst->ips_mld_timer_setter_active = B_FALSE;
243 		mutex_exit(&ipst->ips_mld_timer_lock);
244 		return;
245 	}
246 
247 	/*
248 	 * The timer was scheduled sometime back for firing in
249 	 * 'igmp_time_to_next' ms and is active. We need to
250 	 * reschedule the timeout if the new 'next' will happen
251 	 * earlier than the currently scheduled timeout
252 	 */
253 	time_left = ipst->ips_mld_timer_scheduled_last +
254 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
255 	if (time_left < MSEC_TO_TICK(next)) {
256 		ipst->ips_mld_timer_setter_active = B_FALSE;
257 		mutex_exit(&ipst->ips_mld_timer_lock);
258 		return;
259 	}
260 
261 	mutex_exit(&ipst->ips_mld_timer_lock);
262 	ret = untimeout(ipst->ips_mld_timeout_id);
263 	mutex_enter(&ipst->ips_mld_timer_lock);
264 	/*
265 	 * The timeout was cancelled, or the timeout handler
266 	 * completed, while we were blocked in the untimeout.
267 	 * No other thread could have set the timer meanwhile
268 	 * since we serialized all the timer setters. Thus
269 	 * no timer is currently active nor executing nor will
270 	 * any timer fire in the future. We start the timer now
271 	 * if needed.
272 	 */
273 	if (ret == -1) {
274 		ASSERT(ipst->ips_mld_timeout_id == 0);
275 	} else {
276 		ASSERT(ipst->ips_mld_timeout_id != 0);
277 		ipst->ips_mld_timeout_id = 0;
278 	}
279 	if (ipst->ips_mld_time_to_next != 0) {
280 		ipst->ips_mld_time_to_next =
281 		    MIN(ipst->ips_mld_time_to_next, next);
282 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
283 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
284 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
285 	}
286 	ipst->ips_mld_timer_setter_active = B_FALSE;
287 	mutex_exit(&ipst->ips_mld_timer_lock);
288 }
289 
290 /*
291  * igmp_input:
292  * Return NULL for a bad packet that is discarded here.
293  * Return mp if the message is OK and should be handed to "raw" receivers.
294  * Callers of igmp_input() may need to reinitialize variables that were copied
295  * from the mblk as this calls pullupmsg().
296  */
297 /* ARGSUSED */
298 mblk_t *
299 igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
300 {
301 	igmpa_t 	*igmpa;
302 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
303 	int		iphlen, igmplen, mblklen;
304 	ilm_t 		*ilm;
305 	uint32_t	src, dst;
306 	uint32_t 	group;
307 	uint_t		next;
308 	ipif_t 		*ipif;
309 	ip_stack_t	 *ipst;
310 
311 	ASSERT(ill != NULL);
312 	ASSERT(!ill->ill_isv6);
313 	ipst = ill->ill_ipst;
314 	++ipst->ips_igmpstat.igps_rcv_total;
315 
316 	mblklen = MBLKL(mp);
317 	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
318 		++ipst->ips_igmpstat.igps_rcv_tooshort;
319 		goto bad_pkt;
320 	}
321 	igmplen = ntohs(ipha->ipha_length) - iphlen;
322 	/*
323 	 * Since msg sizes are more variable with v3, just pullup the
324 	 * whole thing now.
325 	 */
326 	if (MBLKL(mp) < (igmplen + iphlen)) {
327 		mblk_t *mp1;
328 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
329 			++ipst->ips_igmpstat.igps_rcv_tooshort;
330 			goto bad_pkt;
331 		}
332 		freemsg(mp);
333 		mp = mp1;
334 		ipha = (ipha_t *)(mp->b_rptr);
335 	}
336 
337 	/*
338 	 * Validate lengths
339 	 */
340 	if (igmplen < IGMP_MINLEN) {
341 		++ipst->ips_igmpstat.igps_rcv_tooshort;
342 		goto bad_pkt;
343 	}
344 	/*
345 	 * Validate checksum
346 	 */
347 	if (IP_CSUM(mp, iphlen, 0)) {
348 		++ipst->ips_igmpstat.igps_rcv_badsum;
349 		goto bad_pkt;
350 	}
351 
352 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
353 	src = ipha->ipha_src;
354 	dst = ipha->ipha_dst;
355 	if (ip_debug > 1)
356 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
357 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
358 		    (int)ntohl(src), (int)ntohl(dst),
359 		    ill->ill_name);
360 
361 	switch (igmpa->igmpa_type) {
362 	case IGMP_MEMBERSHIP_QUERY:
363 		/*
364 		 * packet length differentiates between v1/v2 and v3
365 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
366 		 */
367 		if ((igmplen == IGMP_MINLEN) ||
368 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
369 			next = igmp_query_in(ipha, igmpa, ill);
370 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
371 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
372 			    igmplen);
373 		} else {
374 			++ipst->ips_igmpstat.igps_rcv_tooshort;
375 			goto bad_pkt;
376 		}
377 		if (next == 0)
378 			goto bad_pkt;
379 
380 		if (next != INFINITY)
381 			igmp_start_timers(next, ipst);
382 
383 		break;
384 
385 	case IGMP_V1_MEMBERSHIP_REPORT:
386 	case IGMP_V2_MEMBERSHIP_REPORT:
387 		/*
388 		 * For fast leave to work, we have to know that we are the
389 		 * last person to send a report for this group. Reports
390 		 * generated by us are looped back since we could potentially
391 		 * be a multicast router, so discard reports sourced by me.
392 		 */
393 		mutex_enter(&ill->ill_lock);
394 		for (ipif = ill->ill_ipif; ipif != NULL;
395 		    ipif = ipif->ipif_next) {
396 			if (ipif->ipif_lcl_addr == src) {
397 				if (ip_debug > 1) {
398 					(void) mi_strlog(ill->ill_rq,
399 					    1,
400 					    SL_TRACE,
401 					    "igmp_input: we are only "
402 					    "member src 0x%x ipif_local 0x%x",
403 					    (int)ntohl(src),
404 					    (int)
405 					    ntohl(ipif->ipif_lcl_addr));
406 				}
407 				mutex_exit(&ill->ill_lock);
408 				return (mp);
409 			}
410 		}
411 		mutex_exit(&ill->ill_lock);
412 
413 		++ipst->ips_igmpstat.igps_rcv_reports;
414 		group = igmpa->igmpa_group;
415 		if (!CLASSD(group)) {
416 			++ipst->ips_igmpstat.igps_rcv_badreports;
417 			goto bad_pkt;
418 		}
419 
420 		/*
421 		 * KLUDGE: if the IP source address of the report has an
422 		 * unspecified (i.e., zero) subnet number, as is allowed for
423 		 * a booting host, replace it with the correct subnet number
424 		 * so that a process-level multicast routing demon can
425 		 * determine which subnet it arrived from.  This is necessary
426 		 * to compensate for the lack of any way for a process to
427 		 * determine the arrival interface of an incoming packet.
428 		 *
429 		 * Requires that a copy of *this* message it passed up
430 		 * to the raw interface which is done by our caller.
431 		 */
432 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
433 			/* Pick the first ipif on this ill */
434 			mutex_enter(&ill->ill_lock);
435 			src = ill->ill_ipif->ipif_subnet;
436 			mutex_exit(&ill->ill_lock);
437 			ip1dbg(("igmp_input: changed src to 0x%x\n",
438 			    (int)ntohl(src)));
439 			ipha->ipha_src = src;
440 		}
441 
442 		/*
443 		 * If we belong to the group being reported, and
444 		 * we are a 'Delaying member' in the RFC terminology,
445 		 * stop our timer for that group and 'clear flag' i.e.
446 		 * mark as IGMP_OTHERMEMBER. Do this for all logical
447 		 * interfaces on the given physical interface.
448 		 */
449 		mutex_enter(&ill->ill_lock);
450 		for (ipif = ill->ill_ipif; ipif != NULL;
451 		    ipif = ipif->ipif_next) {
452 			ilm = ilm_lookup_ipif(ipif, group);
453 			if (ilm != NULL) {
454 				++ipst->ips_igmpstat.igps_rcv_ourreports;
455 				ilm->ilm_timer = INFINITY;
456 				ilm->ilm_state = IGMP_OTHERMEMBER;
457 			}
458 		} /* for */
459 		mutex_exit(&ill->ill_lock);
460 		break;
461 
462 	case IGMP_V3_MEMBERSHIP_REPORT:
463 		/*
464 		 * Currently nothing to do here; IGMP router is not
465 		 * implemented in ip, and v3 hosts don't pay attention
466 		 * to membership reports.
467 		 */
468 		break;
469 	}
470 	/*
471 	 * Pass all valid IGMP packets up to any process(es) listening
472 	 * on a raw IGMP socket. Do not free the packet.
473 	 */
474 	return (mp);
475 
476 bad_pkt:
477 	freemsg(mp);
478 	return (NULL);
479 }
480 
481 static uint_t
482 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
483 {
484 	ilm_t	*ilm;
485 	int	timer;
486 	uint_t	next, current;
487 	ip_stack_t	 *ipst;
488 
489 	ipst = ill->ill_ipst;
490 	++ipst->ips_igmpstat.igps_rcv_queries;
491 
492 	/*
493 	 * In the IGMPv2 specification, there are 3 states and a flag.
494 	 *
495 	 * In Non-Member state, we simply don't have a membership record.
496 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
497 	 * < INFINITY).  In Idle Member state, our timer is not running
498 	 * (ilm->ilm_timer == INFINITY).
499 	 *
500 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
501 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
502 	 * if I sent the last report.
503 	 */
504 	if ((igmpa->igmpa_code == 0) ||
505 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
506 		/*
507 		 * Query from an old router.
508 		 * Remember that the querier on this interface is old,
509 		 * and set the timer to the value in RFC 1112.
510 		 */
511 
512 
513 		mutex_enter(&ill->ill_lock);
514 		ill->ill_mcast_v1_time = 0;
515 		ill->ill_mcast_v1_tset = 1;
516 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
517 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
518 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
519 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
520 			ill->ill_mcast_type = IGMP_V1_ROUTER;
521 		}
522 		mutex_exit(&ill->ill_lock);
523 
524 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
525 
526 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
527 		    igmpa->igmpa_group != 0) {
528 			++ipst->ips_igmpstat.igps_rcv_badqueries;
529 			return (0);
530 		}
531 
532 	} else {
533 		in_addr_t group;
534 
535 		/*
536 		 * Query from a new router
537 		 * Simply do a validity check
538 		 */
539 		group = igmpa->igmpa_group;
540 		if (group != 0 && (!CLASSD(group))) {
541 			++ipst->ips_igmpstat.igps_rcv_badqueries;
542 			return (0);
543 		}
544 
545 		/*
546 		 * Switch interface state to v2 on receipt of a v2 query
547 		 * ONLY IF current state is v3.  Let things be if current
548 		 * state if v1 but do reset the v2-querier-present timer.
549 		 */
550 		mutex_enter(&ill->ill_lock);
551 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
552 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
553 			    "to IGMP_V2_ROUTER", ill->ill_name));
554 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
555 			ill->ill_mcast_type = IGMP_V2_ROUTER;
556 		}
557 		ill->ill_mcast_v2_time = 0;
558 		ill->ill_mcast_v2_tset = 1;
559 		mutex_exit(&ill->ill_lock);
560 
561 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
562 	}
563 
564 	if (ip_debug > 1) {
565 		mutex_enter(&ill->ill_lock);
566 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
567 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
568 		    (int)ntohs(igmpa->igmpa_code),
569 		    (int)ntohs(igmpa->igmpa_type));
570 		mutex_exit(&ill->ill_lock);
571 	}
572 
573 	/*
574 	 * -Start the timers in all of our membership records
575 	 *  for the physical interface on which the query
576 	 *  arrived, excluding those that belong to the "all
577 	 *  hosts" group (224.0.0.1).
578 	 *
579 	 * -Restart any timer that is already running but has
580 	 *  a value longer than the requested timeout.
581 	 *
582 	 * -Use the value specified in the query message as
583 	 *  the maximum timeout.
584 	 */
585 	next = (unsigned)INFINITY;
586 	mutex_enter(&ill->ill_lock);
587 
588 	current = CURRENT_MSTIME;
589 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
590 
591 		/*
592 		 * A multicast router joins INADDR_ANY address
593 		 * to enable promiscuous reception of all
594 		 * mcasts from the interface. This INADDR_ANY
595 		 * is stored in the ilm_v6addr as V6 unspec addr
596 		 */
597 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
598 			continue;
599 		if (ilm->ilm_addr == htonl(INADDR_ANY))
600 			continue;
601 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
602 		    (igmpa->igmpa_group == 0) ||
603 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
604 			if (ilm->ilm_timer > timer) {
605 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
606 				if (ilm->ilm_timer < next)
607 					next = ilm->ilm_timer;
608 				ilm->ilm_timer += current;
609 			}
610 		}
611 	}
612 	mutex_exit(&ill->ill_lock);
613 
614 	return (next);
615 }
616 
617 static uint_t
618 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
619 {
620 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
621 	uint_t		current;
622 	ilm_t		*ilm;
623 	ipaddr_t	*src_array;
624 	uint8_t		qrv;
625 	ip_stack_t	 *ipst;
626 
627 	ipst = ill->ill_ipst;
628 	/* make sure numsrc matches packet size */
629 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
630 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
631 		++ipst->ips_igmpstat.igps_rcv_tooshort;
632 		return (0);
633 	}
634 	src_array = (ipaddr_t *)&igmp3qa[1];
635 
636 	++ipst->ips_igmpstat.igps_rcv_queries;
637 
638 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
639 		uint_t hdrval, mant, exp;
640 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
641 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
642 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
643 		mrd = (mant | 0x10) << (exp + 3);
644 	}
645 	if (mrd == 0)
646 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
647 	timer = DSEC_TO_MSEC(mrd);
648 	MCAST_RANDOM_DELAY(delay, timer);
649 	next = (unsigned)INFINITY;
650 	current = CURRENT_MSTIME;
651 
652 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
653 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
654 	else
655 		ill->ill_mcast_rv = qrv;
656 
657 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
658 		uint_t hdrval, mant, exp;
659 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
660 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
661 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
662 		qqi = (mant | 0x10) << (exp + 3);
663 	}
664 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
665 
666 	/*
667 	 * If we have a pending general query response that's scheduled
668 	 * sooner than the delay we calculated for this response, then
669 	 * no action is required (RFC3376 section 5.2 rule 1)
670 	 */
671 	mutex_enter(&ill->ill_lock);
672 	if (ill->ill_global_timer < (current + delay)) {
673 		mutex_exit(&ill->ill_lock);
674 		return (next);
675 	}
676 	mutex_exit(&ill->ill_lock);
677 
678 	/*
679 	 * Now take action depending upon query type:
680 	 * general, group specific, or group/source specific.
681 	 */
682 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
683 		/*
684 		 * general query
685 		 * We know global timer is either not running or is
686 		 * greater than our calculated delay, so reset it to
687 		 * our delay (random value in range [0, response time]).
688 		 */
689 		mutex_enter(&ill->ill_lock);
690 		ill->ill_global_timer =  current + delay;
691 		mutex_exit(&ill->ill_lock);
692 		next = delay;
693 
694 	} else {
695 		/* group or group/source specific query */
696 		mutex_enter(&ill->ill_lock);
697 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
698 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
699 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
700 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
701 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
702 				continue;
703 			/*
704 			 * If the query is group specific or we have a
705 			 * pending group specific query, the response is
706 			 * group specific (pending sources list should be
707 			 * empty).  Otherwise, need to update the pending
708 			 * sources list for the group and source specific
709 			 * response.
710 			 */
711 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
712 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
713 group_query:
714 				FREE_SLIST(ilm->ilm_pendsrcs);
715 				ilm->ilm_pendsrcs = NULL;
716 			} else {
717 				boolean_t overflow;
718 				slist_t *pktl;
719 				if (numsrc > MAX_FILTER_SIZE ||
720 				    (ilm->ilm_pendsrcs == NULL &&
721 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
722 					/*
723 					 * We've been sent more sources than
724 					 * we can deal with; or we can't deal
725 					 * with a source list at all.  Revert
726 					 * to a group specific query.
727 					 */
728 					goto group_query;
729 				}
730 				if ((pktl = l_alloc()) == NULL)
731 					goto group_query;
732 				pktl->sl_numsrc = numsrc;
733 				for (i = 0; i < numsrc; i++)
734 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
735 					    &(pktl->sl_addr[i]));
736 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
737 				    &overflow);
738 				l_free(pktl);
739 				if (overflow)
740 					goto group_query;
741 			}
742 
743 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
744 			    INFINITY : (ilm->ilm_timer - current);
745 			/* choose soonest timer */
746 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
747 			if (ilm->ilm_timer < next)
748 				next = ilm->ilm_timer;
749 			ilm->ilm_timer += current;
750 		}
751 		mutex_exit(&ill->ill_lock);
752 	}
753 
754 	return (next);
755 }
756 
757 void
758 igmp_joingroup(ilm_t *ilm)
759 {
760 	uint_t	timer;
761 	ill_t	*ill;
762 	ip_stack_t	*ipst = ilm->ilm_ipst;
763 
764 	ill = ilm->ilm_ipif->ipif_ill;
765 
766 	ASSERT(IAM_WRITER_ILL(ill));
767 	ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6);
768 
769 	mutex_enter(&ill->ill_lock);
770 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
771 		ilm->ilm_rtx.rtx_timer = INFINITY;
772 		ilm->ilm_state = IGMP_OTHERMEMBER;
773 		mutex_exit(&ill->ill_lock);
774 	} else {
775 		ip1dbg(("Querier mode %d, sending report, group %x\n",
776 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
777 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
778 			mutex_exit(&ill->ill_lock);
779 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
780 			mutex_enter(&ill->ill_lock);
781 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
782 			mutex_exit(&ill->ill_lock);
783 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
784 			mutex_enter(&ill->ill_lock);
785 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
786 			mrec_t *rp;
787 			mcast_record_t rtype;
788 			/*
789 			 * The possible state changes we need to handle here:
790 			 *   Old State	New State	Report
791 			 *
792 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
793 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
794 			 *
795 			 * No need to send the BLOCK(0) report; ALLOW(X)
796 			 * is enough.
797 			 */
798 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
799 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
800 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
801 			    ilm->ilm_filter, NULL);
802 			mutex_exit(&ill->ill_lock);
803 			igmpv3_sendrpt(ilm->ilm_ipif, rp);
804 			mutex_enter(&ill->ill_lock);
805 			/*
806 			 * Set up retransmission state.  Timer is set below,
807 			 * for both v3 and older versions.
808 			 */
809 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
810 			    ilm->ilm_filter);
811 		}
812 
813 		/* Set the ilm timer value */
814 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
815 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
816 		timer = ilm->ilm_rtx.rtx_timer;
817 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
818 		ilm->ilm_state = IGMP_IREPORTEDLAST;
819 		mutex_exit(&ill->ill_lock);
820 
821 		/*
822 		 * To avoid deadlock, we defer igmp_start_timers() to
823 		 * ipsq_exit().  See the comment in ipsq_exit() for details.
824 		 */
825 		mutex_enter(&ipst->ips_igmp_timer_lock);
826 		ipst->ips_igmp_deferred_next = MIN(timer,
827 		    ipst->ips_igmp_deferred_next);
828 		mutex_exit(&ipst->ips_igmp_timer_lock);
829 	}
830 
831 	if (ip_debug > 1) {
832 		(void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE,
833 		    "igmp_joingroup: multicast_type %d timer %d",
834 		    (ilm->ilm_ipif->ipif_ill->ill_mcast_type),
835 		    (int)ntohl(timer));
836 	}
837 }
838 
839 void
840 mld_joingroup(ilm_t *ilm)
841 {
842 	uint_t	timer;
843 	ill_t	*ill;
844 	ip_stack_t	*ipst = ilm->ilm_ipst;
845 
846 	ill = ilm->ilm_ill;
847 
848 	ASSERT(IAM_WRITER_ILL(ill));
849 	ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6);
850 
851 	mutex_enter(&ill->ill_lock);
852 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
853 		ilm->ilm_rtx.rtx_timer = INFINITY;
854 		ilm->ilm_state = IGMP_OTHERMEMBER;
855 		mutex_exit(&ill->ill_lock);
856 	} else {
857 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
858 			mutex_exit(&ill->ill_lock);
859 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
860 			mutex_enter(&ill->ill_lock);
861 		} else {
862 			mrec_t *rp;
863 			mcast_record_t rtype;
864 			/*
865 			 * The possible state changes we need to handle here:
866 			 *	Old State   New State	Report
867 			 *
868 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
869 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
870 			 *
871 			 * No need to send the BLOCK(0) report; ALLOW(X)
872 			 * is enough
873 			 */
874 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
875 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
876 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
877 			    ilm->ilm_filter, NULL);
878 			mutex_exit(&ill->ill_lock);
879 			mldv2_sendrpt(ill, rp);
880 			mutex_enter(&ill->ill_lock);
881 			/*
882 			 * Set up retransmission state.  Timer is set below,
883 			 * for both v2 and v1.
884 			 */
885 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
886 			    ilm->ilm_filter);
887 		}
888 
889 		/* Set the ilm timer value */
890 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
891 		    ilm->ilm_rtx.rtx_cnt > 0);
892 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
893 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
894 		timer = ilm->ilm_rtx.rtx_timer;
895 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
896 		ilm->ilm_state = IGMP_IREPORTEDLAST;
897 		mutex_exit(&ill->ill_lock);
898 
899 		/*
900 		 * To avoid deadlock, we defer mld_start_timers() to
901 		 * ipsq_exit().  See the comment in ipsq_exit() for details.
902 		 */
903 		mutex_enter(&ipst->ips_mld_timer_lock);
904 		ipst->ips_mld_deferred_next = MIN(timer,
905 		    ipst->ips_mld_deferred_next);
906 		mutex_exit(&ipst->ips_mld_timer_lock);
907 	}
908 
909 	if (ip_debug > 1) {
910 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
911 		    "mld_joingroup: multicast_type %d timer %d",
912 		    (ilm->ilm_ill->ill_mcast_type),
913 		    (int)ntohl(timer));
914 	}
915 }
916 
917 void
918 igmp_leavegroup(ilm_t *ilm)
919 {
920 	ill_t *ill = ilm->ilm_ipif->ipif_ill;
921 
922 	ASSERT(ilm->ilm_ill == NULL);
923 	ASSERT(!ill->ill_isv6);
924 
925 	mutex_enter(&ill->ill_lock);
926 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
927 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
928 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
929 		mutex_exit(&ill->ill_lock);
930 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
931 		    (htonl(INADDR_ALLRTRS_GROUP)));
932 		return;
933 	} else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
934 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
935 		mrec_t *rp;
936 		/*
937 		 * The possible state changes we need to handle here:
938 		 *	Old State	New State	Report
939 		 *
940 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
941 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
942 		 *
943 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
944 		 */
945 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
946 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
947 			    ilm->ilm_filter, NULL);
948 		} else {
949 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
950 			    NULL, NULL);
951 		}
952 		mutex_exit(&ill->ill_lock);
953 		igmpv3_sendrpt(ilm->ilm_ipif, rp);
954 		return;
955 	}
956 	mutex_exit(&ill->ill_lock);
957 }
958 
959 void
960 mld_leavegroup(ilm_t *ilm)
961 {
962 	ill_t *ill = ilm->ilm_ill;
963 
964 	ASSERT(ilm->ilm_ipif == NULL);
965 	ASSERT(ill->ill_isv6);
966 
967 	mutex_enter(&ill->ill_lock);
968 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
969 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
970 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
971 		mutex_exit(&ill->ill_lock);
972 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
973 		return;
974 	} else if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
975 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
976 		mrec_t *rp;
977 		/*
978 		 * The possible state changes we need to handle here:
979 		 *	Old State	New State	Report
980 		 *
981 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
982 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
983 		 *
984 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
985 		 */
986 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
987 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
988 			    ilm->ilm_filter, NULL);
989 		} else {
990 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
991 			    NULL, NULL);
992 		}
993 		mutex_exit(&ill->ill_lock);
994 		mldv2_sendrpt(ill, rp);
995 		return;
996 	}
997 	mutex_exit(&ill->ill_lock);
998 }
999 
1000 void
1001 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1002 {
1003 	ill_t *ill;
1004 	mrec_t *rp;
1005 	ip_stack_t	*ipst = ilm->ilm_ipst;
1006 
1007 	ASSERT(ilm != NULL);
1008 
1009 	/* state change reports should only be sent if the router is v3 */
1010 	if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER)
1011 		return;
1012 
1013 	if (ilm->ilm_ill == NULL) {
1014 		ASSERT(ilm->ilm_ipif != NULL);
1015 		ill = ilm->ilm_ipif->ipif_ill;
1016 	} else {
1017 		ill = ilm->ilm_ill;
1018 	}
1019 
1020 	mutex_enter(&ill->ill_lock);
1021 
1022 	/*
1023 	 * Compare existing(old) state with the new state and prepare
1024 	 * State Change Report, according to the rules in RFC 3376:
1025 	 *
1026 	 *	Old State	New State	State Change Report
1027 	 *
1028 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1029 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1030 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1031 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1032 	 */
1033 
1034 	if (ilm->ilm_fmode == fmode) {
1035 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1036 		slist_t *allow, *block;
1037 		if (((a_minus_b = l_alloc()) == NULL) ||
1038 		    ((b_minus_a = l_alloc()) == NULL)) {
1039 			l_free(a_minus_b);
1040 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1041 				goto send_to_ex;
1042 			else
1043 				goto send_to_in;
1044 		}
1045 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1046 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1047 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1048 			allow = b_minus_a;
1049 			block = a_minus_b;
1050 		} else {
1051 			allow = a_minus_b;
1052 			block = b_minus_a;
1053 		}
1054 		rp = NULL;
1055 		if (!SLIST_IS_EMPTY(allow))
1056 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1057 			    allow, rp);
1058 		if (!SLIST_IS_EMPTY(block))
1059 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1060 			    block, rp);
1061 		l_free(a_minus_b);
1062 		l_free(b_minus_a);
1063 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1064 send_to_ex:
1065 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1066 		    NULL);
1067 	} else {
1068 send_to_in:
1069 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1070 		    NULL);
1071 	}
1072 
1073 	/*
1074 	 * Need to set up retransmission state; merge the new info with the
1075 	 * current state (which may be null).  If the timer is not currently
1076 	 * running, start it (need to do a delayed start of the timer as
1077 	 * we're currently in the sq).
1078 	 */
1079 	rp = mcast_merge_rtx(ilm, rp, flist);
1080 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1081 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1082 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1083 		mutex_enter(&ipst->ips_igmp_timer_lock);
1084 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1085 		    ilm->ilm_rtx.rtx_timer);
1086 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1087 		mutex_exit(&ipst->ips_igmp_timer_lock);
1088 	}
1089 
1090 	mutex_exit(&ill->ill_lock);
1091 	igmpv3_sendrpt(ilm->ilm_ipif, rp);
1092 }
1093 
1094 void
1095 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1096 {
1097 	ill_t *ill;
1098 	mrec_t *rp = NULL;
1099 	ip_stack_t	*ipst = ilm->ilm_ipst;
1100 
1101 	ASSERT(ilm != NULL);
1102 
1103 	ill = ilm->ilm_ill;
1104 
1105 	/* only need to send if we have an mldv2-capable router */
1106 	mutex_enter(&ill->ill_lock);
1107 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1108 		mutex_exit(&ill->ill_lock);
1109 		return;
1110 	}
1111 
1112 	/*
1113 	 * Compare existing (old) state with the new state passed in
1114 	 * and send appropriate MLDv2 State Change Report.
1115 	 *
1116 	 *	Old State	New State	State Change Report
1117 	 *
1118 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1119 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1120 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1121 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1122 	 */
1123 	if (ilm->ilm_fmode == fmode) {
1124 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1125 		slist_t *allow, *block;
1126 		if (((a_minus_b = l_alloc()) == NULL) ||
1127 		    ((b_minus_a = l_alloc()) == NULL)) {
1128 			l_free(a_minus_b);
1129 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1130 				goto send_to_ex;
1131 			else
1132 				goto send_to_in;
1133 		}
1134 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1135 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1136 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1137 			allow = b_minus_a;
1138 			block = a_minus_b;
1139 		} else {
1140 			allow = a_minus_b;
1141 			block = b_minus_a;
1142 		}
1143 		if (!SLIST_IS_EMPTY(allow))
1144 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1145 			    allow, rp);
1146 		if (!SLIST_IS_EMPTY(block))
1147 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1148 			    block, rp);
1149 		l_free(a_minus_b);
1150 		l_free(b_minus_a);
1151 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1152 send_to_ex:
1153 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1154 		    NULL);
1155 	} else {
1156 send_to_in:
1157 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1158 		    NULL);
1159 	}
1160 
1161 	/*
1162 	 * Need to set up retransmission state; merge the new info with the
1163 	 * current state (which may be null).  If the timer is not currently
1164 	 * running, start it (need to do a deferred start of the timer as
1165 	 * we're currently in the sq).
1166 	 */
1167 	rp = mcast_merge_rtx(ilm, rp, flist);
1168 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1169 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1170 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1171 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1172 		mutex_enter(&ipst->ips_mld_timer_lock);
1173 		ipst->ips_mld_deferred_next =
1174 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1175 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1176 		mutex_exit(&ipst->ips_mld_timer_lock);
1177 	}
1178 
1179 	mutex_exit(&ill->ill_lock);
1180 	mldv2_sendrpt(ill, rp);
1181 }
1182 
/*
 * igmp_timeout_handler_per_ill:
 * Service every IGMP timer on `ill' that has expired: the interface-wide
 * timer (ill_global_timer), each ilm's group timer (ilm_timer) and each
 * ilm's retransmit timer (ilm_rtx.rtx_timer).  The caller must be the
 * writer for the ill (asserted below).  ill_lock is held while timer
 * state is examined and is dropped around every call that transmits.
 *
 * Returns the smallest remaining interval, relative to the time sampled
 * at entry with CURRENT_MSTIME, among the timers still pending on this
 * ill, or INFINITY when none are pending.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill)
{
	uint_t	next = INFINITY, current;
	ilm_t	*ilm;
	ipif_t	*ipif;
	mrec_t	*rp = NULL;
	mrec_t	*rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* One time sample is used for every comparison in this pass. */
	current = CURRENT_MSTIME;
	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	/* A timer due within CURRENT_OFFSET of now is treated as expired. */
	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			/* Chain this group's record onto its ipif's list. */
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			/* ill_lock is not held across the transmit. */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		/* Not yet due: it is a candidate for the next wakeup. */
		if ((ill->ill_global_timer - current) < next)
			next = ill->ill_global_timer - current;
	}

per_ilm_timer:
	/*
	 * Each iteration handles the ilm's group timer first and then,
	 * from per_ilm_rtxtimer on, its retransmit timer.
	 */
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
			if ((ilm->ilm_timer - current) < next)
				next = ilm->ilm_timer - current;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer - current),
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			/*
			 * NOTE(review): this passes the head of the ill's
			 * ipif list (ill->ill_ipif), whereas the retransmit
			 * path below passes ilm->ilm_ipif -- confirm that
			 * the difference is intended.
			 */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/* Start a fresh record list for the next ilm. */
			rp = NULL;
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
			if ((rtxp->rtx_timer - current) < next)
				next = rtxp->rtx_timer - current;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/*
			 * More retransmissions remain: pick a new delay,
			 * fold it into `next' while it is still relative,
			 * then make it absolute.
			 */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			rtxp->rtx_timer += current;
		} else {
			/* Last retransmission: drop the saved source lists. */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1385 
1386 /*
1387  * igmp_timeout_handler:
1388  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1389  * Returns number of ticks to next event (or 0 if none).
1390  *
1391  * As part of multicast join and leave igmp we may need to send out an
1392  * igmp request. The igmp related state variables in the ilm are protected
1393  * by ill_lock. A single global igmp timer is used to track igmp timeouts.
1394  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1395  * starts the igmp timer if needed. It serializes multiple threads trying to
1396  * simultaneously start the timer using the igmp_timer_setter_active flag.
1397  *
1398  * igmp_input() receives igmp queries and responds to the queries
1399  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1400  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1401  * performs the action exclusively after entering each ill's ipsq as writer.
1402  * The actual igmp timeout handler needs to run in the ipsq since it has to
1403  * access the ilm's and we don't want another exclusive operation like
1404  * say an IPMP failover to be simultaneously moving the ilms from one ill to
1405  * another.
1406  *
1407  * The igmp_slowtimeo() function is called thru another timer.
1408  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1409  */
1410 void
1411 igmp_timeout_handler(void *arg)
1412 {
1413 	ill_t	*ill;
1414 	uint_t  global_next = INFINITY;
1415 	uint_t  next;
1416 	ill_walk_context_t ctx;
1417 	boolean_t success;
1418 	ip_stack_t *ipst = arg;
1419 
1420 	ASSERT(arg != NULL);
1421 	mutex_enter(&ipst->ips_igmp_timer_lock);
1422 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1423 	ipst->ips_igmp_timer_thread = curthread;
1424 	ipst->ips_igmp_timer_scheduled_last = 0;
1425 	ipst->ips_igmp_time_to_next = 0;
1426 	mutex_exit(&ipst->ips_igmp_timer_lock);
1427 
1428 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1429 	ill = ILL_START_WALK_V4(&ctx, ipst);
1430 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1431 		ASSERT(!ill->ill_isv6);
1432 		/*
1433 		 * We may not be able to refhold the ill if the ill/ipif
1434 		 * is changing. But we need to make sure that the ill will
1435 		 * not vanish. So we just bump up the ill_waiter count.
1436 		 */
1437 		if (!ill_waiter_inc(ill))
1438 			continue;
1439 		rw_exit(&ipst->ips_ill_g_lock);
1440 		success = ipsq_enter(ill, B_TRUE, NEW_OP);
1441 		if (success) {
1442 			next = igmp_timeout_handler_per_ill(ill);
1443 			if (next < global_next)
1444 				global_next = next;
1445 			ipsq_exit(ill->ill_phyint->phyint_ipsq);
1446 		}
1447 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1448 		ill_waiter_dcr(ill);
1449 	}
1450 	rw_exit(&ipst->ips_ill_g_lock);
1451 
1452 	mutex_enter(&ipst->ips_igmp_timer_lock);
1453 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1454 	ipst->ips_igmp_timeout_id = 0;
1455 	ipst->ips_igmp_timer_thread = NULL;
1456 	mutex_exit(&ipst->ips_igmp_timer_lock);
1457 
1458 	if (global_next != INFINITY)
1459 		igmp_start_timers(global_next, ipst);
1460 }
1461 
/*
 * mld_timeout_handler_per_ill:
 * Service every MLD timer on `ill' that has expired: the interface-wide
 * timer (ill_global_timer), each ilm's group timer (ilm_timer) and each
 * ilm's retransmit timer (ilm_rtx.rtx_timer).  The caller must be the
 * writer for the ill (asserted below).  ill_lock is held while timer
 * state is examined and is dropped around every call that transmits.
 *
 * Returns the smallest remaining interval, relative to the time sampled
 * at entry with CURRENT_MSTIME, among the timers still pending on this
 * ill, or INFINITY when none are pending.
 */
/* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY, current;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* One time sample is used for every comparison in this pass. */
	current = CURRENT_MSTIME;
	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	/* A timer due within CURRENT_OFFSET of now is treated as expired. */
	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/* ill_lock is not held across the transmit. */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		/* Not yet due: it is a candidate for the next wakeup. */
		if ((ill->ill_global_timer - current) < next)
			next = ill->ill_global_timer - current;
	}

per_ilm_timer:
	/*
	 * Records for expired group timers accumulate in rp and records
	 * for expired retransmit timers in rtxrp; both lists are handed to
	 * mldv2_sendrpt() once, after the loop.
	 */
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
			if ((ilm->ilm_timer - current) < next)
				next = ilm->ilm_timer - current;

			if (ip_debug > 1) {
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer - current),
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * No pending source list, or no memory for
				 * rsp: report the group's whole filter.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
			if ((rtxp->rtx_timer - current) < next)
				next = rtxp->rtx_timer - current;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/*
			 * More retransmissions remain: pick a new delay,
			 * fold it into `next' while it is still relative,
			 * then make it absolute.
			 */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			rtxp->rtx_timer += current;
		} else {
			/* Last retransmission: drop the saved source lists. */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	/*
	 * Either list may still be NULL here if no timer of that kind
	 * expired in the loop above.
	 */
	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}
1646 
1647 /*
1648  * mld_timeout_handler:
1649  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1650  * Returns number of ticks to next event (or 0 if none).
1651  * MT issues are same as igmp_timeout_handler
1652  */
1653 void
1654 mld_timeout_handler(void *arg)
1655 {
1656 	ill_t	*ill;
1657 	uint_t  global_next = INFINITY;
1658 	uint_t  next;
1659 	ill_walk_context_t ctx;
1660 	boolean_t success;
1661 	ip_stack_t *ipst = arg;
1662 
1663 	ASSERT(arg != NULL);
1664 	mutex_enter(&ipst->ips_mld_timer_lock);
1665 	ASSERT(ipst->ips_mld_timeout_id != 0);
1666 	ipst->ips_mld_timer_thread = curthread;
1667 	ipst->ips_mld_timer_scheduled_last = 0;
1668 	ipst->ips_mld_time_to_next = 0;
1669 	mutex_exit(&ipst->ips_mld_timer_lock);
1670 
1671 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1672 	ill = ILL_START_WALK_V6(&ctx, ipst);
1673 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1674 		ASSERT(ill->ill_isv6);
1675 		/*
1676 		 * We may not be able to refhold the ill if the ill/ipif
1677 		 * is changing. But we need to make sure that the ill will
1678 		 * not vanish. So we just bump up the ill_waiter count.
1679 		 */
1680 		if (!ill_waiter_inc(ill))
1681 			continue;
1682 		rw_exit(&ipst->ips_ill_g_lock);
1683 		success = ipsq_enter(ill, B_TRUE, NEW_OP);
1684 		if (success) {
1685 			next = mld_timeout_handler_per_ill(ill);
1686 			if (next < global_next)
1687 				global_next = next;
1688 			ipsq_exit(ill->ill_phyint->phyint_ipsq);
1689 		}
1690 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1691 		ill_waiter_dcr(ill);
1692 	}
1693 	rw_exit(&ipst->ips_ill_g_lock);
1694 
1695 	mutex_enter(&ipst->ips_mld_timer_lock);
1696 	ASSERT(ipst->ips_mld_timeout_id != 0);
1697 	ipst->ips_mld_timeout_id = 0;
1698 	ipst->ips_mld_timer_thread = NULL;
1699 	mutex_exit(&ipst->ips_mld_timer_lock);
1700 
1701 	if (global_next != INFINITY)
1702 		mld_start_timers(global_next, ipst);
1703 }
1704 
/*
 * Calculate the Older Version Querier Present timeout value, in number
 * of slowtimo intervals, for the given ill.
 *
 * The value is ((ill_mcast_rv * ill_mcast_qi) + MCAST_QUERY_RESP_INTERVAL)
 * multiplied by 1000 and then divided by MCAST_SLOWTIMO_INTERVAL, all in
 * integer arithmetic (the factor of 1000 presumably converts seconds to
 * milliseconds -- confirm against the definitions of those constants).
 *
 * The macro expands its argument twice, so `ill' must be an expression
 * without side effects.
 */
#define	OVQP(ill) \
	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1713 /*
1714  * igmp_slowtimo:
1715  * - Resets to new router if we didnt we hear from the router
1716  *   in IGMP_AGE_THRESHOLD seconds.
1717  * - Resets slowtimeout.
1718  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1719  * IGMP version than configured.
1720  */
1721 void
1722 igmp_slowtimo(void *arg)
1723 {
1724 	ill_t	*ill;
1725 	ill_if_t *ifp;
1726 	avl_tree_t *avl_tree;
1727 	ip_stack_t *ipst = (ip_stack_t *)arg;
1728 
1729 	ASSERT(arg != NULL);
1730 	/* Hold the ill_g_lock so that we can safely walk the ill list */
1731 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1732 
1733 	/*
1734 	 * The ill_if_t list is circular, hence the odd loop parameters.
1735 	 *
1736 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1737 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1738 	 * structure (allowing us to skip if none of the instances have timers
1739 	 * running).
1740 	 */
1741 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1742 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1743 	    ifp = ifp->illif_next) {
1744 		/*
1745 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1746 		 * a V1 or V2 query now and we miss seeing the count now,
1747 		 * we will see it the next time igmp_slowtimo is called.
1748 		 */
1749 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1750 			continue;
1751 
1752 		avl_tree = &ifp->illif_avl_by_ppa;
1753 		for (ill = avl_first(avl_tree); ill != NULL;
1754 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1755 			mutex_enter(&ill->ill_lock);
1756 			if (ill->ill_mcast_v1_tset == 1)
1757 				ill->ill_mcast_v1_time++;
1758 			if (ill->ill_mcast_v2_tset == 1)
1759 				ill->ill_mcast_v2_time++;
1760 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1761 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1762 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1763 				if ((ill->ill_mcast_v2_tset > 0) ||
1764 				    (ipst->ips_igmp_max_version ==
1765 				    IGMP_V2_ROUTER)) {
1766 					ip1dbg(("V1 query timer "
1767 					    "expired on %s; switching "
1768 					    "mode to IGMP_V2\n",
1769 					    ill->ill_name));
1770 					ill->ill_mcast_type =
1771 					    IGMP_V2_ROUTER;
1772 				} else {
1773 					ip1dbg(("V1 query timer "
1774 					    "expired on %s; switching "
1775 					    "mode to IGMP_V3\n",
1776 					    ill->ill_name));
1777 					ill->ill_mcast_type =
1778 					    IGMP_V3_ROUTER;
1779 				}
1780 				ill->ill_mcast_v1_time = 0;
1781 				ill->ill_mcast_v1_tset = 0;
1782 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1783 			}
1784 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1785 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1786 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1787 				ip1dbg(("V2 query timer expired on "
1788 				    "%s; switching mode to IGMP_V3\n",
1789 				    ill->ill_name));
1790 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1791 				ill->ill_mcast_v2_time = 0;
1792 				ill->ill_mcast_v2_tset = 0;
1793 				atomic_add_16(&ifp->illif_mcast_v2, -1);
1794 			}
1795 			mutex_exit(&ill->ill_lock);
1796 		}
1797 	}
1798 	rw_exit(&ipst->ips_ill_g_lock);
1799 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1800 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1801 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1802 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1803 }
1804 
1805 /*
1806  * mld_slowtimo:
1807  * - Resets to newer version if we didn't hear from the older version router
1808  *   in MLD_AGE_THRESHOLD seconds.
1809  * - Restarts slowtimeout.
1810  * Check for ips_mld_max_version ensures that we don't revert to a higher
1811  * IGMP version than configured.
1812  */
1813 /* ARGSUSED */
1814 void
1815 mld_slowtimo(void *arg)
1816 {
1817 	ill_t *ill;
1818 	ill_if_t *ifp;
1819 	avl_tree_t *avl_tree;
1820 	ip_stack_t *ipst = (ip_stack_t *)arg;
1821 
1822 	ASSERT(arg != NULL);
1823 	/* See comments in igmp_slowtimo() above... */
1824 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1825 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1826 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1827 	    ifp = ifp->illif_next) {
1828 		if (ifp->illif_mcast_v1 == 0)
1829 			continue;
1830 
1831 		avl_tree = &ifp->illif_avl_by_ppa;
1832 		for (ill = avl_first(avl_tree); ill != NULL;
1833 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1834 			mutex_enter(&ill->ill_lock);
1835 			if (ill->ill_mcast_v1_tset == 1)
1836 				ill->ill_mcast_v1_time++;
1837 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1838 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1839 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1840 				ip1dbg(("MLD query timer expired on"
1841 				    " %s; switching mode to MLD_V2\n",
1842 				    ill->ill_name));
1843 				ill->ill_mcast_type = MLD_V2_ROUTER;
1844 				ill->ill_mcast_v1_time = 0;
1845 				ill->ill_mcast_v1_tset = 0;
1846 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1847 			}
1848 			mutex_exit(&ill->ill_lock);
1849 		}
1850 	}
1851 	rw_exit(&ipst->ips_ill_g_lock);
1852 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1853 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1854 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1855 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1856 }
1857 
1858 /*
1859  * igmp_sendpkt:
1860  * This will send to ip_wput like icmp_inbound.
1861  * Note that the lower ill (on which the membership is kept) is used
1862  * as an upper ill to pass in the multicast parameters.
1863  */
1864 static void
1865 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1866 {
1867 	mblk_t	*mp;
1868 	igmpa_t	*igmpa;
1869 	uint8_t *rtralert;
1870 	ipha_t	*ipha;
1871 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1872 	size_t	size  = hdrlen + sizeof (igmpa_t);
1873 	ipif_t 	*ipif = ilm->ilm_ipif;
1874 	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
1875 	mblk_t	*first_mp;
1876 	ipsec_out_t *io;
1877 	zoneid_t zoneid;
1878 	ip_stack_t *ipst = ill->ill_ipst;
1879 
1880 	/*
1881 	 * We need to make sure this packet goes out on an ipif. If
1882 	 * there is some global policy match in ip_wput_ire, we need
1883 	 * to get to the right interface after IPSEC processing.
1884 	 * To make sure this multicast packet goes out on the right
1885 	 * interface, we attach an ipsec_out and initialize ill_index
1886 	 * like we did in ip_wput. To make sure that this packet does
1887 	 * not get forwarded on other interfaces or looped back, we
1888 	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
1889 	 * to B_FALSE.
1890 	 *
1891 	 * We also need to make sure that this does not get load balanced
1892 	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
1893 	 * here. If it gets load balanced, switches supporting igmp snooping
1894 	 * will send the packet that it receives for this multicast group
1895 	 * to the interface that we are sending on. As we have joined the
1896 	 * multicast group on this ill, by sending the packet out on this
1897 	 * ill, we receive all the packets back on this ill.
1898 	 */
1899 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
1900 	if (first_mp == NULL)
1901 		return;
1902 
1903 	first_mp->b_datap->db_type = M_CTL;
1904 	first_mp->b_wptr += sizeof (ipsec_info_t);
1905 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
1906 	/* ipsec_out_secure is B_FALSE now */
1907 	io = (ipsec_out_t *)first_mp->b_rptr;
1908 	io->ipsec_out_type = IPSEC_OUT;
1909 	io->ipsec_out_len = sizeof (ipsec_out_t);
1910 	io->ipsec_out_use_global_policy = B_TRUE;
1911 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
1912 	io->ipsec_out_attach_if = B_TRUE;
1913 	io->ipsec_out_multicast_loop = B_FALSE;
1914 	io->ipsec_out_dontroute = B_TRUE;
1915 	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
1916 		zoneid = GLOBAL_ZONEID;
1917 	io->ipsec_out_zoneid = zoneid;
1918 	io->ipsec_out_ns = ipst->ips_netstack;	/* No netstack_hold */
1919 
1920 	mp = allocb(size, BPRI_HI);
1921 	if (mp == NULL) {
1922 		freemsg(first_mp);
1923 		return;
1924 	}
1925 	mp->b_wptr = mp->b_rptr + size;
1926 	first_mp->b_cont = mp;
1927 
1928 	ipha = (ipha_t *)mp->b_rptr;
1929 	rtralert = (uint8_t *)&(ipha[1]);
1930 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1931 	igmpa->igmpa_type   = type;
1932 	igmpa->igmpa_code   = 0;
1933 	igmpa->igmpa_group  = ilm->ilm_addr;
1934 	igmpa->igmpa_cksum  = 0;
1935 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1936 
1937 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1938 	rtralert[1] = RTRALERT_LEN;
1939 	rtralert[2] = 0;
1940 	rtralert[3] = 0;
1941 
1942 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1943 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1944 	ipha->ipha_type_of_service 	= 0;
1945 	ipha->ipha_length = htons(size);
1946 	ipha->ipha_ident = 0;
1947 	ipha->ipha_fragment_offset_and_flags = 0;
1948 	ipha->ipha_ttl 		= IGMP_TTL;
1949 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1950 	ipha->ipha_hdr_checksum 	= 0;
1951 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1952 	ipha->ipha_src 		= ipif->ipif_src_addr;
1953 	/*
1954 	 * Request loopback of the report if we are acting as a multicast
1955 	 * router, so that the process-level routing demon can hear it.
1956 	 */
1957 	/*
1958 	 * This will run multiple times for the same group if there are members
1959 	 * on the same group for multiple ipif's on the same ill. The
1960 	 * igmp_input code will suppress this due to the loopback thus we
1961 	 * always loopback membership report.
1962 	 */
1963 	ASSERT(ill->ill_rq != NULL);
1964 	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);
1965 
1966 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
1967 
1968 	++ipst->ips_igmpstat.igps_snd_reports;
1969 }
1970 
1971 /*
1972  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
1973  * with the passed-in ipif.  The report will contain one group record
1974  * for each element of reclist.  If this causes packet length to
1975  * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
1976  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1977  * and those buffers are freed here.
1978  */
1979 static void
1980 igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist)
1981 {
1982 	ipsec_out_t *io;
1983 	igmp3ra_t *igmp3ra;
1984 	grphdra_t *grphdr;
1985 	mblk_t *first_mp, *mp;
1986 	ipha_t *ipha;
1987 	uint8_t *rtralert;
1988 	ipaddr_t *src_array;
1989 	int i, j, numrec, more_src_cnt;
1990 	size_t hdrsize, size, rsize;
1991 	ill_t *ill = ipif->ipif_ill;
1992 	mrec_t *rp, *cur_reclist;
1993 	mrec_t *next_reclist = reclist;
1994 	boolean_t morepkts;
1995 	zoneid_t zoneid;
1996 	ip_stack_t	 *ipst = ill->ill_ipst;
1997 
1998 	/* if there aren't any records, there's nothing to send */
1999 	if (reclist == NULL)
2000 		return;
2001 
2002 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
2003 nextpkt:
2004 	size = hdrsize + sizeof (igmp3ra_t);
2005 	morepkts = B_FALSE;
2006 	more_src_cnt = 0;
2007 	cur_reclist = next_reclist;
2008 	numrec = 0;
2009 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2010 		rsize = sizeof (grphdra_t) +
2011 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
2012 		if (size + rsize > ill->ill_max_frag) {
2013 			if (rp == cur_reclist) {
2014 				/*
2015 				 * If the first mrec we looked at is too big
2016 				 * to fit in a single packet (i.e the source
2017 				 * list is too big), we must either truncate
2018 				 * the list (if TO_EX or IS_EX), or send
2019 				 * multiple reports for the same group (all
2020 				 * other types).
2021 				 */
2022 				int srcspace, srcsperpkt;
2023 				srcspace = ill->ill_max_frag - (size +
2024 				    sizeof (grphdra_t));
2025 				srcsperpkt = srcspace / sizeof (ipaddr_t);
2026 				/*
2027 				 * Increment size and numrec, because we will
2028 				 * be sending a record for the mrec we're
2029 				 * looking at now.
2030 				 */
2031 				size += sizeof (grphdra_t) +
2032 				    (srcsperpkt * sizeof (ipaddr_t));
2033 				numrec++;
2034 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2035 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2036 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2037 					if (rp->mrec_next == NULL) {
2038 						/* no more packets to send */
2039 						break;
2040 					} else {
2041 						/*
2042 						 * more packets, but we're
2043 						 * done with this mrec.
2044 						 */
2045 						next_reclist = rp->mrec_next;
2046 					}
2047 				} else {
2048 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2049 					    - srcsperpkt;
2050 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2051 					/*
2052 					 * We'll fix up this mrec (remove the
2053 					 * srcs we've already sent) before
2054 					 * returning to nextpkt above.
2055 					 */
2056 					next_reclist = rp;
2057 				}
2058 			} else {
2059 				next_reclist = rp;
2060 			}
2061 			morepkts = B_TRUE;
2062 			break;
2063 		}
2064 		size += rsize;
2065 		numrec++;
2066 	}
2067 
2068 	/*
2069 	 * See comments in igmp_sendpkt() about initializing for ipsec and
2070 	 * load balancing requirements.
2071 	 */
2072 	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
2073 	if (first_mp == NULL)
2074 		goto free_reclist;
2075 
2076 	first_mp->b_datap->db_type = M_CTL;
2077 	first_mp->b_wptr += sizeof (ipsec_info_t);
2078 	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
2079 	/* ipsec_out_secure is B_FALSE now */
2080 	io = (ipsec_out_t *)first_mp->b_rptr;
2081 	io->ipsec_out_type = IPSEC_OUT;
2082 	io->ipsec_out_len = sizeof (ipsec_out_t);
2083 	io->ipsec_out_use_global_policy = B_TRUE;
2084 	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
2085 	io->ipsec_out_attach_if = B_TRUE;
2086 	io->ipsec_out_multicast_loop = B_FALSE;
2087 	io->ipsec_out_dontroute = B_TRUE;
2088 	if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES)
2089 		zoneid = GLOBAL_ZONEID;
2090 	io->ipsec_out_zoneid = zoneid;
2091 
2092 	mp = allocb(size, BPRI_HI);
2093 	if (mp == NULL) {
2094 		freemsg(first_mp);
2095 		goto free_reclist;
2096 	}
2097 	bzero((char *)mp->b_rptr, size);
2098 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
2099 	first_mp->b_cont = mp;
2100 
2101 	ipha = (ipha_t *)mp->b_rptr;
2102 	rtralert = (uint8_t *)&(ipha[1]);
2103 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
2104 	grphdr = (grphdra_t *)&(igmp3ra[1]);
2105 
2106 	rp = cur_reclist;
2107 	for (i = 0; i < numrec; i++) {
2108 		grphdr->grphdra_type = rp->mrec_type;
2109 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2110 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
2111 		src_array = (ipaddr_t *)&(grphdr[1]);
2112 
2113 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
2114 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
2115 
2116 		grphdr = (grphdra_t *)&(src_array[j]);
2117 		rp = rp->mrec_next;
2118 	}
2119 
2120 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
2121 	igmp3ra->igmp3ra_numrec = htons(numrec);
2122 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
2123 
2124 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
2125 	rtralert[1] = RTRALERT_LEN;
2126 	rtralert[2] = 0;
2127 	rtralert[3] = 0;
2128 
2129 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2130 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2131 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2132 	ipha->ipha_length = htons(size);
2133 	ipha->ipha_ttl = IGMP_TTL;
2134 	ipha->ipha_protocol = IPPROTO_IGMP;
2135 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2136 	ipha->ipha_src = ipif->ipif_src_addr;
2137 
2138 	/*
2139 	 * Request loopback of the report if we are acting as a multicast
2140 	 * router, so that the process-level routing daemon can hear it.
2141 	 *
2142 	 * This will run multiple times for the same group if there are
2143 	 * members on the same group for multiple ipifs on the same ill.
2144 	 * The igmp_input code will suppress this due to the loopback;
2145 	 * thus we always loopback membership report.
2146 	 */
2147 	ASSERT(ill->ill_rq != NULL);
2148 	ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid);
2149 
2150 	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);
2151 
2152 	++ipst->ips_igmpstat.igps_snd_reports;
2153 
2154 	if (morepkts) {
2155 		if (more_src_cnt > 0) {
2156 			int index, mvsize;
2157 			slist_t *sl = &next_reclist->mrec_srcs;
2158 			index = sl->sl_numsrc;
2159 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2160 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2161 			    mvsize);
2162 			sl->sl_numsrc = more_src_cnt;
2163 		}
2164 		goto nextpkt;
2165 	}
2166 
2167 free_reclist:
2168 	while (reclist != NULL) {
2169 		rp = reclist->mrec_next;
2170 		mi_free(reclist);
2171 		reclist = rp;
2172 	}
2173 }
2174 
2175 /*
2176  * mld_input:
2177  */
2178 /* ARGSUSED */
2179 void
2180 mld_input(queue_t *q, mblk_t *mp, ill_t *ill)
2181 {
2182 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2183 	mld_hdr_t	*mldh;
2184 	ilm_t		*ilm;
2185 	ipif_t		*ipif;
2186 	uint16_t	hdr_length, exthdr_length;
2187 	in6_addr_t	*v6group_ptr, *lcladdr_ptr;
2188 	uint_t		next;
2189 	int		mldlen;
2190 	ip_stack_t	*ipst = ill->ill_ipst;
2191 
2192 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2193 
2194 	/* Make sure the src address of the packet is link-local */
2195 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2196 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2197 		freemsg(mp);
2198 		return;
2199 	}
2200 
2201 	if (ip6h->ip6_hlim != 1) {
2202 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2203 		freemsg(mp);
2204 		return;
2205 	}
2206 
2207 	/* Get to the icmp header part */
2208 	if (ip6h->ip6_nxt != IPPROTO_ICMPV6) {
2209 		hdr_length = ip_hdr_length_v6(mp, ip6h);
2210 		exthdr_length = hdr_length - IPV6_HDR_LEN;
2211 	} else {
2212 		hdr_length = IPV6_HDR_LEN;
2213 		exthdr_length = 0;
2214 	}
2215 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2216 
2217 	/* An MLD packet must at least be 24 octets to be valid */
2218 	if (mldlen < MLD_MINLEN) {
2219 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2220 		freemsg(mp);
2221 		return;
2222 	}
2223 
2224 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2225 
2226 	switch (mldh->mld_type) {
2227 	case MLD_LISTENER_QUERY:
2228 		/*
2229 		 * packet length differentiates between v1 and v2.  v1
2230 		 * query should be exactly 24 octets long; v2 is >= 28.
2231 		 */
2232 		if ((mldlen == MLD_MINLEN) ||
2233 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2234 			next = mld_query_in(mldh, ill);
2235 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2236 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2237 		} else {
2238 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2239 			freemsg(mp);
2240 			return;
2241 		}
2242 		if (next == 0) {
2243 			freemsg(mp);
2244 			return;
2245 		}
2246 
2247 		if (next != INFINITY)
2248 			mld_start_timers(next, ipst);
2249 		break;
2250 
2251 	case MLD_LISTENER_REPORT: {
2252 
2253 		ASSERT(ill->ill_ipif != NULL);
2254 		/*
2255 		 * For fast leave to work, we have to know that we are the
2256 		 * last person to send a report for this group.  Reports
2257 		 * generated by us are looped back since we could potentially
2258 		 * be a multicast router, so discard reports sourced by me.
2259 		 */
2260 		lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet);
2261 		mutex_enter(&ill->ill_lock);
2262 		for (ipif = ill->ill_ipif; ipif != NULL;
2263 		    ipif = ipif->ipif_next) {
2264 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2265 			    lcladdr_ptr)) {
2266 				if (ip_debug > 1) {
2267 					char    buf1[INET6_ADDRSTRLEN];
2268 					char	buf2[INET6_ADDRSTRLEN];
2269 
2270 					(void) mi_strlog(ill->ill_rq,
2271 					    1,
2272 					    SL_TRACE,
2273 					    "mld_input: we are only "
2274 					    "member src %s ipif_local %s",
2275 					    inet_ntop(AF_INET6, lcladdr_ptr,
2276 					    buf1, sizeof (buf1)),
2277 					    inet_ntop(AF_INET6,
2278 					    &ipif->ipif_v6lcl_addr,
2279 					    buf2, sizeof (buf2)));
2280 				}
2281 				mutex_exit(&ill->ill_lock);
2282 				freemsg(mp);
2283 				return;
2284 			}
2285 		}
2286 		mutex_exit(&ill->ill_lock);
2287 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2288 
2289 		v6group_ptr = &mldh->mld_addr;
2290 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2291 			BUMP_MIB(ill->ill_icmp6_mib,
2292 			    ipv6IfIcmpInGroupMembBadReports);
2293 			freemsg(mp);
2294 			return;
2295 		}
2296 
2297 
2298 		/*
2299 		 * If we belong to the group being reported, and we are a
2300 		 * 'Delaying member' per the RFC terminology, stop our timer
2301 		 * for that group and 'clear flag' i.e. mark ilm_state as
2302 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2303 		 * membership entries for the same group address (one per zone)
2304 		 * so we need to walk the ill_ilm list.
2305 		 */
2306 		mutex_enter(&ill->ill_lock);
2307 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2308 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2309 				continue;
2310 			BUMP_MIB(ill->ill_icmp6_mib,
2311 			    ipv6IfIcmpInGroupMembOurReports);
2312 
2313 			ilm->ilm_timer = INFINITY;
2314 			ilm->ilm_state = IGMP_OTHERMEMBER;
2315 		}
2316 		mutex_exit(&ill->ill_lock);
2317 		break;
2318 	}
2319 	case MLD_LISTENER_REDUCTION:
2320 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2321 		break;
2322 	}
2323 	/*
2324 	 * All MLD packets have already been passed up to any
2325 	 * process(es) listening on a ICMP6 raw socket. This
2326 	 * has been accomplished in ip_deliver_local_v6 prior to
2327 	 * this function call. It is assumed that the multicast daemon
2328 	 * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the
2329 	 * ICMP6_FILTER socket option to only receive the MLD messages)
2330 	 * Thus we can free the MLD message block here
2331 	 */
2332 	freemsg(mp);
2333 }
2334 
2335 /*
2336  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2337  * (non-zero, unsigned) timer value to be set on success.
2338  */
2339 static uint_t
2340 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2341 {
2342 	ilm_t	*ilm;
2343 	int	timer;
2344 	uint_t	next, current;
2345 	in6_addr_t *v6group;
2346 
2347 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2348 
2349 	/*
2350 	 * In the MLD specification, there are 3 states and a flag.
2351 	 *
2352 	 * In Non-Listener state, we simply don't have a membership record.
2353 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2354 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2355 	 * INFINITY)
2356 	 *
2357 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2358 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2359 	 * if I sent the last report.
2360 	 */
2361 	v6group = &mldh->mld_addr;
2362 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2363 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2364 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2365 		return (0);
2366 	}
2367 
2368 	/* Need to do compatibility mode checking */
2369 	mutex_enter(&ill->ill_lock);
2370 	ill->ill_mcast_v1_time = 0;
2371 	ill->ill_mcast_v1_tset = 1;
2372 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2373 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2374 		    "MLD_V1_ROUTER\n", ill->ill_name));
2375 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2376 		ill->ill_mcast_type = MLD_V1_ROUTER;
2377 	}
2378 	mutex_exit(&ill->ill_lock);
2379 
2380 	timer = (int)ntohs(mldh->mld_maxdelay);
2381 	if (ip_debug > 1) {
2382 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2383 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2384 		    timer, (int)mldh->mld_type);
2385 	}
2386 
2387 	/*
2388 	 * -Start the timers in all of our membership records for
2389 	 * the physical interface on which the query arrived,
2390 	 * excl:
2391 	 *	1.  those that belong to the "all hosts" group,
2392 	 *	2.  those with 0 scope, or 1 node-local scope.
2393 	 *
2394 	 * -Restart any timer that is already running but has a value
2395 	 * longer that the requested timeout.
2396 	 * -Use the value specified in the query message as the
2397 	 * maximum timeout.
2398 	 */
2399 	next = INFINITY;
2400 	mutex_enter(&ill->ill_lock);
2401 
2402 	current = CURRENT_MSTIME;
2403 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2404 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2405 
2406 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2407 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2408 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2409 			continue;
2410 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2411 		    &ipv6_all_hosts_mcast)) &&
2412 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2413 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2414 			if (timer == 0) {
2415 				/* Respond immediately */
2416 				ilm->ilm_timer = INFINITY;
2417 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2418 				mutex_exit(&ill->ill_lock);
2419 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2420 				mutex_enter(&ill->ill_lock);
2421 				break;
2422 			}
2423 			if (ilm->ilm_timer > timer) {
2424 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2425 				if (ilm->ilm_timer < next)
2426 					next = ilm->ilm_timer;
2427 				ilm->ilm_timer += current;
2428 			}
2429 			break;
2430 		}
2431 	}
2432 	mutex_exit(&ill->ill_lock);
2433 
2434 	return (next);
2435 }
2436 
2437 /*
2438  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2439  * returns the appropriate (non-zero, unsigned) timer value (which may
2440  * be INFINITY) to be set.
2441  */
2442 static uint_t
2443 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2444 {
2445 	ilm_t	*ilm;
2446 	in6_addr_t *v6group, *src_array;
2447 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2448 	uint8_t	qrv;
2449 
2450 	v6group = &mld2q->mld2q_addr;
2451 	numsrc = ntohs(mld2q->mld2q_numsrc);
2452 
2453 	/* make sure numsrc matches packet size */
2454 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2455 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2456 		return (0);
2457 	}
2458 	src_array = (in6_addr_t *)&mld2q[1];
2459 
2460 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2461 
2462 	/* extract Maximum Response Delay from code in header */
2463 	mrd = ntohs(mld2q->mld2q_mxrc);
2464 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2465 		uint_t hdrval, mant, exp;
2466 		hdrval = mrd;
2467 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2468 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2469 		mrd = (mant | 0x1000) << (exp + 3);
2470 	}
2471 	if (mrd == 0)
2472 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2473 
2474 	MCAST_RANDOM_DELAY(delay, mrd);
2475 	next = (unsigned)INFINITY;
2476 	current = CURRENT_MSTIME;
2477 
2478 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2479 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2480 	else
2481 		ill->ill_mcast_rv = qrv;
2482 
2483 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2484 		uint_t mant, exp;
2485 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2486 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2487 		qqi = (mant | 0x10) << (exp + 3);
2488 	}
2489 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2490 
2491 	/*
2492 	 * If we have a pending general query response that's scheduled
2493 	 * sooner than the delay we calculated for this response, then
2494 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2495 	 */
2496 	mutex_enter(&ill->ill_lock);
2497 	if (ill->ill_global_timer < (current + delay)) {
2498 		mutex_exit(&ill->ill_lock);
2499 		return (next);
2500 	}
2501 	mutex_exit(&ill->ill_lock);
2502 
2503 	/*
2504 	 * Now take action depending on query type: general,
2505 	 * group specific, or group/source specific.
2506 	 */
2507 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2508 		/*
2509 		 * general query
2510 		 * We know global timer is either not running or is
2511 		 * greater than our calculated delay, so reset it to
2512 		 * our delay (random value in range [0, response time])
2513 		 */
2514 		mutex_enter(&ill->ill_lock);
2515 		ill->ill_global_timer = current + delay;
2516 		mutex_exit(&ill->ill_lock);
2517 		next = delay;
2518 
2519 	} else {
2520 		/* group or group/source specific query */
2521 		mutex_enter(&ill->ill_lock);
2522 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2523 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2524 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2525 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2526 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2527 				continue;
2528 
2529 			/*
2530 			 * If the query is group specific or we have a
2531 			 * pending group specific query, the response is
2532 			 * group specific (pending sources list should be
2533 			 * empty).  Otherwise, need to update the pending
2534 			 * sources list for the group and source specific
2535 			 * response.
2536 			 */
2537 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2538 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2539 group_query:
2540 				FREE_SLIST(ilm->ilm_pendsrcs);
2541 				ilm->ilm_pendsrcs = NULL;
2542 			} else {
2543 				boolean_t overflow;
2544 				slist_t *pktl;
2545 				if (numsrc > MAX_FILTER_SIZE ||
2546 				    (ilm->ilm_pendsrcs == NULL &&
2547 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2548 					/*
2549 					 * We've been sent more sources than
2550 					 * we can deal with; or we can't deal
2551 					 * with a source list at all. Revert
2552 					 * to a group specific query.
2553 					 */
2554 					goto group_query;
2555 				}
2556 				if ((pktl = l_alloc()) == NULL)
2557 					goto group_query;
2558 				pktl->sl_numsrc = numsrc;
2559 				for (i = 0; i < numsrc; i++)
2560 					pktl->sl_addr[i] = src_array[i];
2561 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2562 				    &overflow);
2563 				l_free(pktl);
2564 				if (overflow)
2565 					goto group_query;
2566 			}
2567 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2568 			    INFINITY : (ilm->ilm_timer - current);
2569 			/* set timer to soonest value */
2570 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2571 			if (ilm->ilm_timer < next)
2572 				next = ilm->ilm_timer;
2573 			ilm->ilm_timer += current;
2574 			break;
2575 		}
2576 		mutex_exit(&ill->ill_lock);
2577 	}
2578 
2579 	return (next);
2580 }
2581 
2582 /*
2583  * Send MLDv1 response packet with hoplimit 1
2584  */
2585 static void
2586 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2587 {
2588 	mblk_t		*mp;
2589 	mld_hdr_t	*mldh;
2590 	ip6_t 		*ip6h;
2591 	ip6_hbh_t	*ip6hbh;
2592 	struct ip6_opt_router	*ip6router;
2593 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2594 	ill_t		*ill = ilm->ilm_ill;   /* Will be the "lower" ill */
2595 	ipif_t		*ipif;
2596 	ip6i_t		*ip6i;
2597 
2598 	/*
2599 	 * We need to place a router alert option in this packet.  The length
2600 	 * of the options must be a multiple of 8.  The hbh option header is 2
2601 	 * bytes followed by the 4 byte router alert option.  That leaves
2602 	 * 2 bytes of pad for a total of 8 bytes.
2603 	 */
2604 	const int	router_alert_length = 8;
2605 
2606 	ASSERT(ill->ill_isv6);
2607 
2608 	/*
2609 	 * We need to make sure that this packet does not get load balanced.
2610 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2611 	 * ip_newroute_ipif_v6 knows how to handle such packets.
2612 	 * If it gets load balanced, switches supporting MLD snooping
2613 	 * (in the future) will send the packet that it receives for this
2614 	 * multicast group to the interface that we are sending on. As we have
2615 	 * joined the multicast group on this ill, by sending the packet out
2616 	 * on this ill, we receive all the packets back on this ill.
2617 	 */
2618 	size += sizeof (ip6i_t) + router_alert_length;
2619 	mp = allocb(size, BPRI_HI);
2620 	if (mp == NULL)
2621 		return;
2622 	bzero(mp->b_rptr, size);
2623 	mp->b_wptr = mp->b_rptr + size;
2624 
2625 	ip6i = (ip6i_t *)mp->b_rptr;
2626 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2627 	ip6i->ip6i_nxt = IPPROTO_RAW;
2628 	ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT;
2629 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2630 
2631 	ip6h = (ip6_t *)&ip6i[1];
2632 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2633 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2634 	/*
2635 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2636 	 * above will pad between ip6router and mld.
2637 	 */
2638 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2639 
2640 	mldh->mld_type = type;
2641 	mldh->mld_addr = ilm->ilm_v6addr;
2642 
2643 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2644 	ip6router->ip6or_len = 2;
2645 	ip6router->ip6or_value[0] = 0;
2646 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2647 
2648 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2649 	ip6hbh->ip6h_len = 0;
2650 
2651 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2652 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2653 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2654 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2655 	if (v6addr == NULL)
2656 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2657 	else
2658 		ip6h->ip6_dst = *v6addr;
2659 
2660 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2661 	if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) {
2662 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2663 		ipif_refrele(ipif);
2664 	} else {
2665 		/* Otherwise, use IPv6 default address selection. */
2666 		ip6h->ip6_src = ipv6_all_zeros;
2667 	}
2668 
2669 	/*
2670 	 * Prepare for checksum by putting icmp length in the icmp
2671 	 * checksum field. The checksum is calculated in ip_wput_v6.
2672 	 */
2673 	mldh->mld_cksum = htons(sizeof (*mldh));
2674 
2675 	/*
2676 	 * ip_wput will automatically loopback the multicast packet to
2677 	 * the conn if multicast loopback is enabled.
2678 	 * The MIB stats corresponding to this outgoing MLD packet
2679 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2680 	 * ->icmp_update_out_mib_v6 function call.
2681 	 */
2682 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2683 }
2684 
2685 /*
2686  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2687  * report will contain one multicast address record for each element of
2688  * reclist.  If this causes packet length to exceed ill->ill_max_frag,
2689  * multiple reports are sent.  reclist is assumed to be made up of
2690  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2691  */
2692 static void
2693 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2694 {
2695 	mblk_t		*mp;
2696 	mld2r_t		*mld2r;
2697 	mld2mar_t	*mld2mar;
2698 	in6_addr_t	*srcarray;
2699 	ip6_t		*ip6h;
2700 	ip6_hbh_t	*ip6hbh;
2701 	ip6i_t		*ip6i;
2702 	struct ip6_opt_router	*ip6router;
2703 	size_t		size, optlen, padlen, icmpsize, rsize;
2704 	ipif_t		*ipif;
2705 	int		i, numrec, more_src_cnt;
2706 	mrec_t		*rp, *cur_reclist;
2707 	mrec_t		*next_reclist = reclist;
2708 	boolean_t	morepkts;
2709 
2710 	/* If there aren't any records, there's nothing to send */
2711 	if (reclist == NULL)
2712 		return;
2713 
2714 	ASSERT(ill->ill_isv6);
2715 
2716 	/*
2717 	 * Total option length (optlen + padlen) must be a multiple of
2718 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2719 	 * length will be 8.  Assert this in case anything ever changes.
2720 	 */
2721 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2722 	ASSERT(optlen <= 8);
2723 	padlen = 8 - optlen;
2724 nextpkt:
2725 	icmpsize = sizeof (mld2r_t);
2726 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2727 	morepkts = B_FALSE;
2728 	more_src_cnt = 0;
2729 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2730 	    rp = rp->mrec_next, numrec++) {
2731 		rsize = sizeof (mld2mar_t) +
2732 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2733 		if (size + rsize > ill->ill_max_frag) {
2734 			if (rp == cur_reclist) {
2735 				/*
2736 				 * If the first mrec we looked at is too big
2737 				 * to fit in a single packet (i.e the source
2738 				 * list is too big), we must either truncate
2739 				 * the list (if TO_EX or IS_EX), or send
2740 				 * multiple reports for the same group (all
2741 				 * other types).
2742 				 */
2743 				int srcspace, srcsperpkt;
2744 				srcspace = ill->ill_max_frag -
2745 				    (size + sizeof (mld2mar_t));
2746 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2747 				/*
2748 				 * Increment icmpsize and size, because we will
2749 				 * be sending a record for the mrec we're
2750 				 * looking at now.
2751 				 */
2752 				rsize = sizeof (mld2mar_t) +
2753 				    (srcsperpkt * sizeof (in6_addr_t));
2754 				icmpsize += rsize;
2755 				size += rsize;
2756 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2757 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2758 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2759 					if (rp->mrec_next == NULL) {
2760 						/* no more packets to send */
2761 						break;
2762 					} else {
2763 						/*
2764 						 * more packets, but we're
2765 						 * done with this mrec.
2766 						 */
2767 						next_reclist = rp->mrec_next;
2768 					}
2769 				} else {
2770 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2771 					    - srcsperpkt;
2772 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2773 					/*
2774 					 * We'll fix up this mrec (remove the
2775 					 * srcs we've already sent) before
2776 					 * returning to nextpkt above.
2777 					 */
2778 					next_reclist = rp;
2779 				}
2780 			} else {
2781 				next_reclist = rp;
2782 			}
2783 			morepkts = B_TRUE;
2784 			break;
2785 		}
2786 		icmpsize += rsize;
2787 		size += rsize;
2788 	}
2789 
2790 	/*
2791 	 * We need to make sure that this packet does not get load balanced.
2792 	 * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and
2793 	 * ip_newroute_ipif_v6 know how to handle such packets.
2794 	 * If it gets load balanced, switches supporting MLD snooping
2795 	 * (in the future) will send the packet that it receives for this
2796 	 * multicast group to the interface that we are sending on. As we have
2797 	 * joined the multicast group on this ill, by sending the packet out
2798 	 * on this ill, we receive all the packets back on this ill.
2799 	 */
2800 	size += sizeof (ip6i_t);
2801 	mp = allocb(size, BPRI_HI);
2802 	if (mp == NULL)
2803 		goto free_reclist;
2804 	bzero(mp->b_rptr, size);
2805 	mp->b_wptr = mp->b_rptr + size;
2806 
2807 	ip6i = (ip6i_t *)mp->b_rptr;
2808 	ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2809 	ip6i->ip6i_nxt = IPPROTO_RAW;
2810 	ip6i->ip6i_flags = IP6I_ATTACH_IF;
2811 	ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex;
2812 
2813 	ip6h = (ip6_t *)&(ip6i[1]);
2814 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2815 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2816 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2817 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2818 
2819 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2820 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2821 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2822 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2823 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2824 	/* ipif returned by ipif_lookup_zoneid is link-local (if present) */
2825 	if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) {
2826 		ip6h->ip6_src = ipif->ipif_v6src_addr;
2827 		ipif_refrele(ipif);
2828 	} else {
2829 		/* otherwise, use IPv6 default address selection. */
2830 		ip6h->ip6_src = ipv6_all_zeros;
2831 	}
2832 
2833 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2834 	/*
2835 	 * ip6h_len is the number of 8-byte words, not including the first
2836 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2837 	 */
2838 	ip6hbh->ip6h_len = 0;
2839 
2840 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2841 	ip6router->ip6or_len = 2;
2842 	ip6router->ip6or_value[0] = 0;
2843 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2844 
2845 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2846 	mld2r->mld2r_nummar = htons(numrec);
2847 	/*
2848 	 * Prepare for the checksum by putting icmp length in the icmp
2849 	 * checksum field. The checksum is calculated in ip_wput_v6.
2850 	 */
2851 	mld2r->mld2r_cksum = htons(icmpsize);
2852 
2853 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2854 		mld2mar->mld2mar_type = rp->mrec_type;
2855 		mld2mar->mld2mar_auxlen = 0;
2856 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2857 		mld2mar->mld2mar_group = rp->mrec_group;
2858 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2859 
2860 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2861 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2862 
2863 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2864 	}
2865 
2866 	/*
2867 	 * ip_wput will automatically loopback the multicast packet to
2868 	 * the conn if multicast loopback is enabled.
2869 	 * The MIB stats corresponding to this outgoing MLD packet
2870 	 * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6
2871 	 * ->icmp_update_out_mib_v6 function call.
2872 	 */
2873 	(void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT);
2874 
2875 	if (morepkts) {
2876 		if (more_src_cnt > 0) {
2877 			int index, mvsize;
2878 			slist_t *sl = &next_reclist->mrec_srcs;
2879 			index = sl->sl_numsrc;
2880 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2881 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2882 			    mvsize);
2883 			sl->sl_numsrc = more_src_cnt;
2884 		}
2885 		goto nextpkt;
2886 	}
2887 
2888 free_reclist:
2889 	while (reclist != NULL) {
2890 		rp = reclist->mrec_next;
2891 		mi_free(reclist);
2892 		reclist = rp;
2893 	}
2894 }
2895 
2896 static mrec_t *
2897 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2898     mrec_t *next)
2899 {
2900 	mrec_t *rp;
2901 	int i;
2902 
2903 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2904 	    SLIST_IS_EMPTY(srclist))
2905 		return (next);
2906 
2907 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2908 	if (rp == NULL)
2909 		return (next);
2910 
2911 	rp->mrec_next = next;
2912 	rp->mrec_type = type;
2913 	rp->mrec_auxlen = 0;
2914 	rp->mrec_group = *grp;
2915 	if (srclist == NULL) {
2916 		rp->mrec_srcs.sl_numsrc = 0;
2917 	} else {
2918 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2919 		for (i = 0; i < srclist->sl_numsrc; i++)
2920 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2921 	}
2922 
2923 	return (rp);
2924 }
2925 
2926 /*
2927  * Set up initial retransmit state.  If memory cannot be allocated for
2928  * the source lists, simply create as much state as is possible; memory
2929  * allocation failures are considered one type of transient error that
2930  * the retransmissions are designed to overcome (and if they aren't
2931  * transient, there are bigger problems than failing to notify the
2932  * router about multicast group membership state changes).
2933  */
2934 static void
2935 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2936     slist_t *flist)
2937 {
2938 	/*
2939 	 * There are only three possibilities for rtype:
2940 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2941 	 *	  => rtype is ALLOW_NEW_SOURCES
2942 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2943 	 *	  => rtype is CHANGE_TO_EXCLUDE
2944 	 *	State change that involves a filter mode change
2945 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2946 	 */
2947 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2948 	    rtype == ALLOW_NEW_SOURCES);
2949 
2950 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2951 
2952 	switch (rtype) {
2953 	case CHANGE_TO_EXCLUDE:
2954 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2955 		CLEAR_SLIST(rtxp->rtx_allow);
2956 		COPY_SLIST(flist, rtxp->rtx_block);
2957 		break;
2958 	case ALLOW_NEW_SOURCES:
2959 	case CHANGE_TO_INCLUDE:
2960 		rtxp->rtx_fmode_cnt =
2961 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2962 		CLEAR_SLIST(rtxp->rtx_block);
2963 		COPY_SLIST(flist, rtxp->rtx_allow);
2964 		break;
2965 	}
2966 }
2967 
2968 /*
2969  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2970  * RFC 3376 section 5.1, covers three cases:
2971  *	* The current state change is a filter mode change
2972  *		Set filter mode retransmit counter; set retransmit allow or
2973  *		block list to new source list as appropriate, and clear the
2974  *		retransmit list that was not set; send TO_IN or TO_EX with
2975  *		new source list.
2976  *	* The current state change is a source list change, but the filter
2977  *	  mode retransmit counter is > 0
2978  *		Decrement filter mode retransmit counter; set retransmit
2979  *		allow or block list to  new source list as appropriate,
2980  *		and clear the retransmit list that was not set; send TO_IN
2981  *		or TO_EX with new source list.
2982  *	* The current state change is a source list change, and the filter
2983  *	  mode retransmit counter is 0.
2984  *		Merge existing rtx allow and block lists with new state:
2985  *		  rtx_allow = (new allow + rtx_allow) - new block
2986  *		  rtx_block = (new block + rtx_block) - new allow
2987  *		Send ALLOW and BLOCK records for new retransmit lists;
2988  *		decrement retransmit counter.
2989  *
2990  * As is the case for mcast_init_rtx(), memory allocation failures are
2991  * acceptable; we just create as much state as we can.
2992  */
2993 static mrec_t *
2994 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2995 {
2996 	ill_t *ill;
2997 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2998 	mcast_record_t txtype;
2999 	mrec_t *rp, *rpnext, *rtnmrec;
3000 	boolean_t ovf;
3001 
3002 	ill = (ilm->ilm_ill == NULL ? ilm->ilm_ipif->ipif_ill : ilm->ilm_ill);
3003 
3004 	if (mreclist == NULL)
3005 		return (mreclist);
3006 
3007 	/*
3008 	 * A filter mode change is indicated by a single mrec, which is
3009 	 * either TO_IN or TO_EX.  In this case, we just need to set new
3010 	 * retransmit state as if this were an initial join.  There is
3011 	 * no change to the mrec list.
3012 	 */
3013 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
3014 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
3015 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
3016 		    &mreclist->mrec_srcs);
3017 		return (mreclist);
3018 	}
3019 
3020 	/*
3021 	 * Only the source list has changed
3022 	 */
3023 	rtxp->rtx_cnt = ill->ill_mcast_rv;
3024 	if (rtxp->rtx_fmode_cnt > 0) {
3025 		/* but we're still sending filter mode change reports */
3026 		rtxp->rtx_fmode_cnt--;
3027 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
3028 			CLEAR_SLIST(rtxp->rtx_block);
3029 			COPY_SLIST(flist, rtxp->rtx_allow);
3030 			txtype = CHANGE_TO_INCLUDE;
3031 		} else {
3032 			CLEAR_SLIST(rtxp->rtx_allow);
3033 			COPY_SLIST(flist, rtxp->rtx_block);
3034 			txtype = CHANGE_TO_EXCLUDE;
3035 		}
3036 		/* overwrite first mrec with new info */
3037 		mreclist->mrec_type = txtype;
3038 		l_copy(flist, &mreclist->mrec_srcs);
3039 		/* then free any remaining mrecs */
3040 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
3041 			rpnext = rp->mrec_next;
3042 			mi_free(rp);
3043 		}
3044 		mreclist->mrec_next = NULL;
3045 		rtnmrec = mreclist;
3046 	} else {
3047 		mrec_t *allow_mrec, *block_mrec;
3048 		/*
3049 		 * Just send the source change reports; but we need to
3050 		 * recalculate the ALLOW and BLOCK lists based on previous
3051 		 * state and new changes.
3052 		 */
3053 		rtnmrec = mreclist;
3054 		allow_mrec = block_mrec = NULL;
3055 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
3056 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
3057 			    rp->mrec_type == BLOCK_OLD_SOURCES);
3058 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
3059 				allow_mrec = rp;
3060 			else
3061 				block_mrec = rp;
3062 		}
3063 		/*
3064 		 * Perform calculations:
3065 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
3066 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
3067 		 *
3068 		 * Each calc requires two steps, for example:
3069 		 *   rtx_allow = rtx_allow - mrec_block;
3070 		 *   new_allow = mrec_allow + rtx_allow;
3071 		 *
3072 		 * Store results in mrec lists, and then copy into rtx lists.
3073 		 * We do it in this order in case the rtx list hasn't been
3074 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
3075 		 * Overflows are also okay.
3076 		 */
3077 		if (block_mrec != NULL) {
3078 			l_difference_in_a(rtxp->rtx_allow,
3079 			    &block_mrec->mrec_srcs);
3080 		}
3081 		if (allow_mrec != NULL) {
3082 			l_difference_in_a(rtxp->rtx_block,
3083 			    &allow_mrec->mrec_srcs);
3084 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
3085 			    &ovf);
3086 		}
3087 		if (block_mrec != NULL) {
3088 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
3089 			    &ovf);
3090 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
3091 		} else {
3092 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
3093 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
3094 		}
3095 		if (allow_mrec != NULL) {
3096 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
3097 		} else {
3098 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
3099 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
3100 		}
3101 	}
3102 
3103 	return (rtnmrec);
3104 }
3105