xref: /illumos-gate/usr/src/uts/common/inet/ip/igmp.c (revision b1d7ec75953cd517f5b7c3d9cb427ff8ec5d7d07)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 /* Copyright (c) 1990 Mentat Inc. */
26 
27 /*
28  * Internet Group Management Protocol (IGMP) routines.
29  * Multicast Listener Discovery Protocol (MLD) routines.
30  *
31  * Written by Steve Deering, Stanford, May 1988.
32  * Modified by Rosen Sharma, Stanford, Aug 1994.
33  * Modified by Bill Fenner, Xerox PARC, Feb. 1995.
34  *
35  * MULTICAST 3.5.1.1
36  */
37 
38 #include <sys/types.h>
39 #include <sys/stream.h>
40 #include <sys/stropts.h>
41 #include <sys/strlog.h>
42 #include <sys/strsun.h>
43 #include <sys/systm.h>
44 #include <sys/ddi.h>
45 #include <sys/sunddi.h>
46 #include <sys/cmn_err.h>
47 #include <sys/atomic.h>
48 #include <sys/zone.h>
49 #include <sys/callb.h>
50 #include <sys/param.h>
51 #include <sys/socket.h>
52 #include <inet/ipclassifier.h>
53 #include <net/if.h>
54 #include <net/route.h>
55 #include <netinet/in.h>
56 #include <netinet/igmp_var.h>
57 #include <netinet/ip6.h>
58 #include <netinet/icmp6.h>
59 #include <inet/ipsec_impl.h>
60 
61 #include <inet/common.h>
62 #include <inet/mi.h>
63 #include <inet/nd.h>
64 #include <inet/tunables.h>
65 #include <inet/ip.h>
66 #include <inet/ip6.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip_listutils.h>
69 
70 #include <netinet/igmp.h>
71 #include <inet/ip_ndp.h>
72 #include <inet/ip_if.h>
73 
74 static uint_t	igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill);
75 static uint_t	igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen);
76 static uint_t	mld_query_in(mld_hdr_t *mldh, ill_t *ill);
77 static uint_t	mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen);
78 static void	igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr);
79 static void	mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr);
80 static void	igmpv3_sendrpt(ill_t *ill, mrec_t *reclist);
81 static void	mldv2_sendrpt(ill_t *ill, mrec_t *reclist);
82 static mrec_t	*mcast_bldmrec(mcast_record_t type, in6_addr_t *grp,
83 		    slist_t *srclist, mrec_t *next);
84 static void	mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp,
85 		    mcast_record_t rtype, slist_t *flist);
86 static mrec_t	*mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist);
87 
88 /*
89  * Macros used to do timer len conversions.  Timer values are always
90  * stored and passed to the timer functions as milliseconds; but the
91  * default values and values from the wire may not be.
92  *
93  * And yes, it's obscure, but decisecond is easier to abbreviate than
94  * "tenths of a second".
95  */
/* Deciseconds (tenths of a second) to milliseconds. */
#define	DSEC_TO_MSEC(ds)	((ds) * 100)
/* Seconds to milliseconds. */
#define	SEC_TO_MSEC(s)		((s) * 1000)
98 
99 /*
100  * A running timer (scheduled thru timeout) can be cancelled if another
101  * timer with a shorter timeout value is scheduled before it has timed
102  * out.  When the shorter timer expires, the original timer is updated
103  * to account for the time elapsed while the shorter timer ran; but this
104  * does not take into account the amount of time already spent in timeout
105  * state before being preempted by the shorter timer, that is the time
106  * interval between time scheduled to time cancelled.  This can cause
107  * delays in sending out multicast membership reports.  To resolve this
108  * problem, wallclock time (absolute time) is used instead of deltas
109  * (relative time) to track timers.
110  *
111  * The MACRO below gets the lbolt value, used for proper timer scheduling
112  * and firing. Therefore multicast membership reports are sent on time.
113  * The timer does not exactly fire at the time it was scheduled to fire,
114  * there is a difference of a few milliseconds observed. An offset is used
115  * to take care of the difference.
116  */
117 
/* The current lbolt value, converted from ticks to milliseconds. */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))

/* Allowance for a timer firing slightly off its scheduled time. */
#define	CURRENT_OFFSET	(999)
120 
121 /*
122  * The first multicast join will trigger the igmp timers / mld timers
123  * The unit for next is milliseconds.
124  */
125 void
126 igmp_start_timers(unsigned next, ip_stack_t *ipst)
127 {
128 	int	time_left;
129 	int	ret;
130 	timeout_id_t id;
131 
132 	ASSERT(next != 0 && next != INFINITY);
133 
134 	mutex_enter(&ipst->ips_igmp_timer_lock);
135 
136 	if (ipst->ips_igmp_timer_setter_active) {
137 		/*
138 		 * Serialize timer setters, one at a time. If the
139 		 * timer is currently being set by someone,
140 		 * just record the next time when it has to be
141 		 * invoked and return. The current setter will
142 		 * take care.
143 		 */
144 		ipst->ips_igmp_time_to_next =
145 		    MIN(ipst->ips_igmp_time_to_next, next);
146 		mutex_exit(&ipst->ips_igmp_timer_lock);
147 		return;
148 	} else {
149 		ipst->ips_igmp_timer_setter_active = B_TRUE;
150 	}
151 	if (ipst->ips_igmp_timeout_id == 0) {
152 		/*
153 		 * The timer is inactive. We need to start a timer
154 		 */
155 		ipst->ips_igmp_time_to_next = next;
156 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
157 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
158 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
159 		ipst->ips_igmp_timer_setter_active = B_FALSE;
160 		mutex_exit(&ipst->ips_igmp_timer_lock);
161 		return;
162 	}
163 
164 	/*
165 	 * The timer was scheduled sometime back for firing in
166 	 * 'igmp_time_to_next' ms and is active. We need to
167 	 * reschedule the timeout if the new 'next' will happen
168 	 * earlier than the currently scheduled timeout
169 	 */
170 	time_left = ipst->ips_igmp_timer_scheduled_last +
171 	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
172 	if (time_left < MSEC_TO_TICK(next)) {
173 		ipst->ips_igmp_timer_setter_active = B_FALSE;
174 		mutex_exit(&ipst->ips_igmp_timer_lock);
175 		return;
176 	}
177 	id = ipst->ips_igmp_timeout_id;
178 
179 	mutex_exit(&ipst->ips_igmp_timer_lock);
180 	ret = untimeout(id);
181 	mutex_enter(&ipst->ips_igmp_timer_lock);
182 	/*
183 	 * The timeout was cancelled, or the timeout handler
184 	 * completed, while we were blocked in the untimeout.
185 	 * No other thread could have set the timer meanwhile
186 	 * since we serialized all the timer setters. Thus
187 	 * no timer is currently active nor executing nor will
188 	 * any timer fire in the future. We start the timer now
189 	 * if needed.
190 	 */
191 	if (ret == -1) {
192 		ASSERT(ipst->ips_igmp_timeout_id == 0);
193 	} else {
194 		ASSERT(ipst->ips_igmp_timeout_id != 0);
195 		ipst->ips_igmp_timeout_id = 0;
196 	}
197 	if (ipst->ips_igmp_time_to_next != 0) {
198 		ipst->ips_igmp_time_to_next =
199 		    MIN(ipst->ips_igmp_time_to_next, next);
200 		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
201 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
202 		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
203 	}
204 	ipst->ips_igmp_timer_setter_active = B_FALSE;
205 	mutex_exit(&ipst->ips_igmp_timer_lock);
206 }
207 
208 /*
209  * mld_start_timers:
210  * The unit for next is milliseconds.
211  */
212 void
213 mld_start_timers(unsigned next, ip_stack_t *ipst)
214 {
215 	int	time_left;
216 	int	ret;
217 	timeout_id_t id;
218 
219 	ASSERT(next != 0 && next != INFINITY);
220 
221 	mutex_enter(&ipst->ips_mld_timer_lock);
222 	if (ipst->ips_mld_timer_setter_active) {
223 		/*
224 		 * Serialize timer setters, one at a time. If the
225 		 * timer is currently being set by someone,
226 		 * just record the next time when it has to be
227 		 * invoked and return. The current setter will
228 		 * take care.
229 		 */
230 		ipst->ips_mld_time_to_next =
231 		    MIN(ipst->ips_mld_time_to_next, next);
232 		mutex_exit(&ipst->ips_mld_timer_lock);
233 		return;
234 	} else {
235 		ipst->ips_mld_timer_setter_active = B_TRUE;
236 	}
237 	if (ipst->ips_mld_timeout_id == 0) {
238 		/*
239 		 * The timer is inactive. We need to start a timer
240 		 */
241 		ipst->ips_mld_time_to_next = next;
242 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
243 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
244 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
245 		ipst->ips_mld_timer_setter_active = B_FALSE;
246 		mutex_exit(&ipst->ips_mld_timer_lock);
247 		return;
248 	}
249 
250 	/*
251 	 * The timer was scheduled sometime back for firing in
252 	 * 'igmp_time_to_next' ms and is active. We need to
253 	 * reschedule the timeout if the new 'next' will happen
254 	 * earlier than the currently scheduled timeout
255 	 */
256 	time_left = ipst->ips_mld_timer_scheduled_last +
257 	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
258 	if (time_left < MSEC_TO_TICK(next)) {
259 		ipst->ips_mld_timer_setter_active = B_FALSE;
260 		mutex_exit(&ipst->ips_mld_timer_lock);
261 		return;
262 	}
263 	id = ipst->ips_mld_timeout_id;
264 
265 	mutex_exit(&ipst->ips_mld_timer_lock);
266 	ret = untimeout(id);
267 	mutex_enter(&ipst->ips_mld_timer_lock);
268 	/*
269 	 * The timeout was cancelled, or the timeout handler
270 	 * completed, while we were blocked in the untimeout.
271 	 * No other thread could have set the timer meanwhile
272 	 * since we serialized all the timer setters. Thus
273 	 * no timer is currently active nor executing nor will
274 	 * any timer fire in the future. We start the timer now
275 	 * if needed.
276 	 */
277 	if (ret == -1) {
278 		ASSERT(ipst->ips_mld_timeout_id == 0);
279 	} else {
280 		ASSERT(ipst->ips_mld_timeout_id != 0);
281 		ipst->ips_mld_timeout_id = 0;
282 	}
283 	if (ipst->ips_mld_time_to_next != 0) {
284 		ipst->ips_mld_time_to_next =
285 		    MIN(ipst->ips_mld_time_to_next, next);
286 		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
287 		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
288 		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
289 	}
290 	ipst->ips_mld_timer_setter_active = B_FALSE;
291 	mutex_exit(&ipst->ips_mld_timer_lock);
292 }
293 
294 /*
295  * igmp_input:
296  * Return NULL for a bad packet that is discarded here.
297  * Return mp if the message is OK and should be handed to "raw" receivers.
298  * Callers of igmp_input() may need to reinitialize variables that were copied
299  * from the mblk as this calls pullupmsg().
300  */
301 mblk_t *
302 igmp_input(mblk_t *mp, ip_recv_attr_t *ira)
303 {
304 	igmpa_t 	*igmpa;
305 	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
306 	int		iphlen, igmplen, mblklen;
307 	ilm_t 		*ilm;
308 	uint32_t	src, dst;
309 	uint32_t 	group;
310 	in6_addr_t	v6group;
311 	uint_t		next;
312 	ipif_t 		*ipif;
313 	ill_t		*ill = ira->ira_ill;
314 	ip_stack_t	*ipst = ill->ill_ipst;
315 
316 	ASSERT(!ill->ill_isv6);
317 	++ipst->ips_igmpstat.igps_rcv_total;
318 
319 	mblklen = MBLKL(mp);
320 	iphlen = ira->ira_ip_hdr_length;
321 	if (mblklen < 1 || mblklen < iphlen) {
322 		++ipst->ips_igmpstat.igps_rcv_tooshort;
323 		goto bad_pkt;
324 	}
325 	igmplen = ira->ira_pktlen - iphlen;
326 	/*
327 	 * Since msg sizes are more variable with v3, just pullup the
328 	 * whole thing now.
329 	 */
330 	if (MBLKL(mp) < (igmplen + iphlen)) {
331 		mblk_t *mp1;
332 		if ((mp1 = msgpullup(mp, -1)) == NULL) {
333 			++ipst->ips_igmpstat.igps_rcv_tooshort;
334 			goto bad_pkt;
335 		}
336 		freemsg(mp);
337 		mp = mp1;
338 		ipha = (ipha_t *)(mp->b_rptr);
339 	}
340 
341 	/*
342 	 * Validate lengths
343 	 */
344 	if (igmplen < IGMP_MINLEN) {
345 		++ipst->ips_igmpstat.igps_rcv_tooshort;
346 		goto bad_pkt;
347 	}
348 
349 	igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]);
350 	src = ipha->ipha_src;
351 	dst = ipha->ipha_dst;
352 	if (ip_debug > 1)
353 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
354 		    "igmp_input: src 0x%x, dst 0x%x on %s\n",
355 		    (int)ntohl(src), (int)ntohl(dst),
356 		    ill->ill_name);
357 
358 	switch (igmpa->igmpa_type) {
359 	case IGMP_MEMBERSHIP_QUERY:
360 		/*
361 		 * packet length differentiates between v1/v2 and v3
362 		 * v1/v2 should be exactly 8 octets long; v3 is >= 12
363 		 */
364 		if ((igmplen == IGMP_MINLEN) ||
365 		    (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) {
366 			next = igmp_query_in(ipha, igmpa, ill);
367 		} else if (igmplen >= IGMP_V3_QUERY_MINLEN) {
368 			next = igmpv3_query_in((igmp3qa_t *)igmpa, ill,
369 			    igmplen);
370 		} else {
371 			++ipst->ips_igmpstat.igps_rcv_tooshort;
372 			goto bad_pkt;
373 		}
374 		if (next == 0)
375 			goto bad_pkt;
376 
377 		if (next != INFINITY)
378 			igmp_start_timers(next, ipst);
379 
380 		break;
381 
382 	case IGMP_V1_MEMBERSHIP_REPORT:
383 	case IGMP_V2_MEMBERSHIP_REPORT:
384 		/*
385 		 * For fast leave to work, we have to know that we are the
386 		 * last person to send a report for this group. Reports
387 		 * generated by us are looped back since we could potentially
388 		 * be a multicast router, so discard reports sourced by me.
389 		 */
390 		mutex_enter(&ill->ill_lock);
391 		for (ipif = ill->ill_ipif; ipif != NULL;
392 		    ipif = ipif->ipif_next) {
393 			if (ipif->ipif_lcl_addr == src) {
394 				if (ip_debug > 1) {
395 					(void) mi_strlog(ill->ill_rq,
396 					    1,
397 					    SL_TRACE,
398 					    "igmp_input: we are only "
399 					    "member src 0x%x\n",
400 					    (int)ntohl(src));
401 				}
402 				mutex_exit(&ill->ill_lock);
403 				return (mp);
404 			}
405 		}
406 		mutex_exit(&ill->ill_lock);
407 
408 		++ipst->ips_igmpstat.igps_rcv_reports;
409 		group = igmpa->igmpa_group;
410 		if (!CLASSD(group)) {
411 			++ipst->ips_igmpstat.igps_rcv_badreports;
412 			goto bad_pkt;
413 		}
414 
415 		/*
416 		 * KLUDGE: if the IP source address of the report has an
417 		 * unspecified (i.e., zero) subnet number, as is allowed for
418 		 * a booting host, replace it with the correct subnet number
419 		 * so that a process-level multicast routing demon can
420 		 * determine which subnet it arrived from.  This is necessary
421 		 * to compensate for the lack of any way for a process to
422 		 * determine the arrival interface of an incoming packet.
423 		 *
424 		 * Requires that a copy of *this* message it passed up
425 		 * to the raw interface which is done by our caller.
426 		 */
427 		if ((src & htonl(0xFF000000U)) == 0) {	/* Minimum net mask */
428 			/* Pick the first ipif on this ill */
429 			mutex_enter(&ill->ill_lock);
430 			src = ill->ill_ipif->ipif_subnet;
431 			mutex_exit(&ill->ill_lock);
432 			ip1dbg(("igmp_input: changed src to 0x%x\n",
433 			    (int)ntohl(src)));
434 			ipha->ipha_src = src;
435 		}
436 
437 		/*
438 		 * If our ill has ILMs that belong to the group being
439 		 * reported, and we are a 'Delaying Member' in the RFC
440 		 * terminology, stop our timer for that group and 'clear
441 		 * flag' i.e. mark as IGMP_OTHERMEMBER.
442 		 */
443 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
444 		IN6_IPADDR_TO_V4MAPPED(group, &v6group);
445 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
446 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &v6group))
447 				continue;
448 
449 			++ipst->ips_igmpstat.igps_rcv_ourreports;
450 			ilm->ilm_timer = INFINITY;
451 			ilm->ilm_state = IGMP_OTHERMEMBER;
452 		} /* for */
453 		rw_exit(&ill->ill_mcast_lock);
454 		ill_mcast_timer_start(ill->ill_ipst);
455 		break;
456 
457 	case IGMP_V3_MEMBERSHIP_REPORT:
458 		/*
459 		 * Currently nothing to do here; IGMP router is not
460 		 * implemented in ip, and v3 hosts don't pay attention
461 		 * to membership reports.
462 		 */
463 		break;
464 	}
465 	/*
466 	 * Pass all valid IGMP packets up to any process(es) listening
467 	 * on a raw IGMP socket. Do not free the packet.
468 	 */
469 	return (mp);
470 
471 bad_pkt:
472 	freemsg(mp);
473 	return (NULL);
474 }
475 
476 static uint_t
477 igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill)
478 {
479 	ilm_t	*ilm;
480 	int	timer;
481 	uint_t	next, current;
482 	ip_stack_t	 *ipst;
483 
484 	ipst = ill->ill_ipst;
485 	++ipst->ips_igmpstat.igps_rcv_queries;
486 
487 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
488 	/*
489 	 * In the IGMPv2 specification, there are 3 states and a flag.
490 	 *
491 	 * In Non-Member state, we simply don't have a membership record.
492 	 * In Delaying Member state, our timer is running (ilm->ilm_timer
493 	 * < INFINITY).  In Idle Member state, our timer is not running
494 	 * (ilm->ilm_timer == INFINITY).
495 	 *
496 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
497 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
498 	 * if I sent the last report.
499 	 */
500 	if ((igmpa->igmpa_code == 0) ||
501 	    (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) {
502 		/*
503 		 * Query from an old router.
504 		 * Remember that the querier on this interface is old,
505 		 * and set the timer to the value in RFC 1112.
506 		 */
507 		ill->ill_mcast_v1_time = 0;
508 		ill->ill_mcast_v1_tset = 1;
509 		if (ill->ill_mcast_type != IGMP_V1_ROUTER) {
510 			ip1dbg(("Received IGMPv1 Query on %s, switching mode "
511 			    "to IGMP_V1_ROUTER\n", ill->ill_name));
512 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
513 			ill->ill_mcast_type = IGMP_V1_ROUTER;
514 		}
515 
516 		timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY);
517 
518 		if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) ||
519 		    igmpa->igmpa_group != 0) {
520 			++ipst->ips_igmpstat.igps_rcv_badqueries;
521 			rw_exit(&ill->ill_mcast_lock);
522 			ill_mcast_timer_start(ill->ill_ipst);
523 			return (0);
524 		}
525 
526 	} else {
527 		in_addr_t group;
528 
529 		/*
530 		 * Query from a new router
531 		 * Simply do a validity check
532 		 */
533 		group = igmpa->igmpa_group;
534 		if (group != 0 && (!CLASSD(group))) {
535 			++ipst->ips_igmpstat.igps_rcv_badqueries;
536 			rw_exit(&ill->ill_mcast_lock);
537 			ill_mcast_timer_start(ill->ill_ipst);
538 			return (0);
539 		}
540 
541 		/*
542 		 * Switch interface state to v2 on receipt of a v2 query
543 		 * ONLY IF current state is v3.  Let things be if current
544 		 * state if v1 but do reset the v2-querier-present timer.
545 		 */
546 		if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
547 			ip1dbg(("Received IGMPv2 Query on %s, switching mode "
548 			    "to IGMP_V2_ROUTER", ill->ill_name));
549 			atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1);
550 			ill->ill_mcast_type = IGMP_V2_ROUTER;
551 		}
552 		ill->ill_mcast_v2_time = 0;
553 		ill->ill_mcast_v2_tset = 1;
554 
555 		timer = DSEC_TO_MSEC((int)igmpa->igmpa_code);
556 	}
557 
558 	if (ip_debug > 1) {
559 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
560 		    "igmp_input: TIMER = igmp_code %d igmp_type 0x%x",
561 		    (int)ntohs(igmpa->igmpa_code),
562 		    (int)ntohs(igmpa->igmpa_type));
563 	}
564 
565 	/*
566 	 * -Start the timers in all of our membership records
567 	 *  for the physical interface on which the query
568 	 *  arrived, excluding those that belong to the "all
569 	 *  hosts" group (224.0.0.1).
570 	 *
571 	 * -Restart any timer that is already running but has
572 	 *  a value longer than the requested timeout.
573 	 *
574 	 * -Use the value specified in the query message as
575 	 *  the maximum timeout.
576 	 */
577 	next = (unsigned)INFINITY;
578 
579 	current = CURRENT_MSTIME;
580 	for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
581 
582 		/*
583 		 * A multicast router joins INADDR_ANY address
584 		 * to enable promiscuous reception of all
585 		 * mcasts from the interface. This INADDR_ANY
586 		 * is stored in the ilm_v6addr as V6 unspec addr
587 		 */
588 		if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr))
589 			continue;
590 		if (ilm->ilm_addr == htonl(INADDR_ANY))
591 			continue;
592 		if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) &&
593 		    (igmpa->igmpa_group == 0) ||
594 		    (igmpa->igmpa_group == ilm->ilm_addr)) {
595 			if (ilm->ilm_timer > timer) {
596 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
597 				if (ilm->ilm_timer < next)
598 					next = ilm->ilm_timer;
599 				ilm->ilm_timer += current;
600 			}
601 		}
602 	}
603 	rw_exit(&ill->ill_mcast_lock);
604 	/*
605 	 * No packets have been sent above - no
606 	 * ill_mcast_send_queued is needed.
607 	 */
608 	ill_mcast_timer_start(ill->ill_ipst);
609 
610 	return (next);
611 }
612 
613 static uint_t
614 igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen)
615 {
616 	uint_t		i, next, mrd, qqi, timer, delay, numsrc;
617 	uint_t		current;
618 	ilm_t		*ilm;
619 	ipaddr_t	*src_array;
620 	uint8_t		qrv;
621 	ip_stack_t	 *ipst;
622 
623 	ipst = ill->ill_ipst;
624 	/* make sure numsrc matches packet size */
625 	numsrc = ntohs(igmp3qa->igmp3qa_numsrc);
626 	if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) {
627 		++ipst->ips_igmpstat.igps_rcv_tooshort;
628 		return (0);
629 	}
630 	src_array = (ipaddr_t *)&igmp3qa[1];
631 
632 	++ipst->ips_igmpstat.igps_rcv_queries;
633 
634 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
635 
636 	if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) {
637 		uint_t hdrval, mant, exp;
638 		hdrval = (uint_t)igmp3qa->igmp3qa_mxrc;
639 		mant = hdrval & IGMP_V3_MAXRT_MANT_MASK;
640 		exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4;
641 		mrd = (mant | 0x10) << (exp + 3);
642 	}
643 	if (mrd == 0)
644 		mrd = MCAST_DEF_QUERY_RESP_INTERVAL;
645 	timer = DSEC_TO_MSEC(mrd);
646 	MCAST_RANDOM_DELAY(delay, timer);
647 	next = (unsigned)INFINITY;
648 	current = CURRENT_MSTIME;
649 
650 	if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0)
651 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
652 	else
653 		ill->ill_mcast_rv = qrv;
654 
655 	if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) {
656 		uint_t hdrval, mant, exp;
657 		hdrval = (uint_t)igmp3qa->igmp3qa_qqic;
658 		mant = hdrval & IGMP_V3_QQI_MANT_MASK;
659 		exp = (hdrval & IGMP_V3_QQI_EXP_MASK) >> 4;
660 		qqi = (mant | 0x10) << (exp + 3);
661 	}
662 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
663 
664 	/*
665 	 * If we have a pending general query response that's scheduled
666 	 * sooner than the delay we calculated for this response, then
667 	 * no action is required (RFC3376 section 5.2 rule 1)
668 	 */
669 	if (ill->ill_global_timer < (current + delay)) {
670 		rw_exit(&ill->ill_mcast_lock);
671 		ill_mcast_timer_start(ill->ill_ipst);
672 		return (next);
673 	}
674 
675 	/*
676 	 * Now take action depending upon query type:
677 	 * general, group specific, or group/source specific.
678 	 */
679 	if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) {
680 		/*
681 		 * general query
682 		 * We know global timer is either not running or is
683 		 * greater than our calculated delay, so reset it to
684 		 * our delay (random value in range [0, response time]).
685 		 */
686 		ill->ill_global_timer =  current + delay;
687 		next = delay;
688 	} else {
689 		/* group or group/source specific query */
690 		for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) {
691 			if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) ||
692 			    (ilm->ilm_addr == htonl(INADDR_ANY)) ||
693 			    (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) ||
694 			    (igmp3qa->igmp3qa_group != ilm->ilm_addr))
695 				continue;
696 			/*
697 			 * If the query is group specific or we have a
698 			 * pending group specific query, the response is
699 			 * group specific (pending sources list should be
700 			 * empty).  Otherwise, need to update the pending
701 			 * sources list for the group and source specific
702 			 * response.
703 			 */
704 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
705 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
706 group_query:
707 				FREE_SLIST(ilm->ilm_pendsrcs);
708 				ilm->ilm_pendsrcs = NULL;
709 			} else {
710 				boolean_t overflow;
711 				slist_t *pktl;
712 				if (numsrc > MAX_FILTER_SIZE ||
713 				    (ilm->ilm_pendsrcs == NULL &&
714 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
715 					/*
716 					 * We've been sent more sources than
717 					 * we can deal with; or we can't deal
718 					 * with a source list at all.  Revert
719 					 * to a group specific query.
720 					 */
721 					goto group_query;
722 				}
723 				if ((pktl = l_alloc()) == NULL)
724 					goto group_query;
725 				pktl->sl_numsrc = numsrc;
726 				for (i = 0; i < numsrc; i++)
727 					IN6_IPADDR_TO_V4MAPPED(src_array[i],
728 					    &(pktl->sl_addr[i]));
729 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
730 				    &overflow);
731 				l_free(pktl);
732 				if (overflow)
733 					goto group_query;
734 			}
735 
736 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
737 			    INFINITY : (ilm->ilm_timer - current);
738 			/* choose soonest timer */
739 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
740 			if (ilm->ilm_timer < next)
741 				next = ilm->ilm_timer;
742 			ilm->ilm_timer += current;
743 		}
744 	}
745 	rw_exit(&ill->ill_mcast_lock);
746 	/*
747 	 * No packets have been sent above - no
748 	 * ill_mcast_send_queued is needed.
749 	 */
750 	ill_mcast_timer_start(ill->ill_ipst);
751 
752 	return (next);
753 }
754 
755 /*
756  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
757  * and it gets sent after the lock is dropped.
758  */
759 void
760 igmp_joingroup(ilm_t *ilm)
761 {
762 	uint_t	timer;
763 	ill_t	*ill;
764 	ip_stack_t	*ipst = ilm->ilm_ipst;
765 
766 	ill = ilm->ilm_ill;
767 
768 	ASSERT(!ill->ill_isv6);
769 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
770 
771 	if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) {
772 		ilm->ilm_rtx.rtx_timer = INFINITY;
773 		ilm->ilm_state = IGMP_OTHERMEMBER;
774 	} else {
775 		ip1dbg(("Querier mode %d, sending report, group %x\n",
776 		    ill->ill_mcast_type, htonl(ilm->ilm_addr)));
777 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
778 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
779 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
780 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
781 		} else if (ill->ill_mcast_type == IGMP_V3_ROUTER) {
782 			mrec_t *rp;
783 			mcast_record_t rtype;
784 			/*
785 			 * The possible state changes we need to handle here:
786 			 *   Old State	New State	Report
787 			 *
788 			 *   INCLUDE(0)	INCLUDE(X)	ALLOW(X),BLOCK(0)
789 			 *   INCLUDE(0)	EXCLUDE(X)	TO_EX(X)
790 			 *
791 			 * No need to send the BLOCK(0) report; ALLOW(X)
792 			 * is enough.
793 			 */
794 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
795 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
796 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
797 			    ilm->ilm_filter, NULL);
798 			igmpv3_sendrpt(ill, rp);
799 			/*
800 			 * Set up retransmission state.  Timer is set below,
801 			 * for both v3 and older versions.
802 			 */
803 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
804 			    ilm->ilm_filter);
805 		}
806 
807 		/* Set the ilm timer value */
808 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
809 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
810 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
811 		timer = ilm->ilm_rtx.rtx_timer;
812 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
813 		ilm->ilm_state = IGMP_IREPORTEDLAST;
814 
815 		/*
816 		 * We are holding ill_mcast_lock here and the timeout
817 		 * handler (igmp_timeout_handler_per_ill) acquires that
818 		 * lock. Hence we can't call igmp_start_timers since it could
819 		 * deadlock in untimeout().
820 		 * Instead the thread which drops ill_mcast_lock will have
821 		 * to call ill_mcast_timer_start().
822 		 */
823 		mutex_enter(&ipst->ips_igmp_timer_lock);
824 		ipst->ips_igmp_deferred_next = MIN(timer,
825 		    ipst->ips_igmp_deferred_next);
826 		mutex_exit(&ipst->ips_igmp_timer_lock);
827 	}
828 
829 	if (ip_debug > 1) {
830 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
831 		    "igmp_joingroup: multicast_type %d timer %d",
832 		    (ilm->ilm_ill->ill_mcast_type),
833 		    (int)ntohl(timer));
834 	}
835 }
836 
837 /*
838  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
839  * and it gets sent after the lock is dropped.
840  */
841 void
842 mld_joingroup(ilm_t *ilm)
843 {
844 	uint_t	timer;
845 	ill_t	*ill;
846 	ip_stack_t	*ipst = ilm->ilm_ipst;
847 
848 	ill = ilm->ilm_ill;
849 
850 	ASSERT(ill->ill_isv6);
851 
852 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
853 
854 	if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) {
855 		ilm->ilm_rtx.rtx_timer = INFINITY;
856 		ilm->ilm_state = IGMP_OTHERMEMBER;
857 	} else {
858 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
859 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
860 		} else {
861 			mrec_t *rp;
862 			mcast_record_t rtype;
863 			/*
864 			 * The possible state changes we need to handle here:
865 			 *	Old State   New State	Report
866 			 *
867 			 *	INCLUDE(0)  INCLUDE(X)	ALLOW(X),BLOCK(0)
868 			 *	INCLUDE(0)  EXCLUDE(X)	TO_EX(X)
869 			 *
870 			 * No need to send the BLOCK(0) report; ALLOW(X)
871 			 * is enough
872 			 */
873 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
874 			    ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE;
875 			rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
876 			    ilm->ilm_filter, NULL);
877 			mldv2_sendrpt(ill, rp);
878 			/*
879 			 * Set up retransmission state.  Timer is set below,
880 			 * for both v2 and v1.
881 			 */
882 			mcast_init_rtx(ill, &ilm->ilm_rtx, rtype,
883 			    ilm->ilm_filter);
884 		}
885 
886 		/* Set the ilm timer value */
887 		ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER ||
888 		    ilm->ilm_rtx.rtx_cnt > 0);
889 
890 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
891 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
892 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
893 		timer = ilm->ilm_rtx.rtx_timer;
894 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
895 		ilm->ilm_state = IGMP_IREPORTEDLAST;
896 
897 		/*
898 		 * We are holding ill_mcast_lock here and the timeout
899 		 * handler (mld_timeout_handler_per_ill) acquires that
900 		 * lock. Hence we can't call mld_start_timers since it could
901 		 * deadlock in untimeout().
902 		 * Instead the thread which drops ill_mcast_lock will have
903 		 * to call ill_mcast_timer_start().
904 		 */
905 		mutex_enter(&ipst->ips_mld_timer_lock);
906 		ipst->ips_mld_deferred_next = MIN(timer,
907 		    ipst->ips_mld_deferred_next);
908 		mutex_exit(&ipst->ips_mld_timer_lock);
909 	}
910 
911 	if (ip_debug > 1) {
912 		(void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE,
913 		    "mld_joingroup: multicast_type %d timer %d",
914 		    (ilm->ilm_ill->ill_mcast_type),
915 		    (int)ntohl(timer));
916 	}
917 }
918 
919 /*
920  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
921  * and it gets sent after the lock is dropped.
922  */
923 void
924 igmp_leavegroup(ilm_t *ilm)
925 {
926 	ill_t *ill = ilm->ilm_ill;
927 
928 	ASSERT(!ill->ill_isv6);
929 
930 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
931 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
932 	    ill->ill_mcast_type == IGMP_V2_ROUTER &&
933 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
934 		igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP,
935 		    (htonl(INADDR_ALLRTRS_GROUP)));
936 		return;
937 	}
938 	if ((ill->ill_mcast_type == IGMP_V3_ROUTER) &&
939 	    (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) {
940 		mrec_t *rp;
941 		/*
942 		 * The possible state changes we need to handle here:
943 		 *	Old State	New State	Report
944 		 *
945 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
946 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
947 		 *
948 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
949 		 */
950 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
951 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
952 			    ilm->ilm_filter, NULL);
953 		} else {
954 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
955 			    NULL, NULL);
956 		}
957 		igmpv3_sendrpt(ill, rp);
958 		return;
959 	}
960 }
961 
962 /*
963  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
964  * and it gets sent after the lock is dropped.
965  */
966 void
967 mld_leavegroup(ilm_t *ilm)
968 {
969 	ill_t *ill = ilm->ilm_ill;
970 
971 	ASSERT(ill->ill_isv6);
972 
973 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
974 	if (ilm->ilm_state == IGMP_IREPORTEDLAST &&
975 	    ill->ill_mcast_type == MLD_V1_ROUTER &&
976 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
977 		mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast);
978 		return;
979 	}
980 	if ((ill->ill_mcast_type == MLD_V2_ROUTER) &&
981 	    (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) {
982 		mrec_t *rp;
983 		/*
984 		 * The possible state changes we need to handle here:
985 		 *	Old State	New State	Report
986 		 *
987 		 *	INCLUDE(X)	INCLUDE(0)	ALLOW(0),BLOCK(X)
988 		 *	EXCLUDE(X)	INCLUDE(0)	TO_IN(0)
989 		 *
990 		 * No need to send the ALLOW(0) report; BLOCK(X) is enough
991 		 */
992 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
993 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
994 			    ilm->ilm_filter, NULL);
995 		} else {
996 			rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr,
997 			    NULL, NULL);
998 		}
999 		mldv2_sendrpt(ill, rp);
1000 		return;
1001 	}
1002 }
1003 
1004 /*
1005  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1006  * and it gets sent after the lock is dropped.
1007  */
1008 void
1009 igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1010 {
1011 	ill_t *ill;
1012 	mrec_t *rp;
1013 	ip_stack_t	*ipst = ilm->ilm_ipst;
1014 
1015 	ASSERT(ilm != NULL);
1016 
1017 	/* state change reports should only be sent if the router is v3 */
1018 	if (ilm->ilm_ill->ill_mcast_type != IGMP_V3_ROUTER)
1019 		return;
1020 
1021 	ill = ilm->ilm_ill;
1022 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1023 
1024 	/*
1025 	 * Compare existing(old) state with the new state and prepare
1026 	 * State Change Report, according to the rules in RFC 3376:
1027 	 *
1028 	 *	Old State	New State	State Change Report
1029 	 *
1030 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1031 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1032 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1033 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1034 	 */
1035 
1036 	if (ilm->ilm_fmode == fmode) {
1037 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1038 		slist_t *allow, *block;
1039 		if (((a_minus_b = l_alloc()) == NULL) ||
1040 		    ((b_minus_a = l_alloc()) == NULL)) {
1041 			l_free(a_minus_b);
1042 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1043 				goto send_to_ex;
1044 			else
1045 				goto send_to_in;
1046 		}
1047 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1048 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1049 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1050 			allow = b_minus_a;
1051 			block = a_minus_b;
1052 		} else {
1053 			allow = a_minus_b;
1054 			block = b_minus_a;
1055 		}
1056 		rp = NULL;
1057 		if (!SLIST_IS_EMPTY(allow))
1058 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1059 			    allow, rp);
1060 		if (!SLIST_IS_EMPTY(block))
1061 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1062 			    block, rp);
1063 		l_free(a_minus_b);
1064 		l_free(b_minus_a);
1065 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1066 send_to_ex:
1067 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1068 		    NULL);
1069 	} else {
1070 send_to_in:
1071 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1072 		    NULL);
1073 	}
1074 
1075 	/*
1076 	 * Need to set up retransmission state; merge the new info with the
1077 	 * current state (which may be null).  If the timer is not currently
1078 	 * running, the caller will start it when dropping ill_mcast_lock.
1079 	 */
1080 	rp = mcast_merge_rtx(ilm, rp, flist);
1081 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1082 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1083 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1084 		    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1085 		mutex_enter(&ipst->ips_igmp_timer_lock);
1086 		ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next,
1087 		    ilm->ilm_rtx.rtx_timer);
1088 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1089 		mutex_exit(&ipst->ips_igmp_timer_lock);
1090 	}
1091 
1092 	igmpv3_sendrpt(ill, rp);
1093 }
1094 
1095 /*
1096  * Caller holds ill_mcast_lock. We queue the packet using ill_mcast_queue
1097  * and it gets sent after the lock is dropped.
1098  */
1099 void
1100 mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist)
1101 {
1102 	ill_t *ill;
1103 	mrec_t *rp = NULL;
1104 	ip_stack_t	*ipst = ilm->ilm_ipst;
1105 
1106 	ASSERT(ilm != NULL);
1107 
1108 	ill = ilm->ilm_ill;
1109 	ASSERT(RW_WRITE_HELD(&ill->ill_mcast_lock));
1110 
1111 	/* only need to send if we have an mldv2-capable router */
1112 	if (ill->ill_mcast_type != MLD_V2_ROUTER) {
1113 		return;
1114 	}
1115 
1116 	/*
1117 	 * Compare existing (old) state with the new state passed in
1118 	 * and send appropriate MLDv2 State Change Report.
1119 	 *
1120 	 *	Old State	New State	State Change Report
1121 	 *
1122 	 *	INCLUDE(A)	INCLUDE(B)	ALLOW(B-A),BLOCK(A-B)
1123 	 *	EXCLUDE(A)	EXCLUDE(B)	ALLOW(A-B),BLOCK(B-A)
1124 	 *	INCLUDE(A)	EXCLUDE(B)	TO_EX(B)
1125 	 *	EXCLUDE(A)	INCLUDE(B)	TO_IN(B)
1126 	 */
1127 	if (ilm->ilm_fmode == fmode) {
1128 		slist_t	*a_minus_b = NULL, *b_minus_a = NULL;
1129 		slist_t *allow, *block;
1130 		if (((a_minus_b = l_alloc()) == NULL) ||
1131 		    ((b_minus_a = l_alloc()) == NULL)) {
1132 			l_free(a_minus_b);
1133 			if (ilm->ilm_fmode == MODE_IS_INCLUDE)
1134 				goto send_to_ex;
1135 			else
1136 				goto send_to_in;
1137 		}
1138 		l_difference(ilm->ilm_filter, flist, a_minus_b);
1139 		l_difference(flist, ilm->ilm_filter, b_minus_a);
1140 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1141 			allow = b_minus_a;
1142 			block = a_minus_b;
1143 		} else {
1144 			allow = a_minus_b;
1145 			block = b_minus_a;
1146 		}
1147 		if (!SLIST_IS_EMPTY(allow))
1148 			rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
1149 			    allow, rp);
1150 		if (!SLIST_IS_EMPTY(block))
1151 			rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
1152 			    block, rp);
1153 		l_free(a_minus_b);
1154 		l_free(b_minus_a);
1155 	} else if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1156 send_to_ex:
1157 		rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist,
1158 		    NULL);
1159 	} else {
1160 send_to_in:
1161 		rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist,
1162 		    NULL);
1163 	}
1164 
1165 	/*
1166 	 * Need to set up retransmission state; merge the new info with the
1167 	 * current state (which may be null).  If the timer is not currently
1168 	 * running, the caller will start it when dropping ill_mcast_lock.
1169 	 */
1170 	rp = mcast_merge_rtx(ilm, rp, flist);
1171 	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
1172 	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
1173 		ilm->ilm_rtx.rtx_cnt = ill->ill_mcast_rv;
1174 		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
1175 		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1176 		mutex_enter(&ipst->ips_mld_timer_lock);
1177 		ipst->ips_mld_deferred_next =
1178 		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
1179 		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
1180 		mutex_exit(&ipst->ips_mld_timer_lock);
1181 	}
1182 
1183 	mldv2_sendrpt(ill, rp);
1184 }
1185 
1186 uint_t
1187 igmp_timeout_handler_per_ill(ill_t *ill)
1188 {
1189 	uint_t	next = INFINITY, current;
1190 	ilm_t	*ilm;
1191 	mrec_t	*rp = NULL;
1192 	mrec_t	*rtxrp = NULL;
1193 	rtx_state_t *rtxp;
1194 	mcast_record_t	rtype;
1195 
1196 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1197 
1198 	current = CURRENT_MSTIME;
1199 	/* First check the global timer on this interface */
1200 	if (ill->ill_global_timer == INFINITY)
1201 		goto per_ilm_timer;
1202 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1203 		ill->ill_global_timer = INFINITY;
1204 		/*
1205 		 * Send report for each group on this interface.
1206 		 * Since we just set the global timer (received a v3 general
1207 		 * query), need to skip the all hosts addr (224.0.0.1), per
1208 		 * RFC 3376 section 5.
1209 		 */
1210 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1211 			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
1212 				continue;
1213 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1214 			    ilm->ilm_filter, rp);
1215 			/*
1216 			 * Since we're sending a report on this group, okay
1217 			 * to delete pending group-specific timers.  Note
1218 			 * that group-specific retransmit timers still need
1219 			 * to be checked in the per_ilm_timer for-loop.
1220 			 */
1221 			ilm->ilm_timer = INFINITY;
1222 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1223 			FREE_SLIST(ilm->ilm_pendsrcs);
1224 			ilm->ilm_pendsrcs = NULL;
1225 		}
1226 		igmpv3_sendrpt(ill, rp);
1227 		rp = NULL;
1228 	} else {
1229 		if ((ill->ill_global_timer - current) < next)
1230 			next = ill->ill_global_timer - current;
1231 	}
1232 
1233 per_ilm_timer:
1234 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1235 		if (ilm->ilm_timer == INFINITY)
1236 			goto per_ilm_rtxtimer;
1237 
1238 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1239 			if ((ilm->ilm_timer - current) < next)
1240 				next = ilm->ilm_timer - current;
1241 
1242 			if (ip_debug > 1) {
1243 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1244 				    "igmp_timo_hlr 2: ilm_timr %d "
1245 				    "typ %d nxt %d",
1246 				    (int)ntohl(ilm->ilm_timer - current),
1247 				    (ill->ill_mcast_type), next);
1248 			}
1249 
1250 			goto per_ilm_rtxtimer;
1251 		}
1252 
1253 		/* the timer has expired, need to take action */
1254 		ilm->ilm_timer = INFINITY;
1255 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1256 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1257 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1258 		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1259 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1260 		} else {
1261 			slist_t *rsp;
1262 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1263 			    (rsp = l_alloc()) != NULL) {
1264 				/*
1265 				 * Contents of reply depend on pending
1266 				 * requested source list.
1267 				 */
1268 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1269 					l_intersection(ilm->ilm_filter,
1270 					    ilm->ilm_pendsrcs, rsp);
1271 				} else {
1272 					l_difference(ilm->ilm_pendsrcs,
1273 					    ilm->ilm_filter, rsp);
1274 				}
1275 				FREE_SLIST(ilm->ilm_pendsrcs);
1276 				ilm->ilm_pendsrcs = NULL;
1277 				if (!SLIST_IS_EMPTY(rsp))
1278 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1279 					    &ilm->ilm_v6addr, rsp, rp);
1280 				FREE_SLIST(rsp);
1281 			} else {
1282 				/*
1283 				 * Either the pending request is just group-
1284 				 * specific, or we couldn't get the resources
1285 				 * (rsp) to build a source-specific reply.
1286 				 */
1287 				rp = mcast_bldmrec(ilm->ilm_fmode,
1288 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1289 			}
1290 			igmpv3_sendrpt(ill, rp);
1291 			rp = NULL;
1292 		}
1293 
1294 per_ilm_rtxtimer:
1295 		rtxp = &ilm->ilm_rtx;
1296 
1297 		if (rtxp->rtx_timer == INFINITY)
1298 			continue;
1299 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1300 			if ((rtxp->rtx_timer - current) < next)
1301 				next = rtxp->rtx_timer - current;
1302 			continue;
1303 		}
1304 
1305 		rtxp->rtx_timer = INFINITY;
1306 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1307 		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
1308 			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
1309 			continue;
1310 		}
1311 		if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
1312 			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
1313 			continue;
1314 		}
1315 
1316 		/*
1317 		 * The retransmit timer has popped, and our router is
1318 		 * IGMPv3.  We have to delve into the retransmit state
1319 		 * stored in the ilm.
1320 		 *
1321 		 * Decrement the retransmit count.  If the fmode rtx
1322 		 * count is active, decrement it, and send a filter
1323 		 * mode change report with the ilm's source list.
1324 		 * Otherwise, send a source list change report with
1325 		 * the current retransmit lists.
1326 		 */
1327 		ASSERT(rtxp->rtx_cnt > 0);
1328 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1329 		rtxp->rtx_cnt--;
1330 		if (rtxp->rtx_fmode_cnt > 0) {
1331 			rtxp->rtx_fmode_cnt--;
1332 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1333 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1334 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1335 			    ilm->ilm_filter, rtxrp);
1336 		} else {
1337 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1338 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1339 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1340 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1341 		}
1342 		if (rtxp->rtx_cnt > 0) {
1343 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1344 			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
1345 			if (rtxp->rtx_timer < next)
1346 				next = rtxp->rtx_timer;
1347 			rtxp->rtx_timer += current;
1348 		} else {
1349 			ASSERT(rtxp->rtx_timer == INFINITY);
1350 			CLEAR_SLIST(rtxp->rtx_allow);
1351 			CLEAR_SLIST(rtxp->rtx_block);
1352 		}
1353 		igmpv3_sendrpt(ill, rtxrp);
1354 		rtxrp = NULL;
1355 	}
1356 
1357 	rw_exit(&ill->ill_mcast_lock);
1358 	/* Send any deferred/queued IP packets */
1359 	ill_mcast_send_queued(ill);
1360 	/* Defer ill_mcast_timer_start() until the caller is done */
1361 
1362 	return (next);
1363 }
1364 
1365 /*
1366  * igmp_timeout_handler:
1367  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1368  * Returns number of ticks to next event (or 0 if none).
1369  *
1370  * As part of multicast join and leave igmp we may need to send out an
1371  * igmp request. The igmp related state variables in the ilm are protected
1372  * by ill_mcast_lock. A single global igmp timer is used to track igmp timeouts.
1373  * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
1374  * starts the igmp timer if needed. It serializes multiple threads trying to
1375  * simultaneously start the timer using the igmp_timer_setter_active flag.
1376  *
1377  * igmp_input() receives igmp queries and responds to the queries
1378  * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
1379  * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
1380  * performs the action exclusively after acquiring ill_mcast_lock.
1381  *
1382  * The igmp_slowtimeo() function is called thru another timer.
1383  * igmp_slowtimeout_lock protects the igmp_slowtimeout_id
1384  */
1385 void
1386 igmp_timeout_handler(void *arg)
1387 {
1388 	ill_t	*ill;
1389 	uint_t  global_next = INFINITY;
1390 	uint_t  next;
1391 	ill_walk_context_t ctx;
1392 	ip_stack_t *ipst = arg;
1393 
1394 	ASSERT(arg != NULL);
1395 	mutex_enter(&ipst->ips_igmp_timer_lock);
1396 	ASSERT(ipst->ips_igmp_timeout_id != 0);
1397 	ipst->ips_igmp_timeout_id = 0;
1398 	ipst->ips_igmp_timer_scheduled_last = 0;
1399 	ipst->ips_igmp_time_to_next = 0;
1400 	mutex_exit(&ipst->ips_igmp_timer_lock);
1401 
1402 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1403 	ill = ILL_START_WALK_V4(&ctx, ipst);
1404 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1405 		ASSERT(!ill->ill_isv6);
1406 		/* Make sure the ill isn't going away. */
1407 		if (!ill_check_and_refhold(ill))
1408 			continue;
1409 		rw_exit(&ipst->ips_ill_g_lock);
1410 		next = igmp_timeout_handler_per_ill(ill);
1411 		if (next < global_next)
1412 			global_next = next;
1413 		ill_refrele(ill);
1414 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1415 	}
1416 	rw_exit(&ipst->ips_ill_g_lock);
1417 	if (global_next != INFINITY)
1418 		igmp_start_timers(global_next, ipst);
1419 }
1420 
1421 /*
1422  * mld_timeout_handler:
1423  * Called when there are timeout events, every next (tick).
1424  * Returns number of ticks to next event (or 0 if none).
1425  */
1426 uint_t
1427 mld_timeout_handler_per_ill(ill_t *ill)
1428 {
1429 	ilm_t 	*ilm;
1430 	uint_t	next = INFINITY, current;
1431 	mrec_t	*rp, *rtxrp;
1432 	rtx_state_t *rtxp;
1433 	mcast_record_t	rtype;
1434 
1435 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1436 
1437 	current = CURRENT_MSTIME;
1438 	/*
1439 	 * First check the global timer on this interface; the global timer
1440 	 * is not used for MLDv1, so if it's set we can assume we're v2.
1441 	 */
1442 	if (ill->ill_global_timer == INFINITY)
1443 		goto per_ilm_timer;
1444 	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
1445 		ill->ill_global_timer = INFINITY;
1446 		/*
1447 		 * Send report for each group on this interface.
1448 		 * Since we just set the global timer (received a v2 general
1449 		 * query), need to skip the all hosts addr (ff02::1), per
1450 		 * RFC 3810 section 6.
1451 		 */
1452 		rp = NULL;
1453 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1454 			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
1455 			    &ipv6_all_hosts_mcast))
1456 				continue;
1457 			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
1458 			    ilm->ilm_filter, rp);
1459 			/*
1460 			 * Since we're sending a report on this group, okay
1461 			 * to delete pending group-specific timers.  Note
1462 			 * that group-specific retransmit timers still need
1463 			 * to be checked in the per_ilm_timer for-loop.
1464 			 */
1465 			ilm->ilm_timer = INFINITY;
1466 			ilm->ilm_state = IGMP_IREPORTEDLAST;
1467 			FREE_SLIST(ilm->ilm_pendsrcs);
1468 			ilm->ilm_pendsrcs = NULL;
1469 		}
1470 		mldv2_sendrpt(ill, rp);
1471 	} else {
1472 		if ((ill->ill_global_timer - current) < next)
1473 			next = ill->ill_global_timer - current;
1474 	}
1475 
1476 per_ilm_timer:
1477 	rp = rtxrp = NULL;
1478 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
1479 		if (ilm->ilm_timer == INFINITY)
1480 			goto per_ilm_rtxtimer;
1481 
1482 		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
1483 			if ((ilm->ilm_timer - current) < next)
1484 				next = ilm->ilm_timer - current;
1485 
1486 			if (ip_debug > 1) {
1487 				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
1488 				    "igmp_timo_hlr 2: ilm_timr"
1489 				    " %d typ %d nxt %d",
1490 				    (int)ntohl(ilm->ilm_timer - current),
1491 				    (ill->ill_mcast_type), next);
1492 			}
1493 
1494 			goto per_ilm_rtxtimer;
1495 		}
1496 
1497 		/* the timer has expired, need to take action */
1498 		ilm->ilm_timer = INFINITY;
1499 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1500 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1501 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1502 		} else {
1503 			slist_t *rsp;
1504 			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
1505 			    (rsp = l_alloc()) != NULL) {
1506 				/*
1507 				 * Contents of reply depend on pending
1508 				 * requested source list.
1509 				 */
1510 				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
1511 					l_intersection(ilm->ilm_filter,
1512 					    ilm->ilm_pendsrcs, rsp);
1513 				} else {
1514 					l_difference(ilm->ilm_pendsrcs,
1515 					    ilm->ilm_filter, rsp);
1516 				}
1517 				FREE_SLIST(ilm->ilm_pendsrcs);
1518 				ilm->ilm_pendsrcs = NULL;
1519 				if (!SLIST_IS_EMPTY(rsp))
1520 					rp = mcast_bldmrec(MODE_IS_INCLUDE,
1521 					    &ilm->ilm_v6addr, rsp, rp);
1522 				FREE_SLIST(rsp);
1523 			} else {
1524 				rp = mcast_bldmrec(ilm->ilm_fmode,
1525 				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
1526 			}
1527 		}
1528 
1529 per_ilm_rtxtimer:
1530 		rtxp = &ilm->ilm_rtx;
1531 
1532 		if (rtxp->rtx_timer == INFINITY)
1533 			continue;
1534 		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
1535 			if ((rtxp->rtx_timer - current) < next)
1536 				next = rtxp->rtx_timer - current;
1537 			continue;
1538 		}
1539 
1540 		rtxp->rtx_timer = INFINITY;
1541 		ilm->ilm_state = IGMP_IREPORTEDLAST;
1542 		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
1543 			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
1544 			continue;
1545 		}
1546 
1547 		/*
1548 		 * The retransmit timer has popped, and our router is
1549 		 * MLDv2.  We have to delve into the retransmit state
1550 		 * stored in the ilm.
1551 		 *
1552 		 * Decrement the retransmit count.  If the fmode rtx
1553 		 * count is active, decrement it, and send a filter
1554 		 * mode change report with the ilm's source list.
1555 		 * Otherwise, send a source list change report with
1556 		 * the current retransmit lists.
1557 		 */
1558 		ASSERT(rtxp->rtx_cnt > 0);
1559 		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
1560 		rtxp->rtx_cnt--;
1561 		if (rtxp->rtx_fmode_cnt > 0) {
1562 			rtxp->rtx_fmode_cnt--;
1563 			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
1564 			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
1565 			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
1566 			    ilm->ilm_filter, rtxrp);
1567 		} else {
1568 			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
1569 			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
1570 			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
1571 			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
1572 		}
1573 		if (rtxp->rtx_cnt > 0) {
1574 			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
1575 			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
1576 			if (rtxp->rtx_timer < next)
1577 				next = rtxp->rtx_timer;
1578 			rtxp->rtx_timer += current;
1579 		} else {
1580 			ASSERT(rtxp->rtx_timer == INFINITY);
1581 			CLEAR_SLIST(rtxp->rtx_allow);
1582 			CLEAR_SLIST(rtxp->rtx_block);
1583 		}
1584 	}
1585 
1586 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
1587 		mldv2_sendrpt(ill, rp);
1588 		mldv2_sendrpt(ill, rtxrp);
1589 	}
1590 	rw_exit(&ill->ill_mcast_lock);
1591 	/* Send any deferred/queued IP packets */
1592 	ill_mcast_send_queued(ill);
1593 	/* Defer ill_mcast_timer_start() until the caller is done */
1594 
1595 	return (next);
1596 }
1597 
1598 /*
1599  * mld_timeout_handler:
1600  * Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
1601  * Returns number of ticks to next event (or 0 if none).
1602  * MT issues are same as igmp_timeout_handler
1603  */
1604 void
1605 mld_timeout_handler(void *arg)
1606 {
1607 	ill_t	*ill;
1608 	uint_t  global_next = INFINITY;
1609 	uint_t  next;
1610 	ill_walk_context_t ctx;
1611 	ip_stack_t *ipst = arg;
1612 
1613 	ASSERT(arg != NULL);
1614 	mutex_enter(&ipst->ips_mld_timer_lock);
1615 	ASSERT(ipst->ips_mld_timeout_id != 0);
1616 	ipst->ips_mld_timeout_id = 0;
1617 	ipst->ips_mld_timer_scheduled_last = 0;
1618 	ipst->ips_mld_time_to_next = 0;
1619 	mutex_exit(&ipst->ips_mld_timer_lock);
1620 
1621 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1622 	ill = ILL_START_WALK_V6(&ctx, ipst);
1623 	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
1624 		ASSERT(ill->ill_isv6);
1625 		/* Make sure the ill isn't going away. */
1626 		if (!ill_check_and_refhold(ill))
1627 			continue;
1628 		rw_exit(&ipst->ips_ill_g_lock);
1629 		next = mld_timeout_handler_per_ill(ill);
1630 		if (next < global_next)
1631 			global_next = next;
1632 		ill_refrele(ill);
1633 		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1634 	}
1635 	rw_exit(&ipst->ips_ill_g_lock);
1636 	if (global_next != INFINITY)
1637 		mld_start_timers(global_next, ipst);
1638 }
1639 
/*
 * Older Version Querier Present timeout for the given ill, expressed as a
 * number of slowtimo intervals: (robustness variable * query interval +
 * query response interval), scaled by 1000 and divided by
 * MCAST_SLOWTIMO_INTERVAL.
 */
#define	OVQP(ill)							\
	((1000 * ((ill)->ill_mcast_rv * (ill)->ill_mcast_qi +		\
	MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)
1647 
1648 /*
1649  * igmp_slowtimo:
1650  * - Resets to new router if we didnt we hear from the router
1651  *   in IGMP_AGE_THRESHOLD seconds.
1652  * - Resets slowtimeout.
1653  * Check for ips_igmp_max_version ensures that we don't revert to a higher
1654  * IGMP version than configured.
1655  */
1656 void
1657 igmp_slowtimo(void *arg)
1658 {
1659 	ill_t	*ill;
1660 	ill_if_t *ifp;
1661 	avl_tree_t *avl_tree;
1662 	ip_stack_t *ipst = (ip_stack_t *)arg;
1663 
1664 	ASSERT(arg != NULL);
1665 
1666 	/*
1667 	 * The ill_if_t list is circular, hence the odd loop parameters.
1668 	 *
1669 	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
1670 	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
1671 	 * structure (allowing us to skip if none of the instances have timers
1672 	 * running).
1673 	 */
1674 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1675 	for (ifp = IP_V4_ILL_G_LIST(ipst);
1676 	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
1677 	    ifp = ifp->illif_next) {
1678 		/*
1679 		 * illif_mcast_v[12] are set using atomics. If an ill hears
1680 		 * a V1 or V2 query now and we miss seeing the count now,
1681 		 * we will see it the next time igmp_slowtimo is called.
1682 		 */
1683 		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
1684 			continue;
1685 
1686 		avl_tree = &ifp->illif_avl_by_ppa;
1687 		for (ill = avl_first(avl_tree); ill != NULL;
1688 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1689 			/* Make sure the ill isn't going away. */
1690 			if (!ill_check_and_refhold(ill))
1691 				continue;
1692 			rw_exit(&ipst->ips_ill_g_lock);
1693 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1694 			if (ill->ill_mcast_v1_tset == 1)
1695 				ill->ill_mcast_v1_time++;
1696 			if (ill->ill_mcast_v2_tset == 1)
1697 				ill->ill_mcast_v2_time++;
1698 			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
1699 			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
1700 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1701 				if ((ill->ill_mcast_v2_tset > 0) ||
1702 				    (ipst->ips_igmp_max_version ==
1703 				    IGMP_V2_ROUTER)) {
1704 					ip1dbg(("V1 query timer "
1705 					    "expired on %s; switching "
1706 					    "mode to IGMP_V2\n",
1707 					    ill->ill_name));
1708 					ill->ill_mcast_type =
1709 					    IGMP_V2_ROUTER;
1710 				} else {
1711 					ip1dbg(("V1 query timer "
1712 					    "expired on %s; switching "
1713 					    "mode to IGMP_V3\n",
1714 					    ill->ill_name));
1715 					ill->ill_mcast_type =
1716 					    IGMP_V3_ROUTER;
1717 				}
1718 				ill->ill_mcast_v1_time = 0;
1719 				ill->ill_mcast_v1_tset = 0;
1720 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1721 			}
1722 			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
1723 			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
1724 			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
1725 				ip1dbg(("V2 query timer expired on "
1726 				    "%s; switching mode to IGMP_V3\n",
1727 				    ill->ill_name));
1728 				ill->ill_mcast_type = IGMP_V3_ROUTER;
1729 				ill->ill_mcast_v2_time = 0;
1730 				ill->ill_mcast_v2_tset = 0;
1731 				atomic_add_16(&ifp->illif_mcast_v2, -1);
1732 			}
1733 			rw_exit(&ill->ill_mcast_lock);
1734 			ill_refrele(ill);
1735 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1736 		}
1737 	}
1738 	rw_exit(&ipst->ips_ill_g_lock);
1739 	ill_mcast_timer_start(ipst);
1740 	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
1741 	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
1742 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1743 	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
1744 }
1745 
1746 /*
1747  * mld_slowtimo:
1748  * - Resets to newer version if we didn't hear from the older version router
1749  *   in MLD_AGE_THRESHOLD seconds.
1750  * - Restarts slowtimeout.
1751  * Check for ips_mld_max_version ensures that we don't revert to a higher
1752  * IGMP version than configured.
1753  */
1754 void
1755 mld_slowtimo(void *arg)
1756 {
1757 	ill_t *ill;
1758 	ill_if_t *ifp;
1759 	avl_tree_t *avl_tree;
1760 	ip_stack_t *ipst = (ip_stack_t *)arg;
1761 
1762 	ASSERT(arg != NULL);
1763 	/* See comments in igmp_slowtimo() above... */
1764 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1765 	for (ifp = IP_V6_ILL_G_LIST(ipst);
1766 	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
1767 	    ifp = ifp->illif_next) {
1768 		if (ifp->illif_mcast_v1 == 0)
1769 			continue;
1770 
1771 		avl_tree = &ifp->illif_avl_by_ppa;
1772 		for (ill = avl_first(avl_tree); ill != NULL;
1773 		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
1774 			/* Make sure the ill isn't going away. */
1775 			if (!ill_check_and_refhold(ill))
1776 				continue;
1777 			rw_exit(&ipst->ips_ill_g_lock);
1778 			rw_enter(&ill->ill_mcast_lock, RW_WRITER);
1779 			if (ill->ill_mcast_v1_tset == 1)
1780 				ill->ill_mcast_v1_time++;
1781 			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
1782 			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
1783 			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
1784 				ip1dbg(("MLD query timer expired on"
1785 				    " %s; switching mode to MLD_V2\n",
1786 				    ill->ill_name));
1787 				ill->ill_mcast_type = MLD_V2_ROUTER;
1788 				ill->ill_mcast_v1_time = 0;
1789 				ill->ill_mcast_v1_tset = 0;
1790 				atomic_add_16(&ifp->illif_mcast_v1, -1);
1791 			}
1792 			rw_exit(&ill->ill_mcast_lock);
1793 			ill_refrele(ill);
1794 			rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1795 		}
1796 	}
1797 	rw_exit(&ipst->ips_ill_g_lock);
1798 	ill_mcast_timer_start(ipst);
1799 	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
1800 	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
1801 	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
1802 	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
1803 }
1804 
1805 /*
1806  * igmp_sendpkt:
1807  * This will send to ip_output_simple just like icmp_inbound.
1808  */
1809 static void
1810 igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
1811 {
1812 	mblk_t	*mp;
1813 	igmpa_t	*igmpa;
1814 	uint8_t *rtralert;
1815 	ipha_t	*ipha;
1816 	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
1817 	size_t	size  = hdrlen + sizeof (igmpa_t);
1818 	ill_t 	*ill  = ilm->ilm_ill;
1819 	ip_stack_t *ipst = ill->ill_ipst;
1820 
1821 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1822 
1823 	mp = allocb(size, BPRI_HI);
1824 	if (mp == NULL) {
1825 		return;
1826 	}
1827 	mp->b_wptr = mp->b_rptr + size;
1828 
1829 	ipha = (ipha_t *)mp->b_rptr;
1830 	rtralert = (uint8_t *)&(ipha[1]);
1831 	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
1832 	igmpa->igmpa_type   = type;
1833 	igmpa->igmpa_code   = 0;
1834 	igmpa->igmpa_group  = ilm->ilm_addr;
1835 	igmpa->igmpa_cksum  = 0;
1836 	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);
1837 
1838 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1839 	rtralert[1] = RTRALERT_LEN;
1840 	rtralert[2] = 0;
1841 	rtralert[3] = 0;
1842 
1843 	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
1844 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
1845 	ipha->ipha_type_of_service 	= 0;
1846 	ipha->ipha_length = htons(size);
1847 	ipha->ipha_ident = 0;
1848 	ipha->ipha_fragment_offset_and_flags = 0;
1849 	ipha->ipha_ttl 		= IGMP_TTL;
1850 	ipha->ipha_protocol 	= IPPROTO_IGMP;
1851 	ipha->ipha_hdr_checksum 	= 0;
1852 	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
1853 	ipha->ipha_src 		= INADDR_ANY;
1854 
1855 	ill_mcast_queue(ill, mp);
1856 
1857 	++ipst->ips_igmpstat.igps_snd_reports;
1858 }
1859 
1860 /*
1861  * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill.
1862  * The report will contain one group record
1863  * for each element of reclist.  If this causes packet length to
1864  * exceed ill->ill_mtu, multiple reports are sent.
1865  * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
1866  * and those buffers are freed here.
1867  */
1868 static void
1869 igmpv3_sendrpt(ill_t *ill, mrec_t *reclist)
1870 {
1871 	igmp3ra_t *igmp3ra;
1872 	grphdra_t *grphdr;
1873 	mblk_t *mp;
1874 	ipha_t *ipha;
1875 	uint8_t *rtralert;
1876 	ipaddr_t *src_array;
1877 	int i, j, numrec, more_src_cnt;
1878 	size_t hdrsize, size, rsize;
1879 	mrec_t *rp, *cur_reclist;
1880 	mrec_t *next_reclist = reclist;
1881 	boolean_t morepkts;
1882 	ip_stack_t	 *ipst = ill->ill_ipst;
1883 
1884 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
1885 
1886 	/* if there aren't any records, there's nothing to send */
1887 	if (reclist == NULL)
1888 		return;
1889 
1890 	hdrsize = sizeof (ipha_t) + RTRALERT_LEN;
1891 nextpkt:
1892 	size = hdrsize + sizeof (igmp3ra_t);
1893 	morepkts = B_FALSE;
1894 	more_src_cnt = 0;
1895 	cur_reclist = next_reclist;
1896 	numrec = 0;
1897 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
1898 		rsize = sizeof (grphdra_t) +
1899 		    (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t));
1900 		if (size + rsize > ill->ill_mtu) {
1901 			if (rp == cur_reclist) {
1902 				/*
1903 				 * If the first mrec we looked at is too big
1904 				 * to fit in a single packet (i.e the source
1905 				 * list is too big), we must either truncate
1906 				 * the list (if TO_EX or IS_EX), or send
1907 				 * multiple reports for the same group (all
1908 				 * other types).
1909 				 */
1910 				int srcspace, srcsperpkt;
1911 				srcspace = ill->ill_mtu - (size +
1912 				    sizeof (grphdra_t));
1913 
1914 				/*
1915 				 * Skip if there's not even enough room in
1916 				 * a single packet to send something useful.
1917 				 */
1918 				if (srcspace <= sizeof (ipaddr_t))
1919 					continue;
1920 
1921 				srcsperpkt = srcspace / sizeof (ipaddr_t);
1922 				/*
1923 				 * Increment size and numrec, because we will
1924 				 * be sending a record for the mrec we're
1925 				 * looking at now.
1926 				 */
1927 				size += sizeof (grphdra_t) +
1928 				    (srcsperpkt * sizeof (ipaddr_t));
1929 				numrec++;
1930 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
1931 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
1932 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1933 					if (rp->mrec_next == NULL) {
1934 						/* no more packets to send */
1935 						break;
1936 					} else {
1937 						/*
1938 						 * more packets, but we're
1939 						 * done with this mrec.
1940 						 */
1941 						next_reclist = rp->mrec_next;
1942 					}
1943 				} else {
1944 					more_src_cnt = rp->mrec_srcs.sl_numsrc
1945 					    - srcsperpkt;
1946 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
1947 					/*
1948 					 * We'll fix up this mrec (remove the
1949 					 * srcs we've already sent) before
1950 					 * returning to nextpkt above.
1951 					 */
1952 					next_reclist = rp;
1953 				}
1954 			} else {
1955 				next_reclist = rp;
1956 			}
1957 			morepkts = B_TRUE;
1958 			break;
1959 		}
1960 		size += rsize;
1961 		numrec++;
1962 	}
1963 
1964 	mp = allocb(size, BPRI_HI);
1965 	if (mp == NULL) {
1966 		goto free_reclist;
1967 	}
1968 	bzero((char *)mp->b_rptr, size);
1969 	mp->b_wptr = (uchar_t *)(mp->b_rptr + size);
1970 
1971 	ipha = (ipha_t *)mp->b_rptr;
1972 	rtralert = (uint8_t *)&(ipha[1]);
1973 	igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]);
1974 	grphdr = (grphdra_t *)&(igmp3ra[1]);
1975 
1976 	rp = cur_reclist;
1977 	for (i = 0; i < numrec; i++) {
1978 		grphdr->grphdra_type = rp->mrec_type;
1979 		grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc);
1980 		grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group);
1981 		src_array = (ipaddr_t *)&(grphdr[1]);
1982 
1983 		for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++)
1984 			src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]);
1985 
1986 		grphdr = (grphdra_t *)&(src_array[j]);
1987 		rp = rp->mrec_next;
1988 	}
1989 
1990 	igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT;
1991 	igmp3ra->igmp3ra_numrec = htons(numrec);
1992 	igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0);
1993 
1994 	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
1995 	rtralert[1] = RTRALERT_LEN;
1996 	rtralert[2] = 0;
1997 	rtralert[3] = 0;
1998 
1999 	ipha->ipha_version_and_hdr_length = IP_VERSION << 4
2000 	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
2001 	ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL;
2002 	ipha->ipha_length = htons(size);
2003 	ipha->ipha_ttl = IGMP_TTL;
2004 	ipha->ipha_protocol = IPPROTO_IGMP;
2005 	ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP);
2006 	ipha->ipha_src = INADDR_ANY;
2007 
2008 	ill_mcast_queue(ill, mp);
2009 
2010 	++ipst->ips_igmpstat.igps_snd_reports;
2011 
2012 	if (morepkts) {
2013 		if (more_src_cnt > 0) {
2014 			int index, mvsize;
2015 			slist_t *sl = &next_reclist->mrec_srcs;
2016 			index = sl->sl_numsrc;
2017 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2018 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2019 			    mvsize);
2020 			sl->sl_numsrc = more_src_cnt;
2021 		}
2022 		goto nextpkt;
2023 	}
2024 
2025 free_reclist:
2026 	while (reclist != NULL) {
2027 		rp = reclist->mrec_next;
2028 		mi_free(reclist);
2029 		reclist = rp;
2030 	}
2031 }
2032 
2033 /*
2034  * mld_input:
2035  * Return NULL for a bad packet that is discarded here.
2036  * Return mp if the message is OK and should be handed to "raw" receivers.
2037  * Callers of mld_input() may need to reinitialize variables that were copied
2038  * from the mblk as this calls pullupmsg().
2039  */
2040 mblk_t *
2041 mld_input(mblk_t *mp, ip_recv_attr_t *ira)
2042 {
2043 	ip6_t		*ip6h = (ip6_t *)(mp->b_rptr);
2044 	mld_hdr_t	*mldh;
2045 	ilm_t		*ilm;
2046 	ipif_t		*ipif;
2047 	uint16_t	hdr_length, exthdr_length;
2048 	in6_addr_t	*v6group_ptr;
2049 	uint_t		next;
2050 	int		mldlen;
2051 	ill_t		*ill = ira->ira_ill;
2052 	ip_stack_t	*ipst = ill->ill_ipst;
2053 
2054 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal);
2055 
2056 	/* Make sure the src address of the packet is link-local */
2057 	if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) {
2058 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2059 		freemsg(mp);
2060 		return (NULL);
2061 	}
2062 
2063 	if (ip6h->ip6_hlim != 1) {
2064 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit);
2065 		freemsg(mp);
2066 		return (NULL);
2067 	}
2068 
2069 	/* Get to the icmp header part */
2070 	hdr_length = ira->ira_ip_hdr_length;
2071 	exthdr_length = hdr_length - IPV6_HDR_LEN;
2072 
2073 	mldlen = ntohs(ip6h->ip6_plen) - exthdr_length;
2074 
2075 	/* An MLD packet must at least be 24 octets to be valid */
2076 	if (mldlen < MLD_MINLEN) {
2077 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2078 		freemsg(mp);
2079 		return (NULL);
2080 	}
2081 
2082 	mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]);
2083 
2084 	switch (mldh->mld_type) {
2085 	case MLD_LISTENER_QUERY:
2086 		/*
2087 		 * packet length differentiates between v1 and v2.  v1
2088 		 * query should be exactly 24 octets long; v2 is >= 28.
2089 		 */
2090 		if ((mldlen == MLD_MINLEN) ||
2091 		    (ipst->ips_mld_max_version < MLD_V2_ROUTER)) {
2092 			next = mld_query_in(mldh, ill);
2093 		} else if (mldlen >= MLD_V2_QUERY_MINLEN) {
2094 			next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen);
2095 		} else {
2096 			BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2097 			freemsg(mp);
2098 			return (NULL);
2099 		}
2100 		if (next == 0) {
2101 			return (mp);
2102 		}
2103 
2104 		if (next != INFINITY)
2105 			mld_start_timers(next, ipst);
2106 		break;
2107 
2108 	case MLD_LISTENER_REPORT:
2109 		/*
2110 		 * For fast leave to work, we have to know that we are the
2111 		 * last person to send a report for this group.  Reports
2112 		 * generated by us are looped back since we could potentially
2113 		 * be a multicast router, so discard reports sourced by me.
2114 		 */
2115 		mutex_enter(&ill->ill_lock);
2116 		for (ipif = ill->ill_ipif; ipif != NULL;
2117 		    ipif = ipif->ipif_next) {
2118 			if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr,
2119 			    &ip6h->ip6_src)) {
2120 				if (ip_debug > 1) {
2121 					char    buf1[INET6_ADDRSTRLEN];
2122 
2123 					(void) mi_strlog(ill->ill_rq,
2124 					    1,
2125 					    SL_TRACE,
2126 					    "mld_input: we are only "
2127 					    "member src %s\n",
2128 					    inet_ntop(AF_INET6, &ip6h->ip6_src,
2129 					    buf1, sizeof (buf1)));
2130 				}
2131 				mutex_exit(&ill->ill_lock);
2132 				return (mp);
2133 			}
2134 		}
2135 		mutex_exit(&ill->ill_lock);
2136 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses);
2137 
2138 		v6group_ptr = &mldh->mld_addr;
2139 		if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) {
2140 			BUMP_MIB(ill->ill_icmp6_mib,
2141 			    ipv6IfIcmpInGroupMembBadReports);
2142 			freemsg(mp);
2143 			return (NULL);
2144 		}
2145 
2146 
2147 		/*
2148 		 * If we belong to the group being reported, and we are a
2149 		 * 'Delaying member' per the RFC terminology, stop our timer
2150 		 * for that group and 'clear flag' i.e. mark ilm_state as
2151 		 * IGMP_OTHERMEMBER. With zones, there can be multiple group
2152 		 * membership entries for the same group address (one per zone)
2153 		 * so we need to walk the ill_ilm list.
2154 		 */
2155 		rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2156 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2157 			if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr))
2158 				continue;
2159 			BUMP_MIB(ill->ill_icmp6_mib,
2160 			    ipv6IfIcmpInGroupMembOurReports);
2161 
2162 			ilm->ilm_timer = INFINITY;
2163 			ilm->ilm_state = IGMP_OTHERMEMBER;
2164 		}
2165 		rw_exit(&ill->ill_mcast_lock);
2166 		/*
2167 		 * No packets have been sent above - no
2168 		 * ill_mcast_send_queued is needed.
2169 		 */
2170 		ill_mcast_timer_start(ill->ill_ipst);
2171 		break;
2172 
2173 	case MLD_LISTENER_REDUCTION:
2174 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions);
2175 		break;
2176 	}
2177 	return (mp);
2178 }
2179 
2180 /*
2181  * Handles an MLDv1 Listener Query.  Returns 0 on error, or the appropriate
2182  * (non-zero, unsigned) timer value to be set on success.
2183  */
2184 static uint_t
2185 mld_query_in(mld_hdr_t *mldh, ill_t *ill)
2186 {
2187 	ilm_t	*ilm;
2188 	int	timer;
2189 	uint_t	next, current;
2190 	in6_addr_t *v6group;
2191 
2192 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2193 
2194 	/*
2195 	 * In the MLD specification, there are 3 states and a flag.
2196 	 *
2197 	 * In Non-Listener state, we simply don't have a membership record.
2198 	 * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY)
2199 	 * In Idle Member state, our timer is not running (ilm->ilm_timer ==
2200 	 * INFINITY)
2201 	 *
2202 	 * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if
2203 	 * we have heard a report from another member, or IGMP_IREPORTEDLAST
2204 	 * if I sent the last report.
2205 	 */
2206 	v6group = &mldh->mld_addr;
2207 	if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) &&
2208 	    ((!IN6_IS_ADDR_MULTICAST(v6group)))) {
2209 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries);
2210 		return (0);
2211 	}
2212 
2213 	/* Need to do compatibility mode checking */
2214 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2215 	ill->ill_mcast_v1_time = 0;
2216 	ill->ill_mcast_v1_tset = 1;
2217 	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
2218 		ip1dbg(("Received MLDv1 Query on %s, switching mode to "
2219 		    "MLD_V1_ROUTER\n", ill->ill_name));
2220 		atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1);
2221 		ill->ill_mcast_type = MLD_V1_ROUTER;
2222 	}
2223 
2224 	timer = (int)ntohs(mldh->mld_maxdelay);
2225 	if (ip_debug > 1) {
2226 		(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
2227 		    "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x",
2228 		    timer, (int)mldh->mld_type);
2229 	}
2230 
2231 	/*
2232 	 * -Start the timers in all of our membership records for
2233 	 * the physical interface on which the query arrived,
2234 	 * excl:
2235 	 *	1.  those that belong to the "all hosts" group,
2236 	 *	2.  those with 0 scope, or 1 node-local scope.
2237 	 *
2238 	 * -Restart any timer that is already running but has a value
2239 	 * longer that the requested timeout.
2240 	 * -Use the value specified in the query message as the
2241 	 * maximum timeout.
2242 	 */
2243 	next = INFINITY;
2244 
2245 	current = CURRENT_MSTIME;
2246 	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2247 		ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr));
2248 
2249 		if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2250 		    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2251 		    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr))
2252 			continue;
2253 		if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
2254 		    &ipv6_all_hosts_mcast)) &&
2255 		    (IN6_IS_ADDR_UNSPECIFIED(v6group)) ||
2256 		    (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) {
2257 			if (timer == 0) {
2258 				/* Respond immediately */
2259 				ilm->ilm_timer = INFINITY;
2260 				ilm->ilm_state = IGMP_IREPORTEDLAST;
2261 				mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
2262 				break;
2263 			}
2264 			if (ilm->ilm_timer > timer) {
2265 				MCAST_RANDOM_DELAY(ilm->ilm_timer, timer);
2266 				if (ilm->ilm_timer < next)
2267 					next = ilm->ilm_timer;
2268 				ilm->ilm_timer += current;
2269 			}
2270 			break;
2271 		}
2272 	}
2273 	rw_exit(&ill->ill_mcast_lock);
2274 	/* Send any deferred/queued IP packets */
2275 	ill_mcast_send_queued(ill);
2276 	ill_mcast_timer_start(ill->ill_ipst);
2277 
2278 	return (next);
2279 }
2280 
2281 /*
2282  * Handles an MLDv2 Listener Query.  On error, returns 0; on success,
2283  * returns the appropriate (non-zero, unsigned) timer value (which may
2284  * be INFINITY) to be set.
2285  */
2286 static uint_t
2287 mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen)
2288 {
2289 	ilm_t	*ilm;
2290 	in6_addr_t *v6group, *src_array;
2291 	uint_t	next, numsrc, i, mrd, delay, qqi, current;
2292 	uint8_t	qrv;
2293 
2294 	v6group = &mld2q->mld2q_addr;
2295 	numsrc = ntohs(mld2q->mld2q_numsrc);
2296 
2297 	/* make sure numsrc matches packet size */
2298 	if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) {
2299 		BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors);
2300 		return (0);
2301 	}
2302 	src_array = (in6_addr_t *)&mld2q[1];
2303 
2304 	BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries);
2305 
2306 	/* extract Maximum Response Delay from code in header */
2307 	mrd = ntohs(mld2q->mld2q_mxrc);
2308 	if (mrd >= MLD_V2_MAXRT_FPMIN) {
2309 		uint_t hdrval, mant, exp;
2310 		hdrval = mrd;
2311 		mant = hdrval & MLD_V2_MAXRT_MANT_MASK;
2312 		exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12;
2313 		mrd = (mant | 0x1000) << (exp + 3);
2314 	}
2315 	if (mrd == 0)
2316 		mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL);
2317 
2318 	MCAST_RANDOM_DELAY(delay, mrd);
2319 	next = (unsigned)INFINITY;
2320 	current = CURRENT_MSTIME;
2321 
2322 	if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0)
2323 		ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS;
2324 	else
2325 		ill->ill_mcast_rv = qrv;
2326 
2327 	if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) {
2328 		uint_t mant, exp;
2329 		mant = qqi & MLD_V2_QQI_MANT_MASK;
2330 		exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12;
2331 		qqi = (mant | 0x10) << (exp + 3);
2332 	}
2333 	ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi;
2334 
2335 	/*
2336 	 * If we have a pending general query response that's scheduled
2337 	 * sooner than the delay we calculated for this response, then
2338 	 * no action is required (MLDv2 draft section 6.2 rule 1)
2339 	 */
2340 	rw_enter(&ill->ill_mcast_lock, RW_WRITER);
2341 	if (ill->ill_global_timer < (current + delay)) {
2342 		rw_exit(&ill->ill_mcast_lock);
2343 		return (next);
2344 	}
2345 
2346 	/*
2347 	 * Now take action depending on query type: general,
2348 	 * group specific, or group/source specific.
2349 	 */
2350 	if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) {
2351 		/*
2352 		 * general query
2353 		 * We know global timer is either not running or is
2354 		 * greater than our calculated delay, so reset it to
2355 		 * our delay (random value in range [0, response time])
2356 		 */
2357 		ill->ill_global_timer = current + delay;
2358 		next = delay;
2359 	} else {
2360 		/* group or group/source specific query */
2361 		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
2362 			if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) ||
2363 			    IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) ||
2364 			    IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) ||
2365 			    !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))
2366 				continue;
2367 
2368 			/*
2369 			 * If the query is group specific or we have a
2370 			 * pending group specific query, the response is
2371 			 * group specific (pending sources list should be
2372 			 * empty).  Otherwise, need to update the pending
2373 			 * sources list for the group and source specific
2374 			 * response.
2375 			 */
2376 			if (numsrc == 0 || (ilm->ilm_timer < INFINITY &&
2377 			    SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) {
2378 group_query:
2379 				FREE_SLIST(ilm->ilm_pendsrcs);
2380 				ilm->ilm_pendsrcs = NULL;
2381 			} else {
2382 				boolean_t overflow;
2383 				slist_t *pktl;
2384 				if (numsrc > MAX_FILTER_SIZE ||
2385 				    (ilm->ilm_pendsrcs == NULL &&
2386 				    (ilm->ilm_pendsrcs = l_alloc()) == NULL)) {
2387 					/*
2388 					 * We've been sent more sources than
2389 					 * we can deal with; or we can't deal
2390 					 * with a source list at all. Revert
2391 					 * to a group specific query.
2392 					 */
2393 					goto group_query;
2394 				}
2395 				if ((pktl = l_alloc()) == NULL)
2396 					goto group_query;
2397 				pktl->sl_numsrc = numsrc;
2398 				for (i = 0; i < numsrc; i++)
2399 					pktl->sl_addr[i] = src_array[i];
2400 				l_union_in_a(ilm->ilm_pendsrcs, pktl,
2401 				    &overflow);
2402 				l_free(pktl);
2403 				if (overflow)
2404 					goto group_query;
2405 			}
2406 			ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ?
2407 			    INFINITY : (ilm->ilm_timer - current);
2408 			/* set timer to soonest value */
2409 			ilm->ilm_timer = MIN(ilm->ilm_timer, delay);
2410 			if (ilm->ilm_timer < next)
2411 				next = ilm->ilm_timer;
2412 			ilm->ilm_timer += current;
2413 			break;
2414 		}
2415 	}
2416 	rw_exit(&ill->ill_mcast_lock);
2417 	/*
2418 	 * No packets have been sent above - no
2419 	 * ill_mcast_send_queued is needed.
2420 	 */
2421 	ill_mcast_timer_start(ill->ill_ipst);
2422 
2423 	return (next);
2424 }
2425 
2426 /*
2427  * Send MLDv1 response packet with hoplimit 1
2428  */
2429 static void
2430 mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr)
2431 {
2432 	mblk_t		*mp;
2433 	mld_hdr_t	*mldh;
2434 	ip6_t 		*ip6h;
2435 	ip6_hbh_t	*ip6hbh;
2436 	struct ip6_opt_router	*ip6router;
2437 	size_t		size = IPV6_HDR_LEN + sizeof (mld_hdr_t);
2438 	ill_t		*ill = ilm->ilm_ill;
2439 
2440 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2441 
2442 	/*
2443 	 * We need to place a router alert option in this packet.  The length
2444 	 * of the options must be a multiple of 8.  The hbh option header is 2
2445 	 * bytes followed by the 4 byte router alert option.  That leaves
2446 	 * 2 bytes of pad for a total of 8 bytes.
2447 	 */
2448 	const int	router_alert_length = 8;
2449 
2450 	ASSERT(ill->ill_isv6);
2451 
2452 	size += router_alert_length;
2453 	mp = allocb(size, BPRI_HI);
2454 	if (mp == NULL)
2455 		return;
2456 	bzero(mp->b_rptr, size);
2457 	mp->b_wptr = mp->b_rptr + size;
2458 
2459 	ip6h = (ip6_t *)mp->b_rptr;
2460 	ip6hbh = (struct ip6_hbh *)&ip6h[1];
2461 	ip6router = (struct ip6_opt_router *)&ip6hbh[1];
2462 	/*
2463 	 * A zero is a pad option of length 1.  The bzero of the whole packet
2464 	 * above will pad between ip6router and mld.
2465 	 */
2466 	mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length);
2467 
2468 	mldh->mld_type = type;
2469 	mldh->mld_addr = ilm->ilm_v6addr;
2470 
2471 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2472 	ip6router->ip6or_len = 2;
2473 	ip6router->ip6or_value[0] = 0;
2474 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2475 
2476 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2477 	ip6hbh->ip6h_len = 0;
2478 
2479 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2480 	ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length);
2481 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2482 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2483 	if (v6addr == NULL)
2484 		ip6h->ip6_dst =  ilm->ilm_v6addr;
2485 	else
2486 		ip6h->ip6_dst = *v6addr;
2487 
2488 	ip6h->ip6_src = ipv6_all_zeros;
2489 	/*
2490 	 * Prepare for checksum by putting icmp length in the icmp
2491 	 * checksum field. The checksum is calculated in ip_output.
2492 	 */
2493 	mldh->mld_cksum = htons(sizeof (*mldh));
2494 
2495 	ill_mcast_queue(ill, mp);
2496 }
2497 
2498 /*
2499  * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill.  The
2500  * report will contain one multicast address record for each element of
2501  * reclist.  If this causes packet length to exceed ill->ill_mtu,
2502  * multiple reports are sent.  reclist is assumed to be made up of
2503  * buffers allocated by mcast_bldmrec(), and those buffers are freed here.
2504  */
2505 static void
2506 mldv2_sendrpt(ill_t *ill, mrec_t *reclist)
2507 {
2508 	mblk_t		*mp;
2509 	mld2r_t		*mld2r;
2510 	mld2mar_t	*mld2mar;
2511 	in6_addr_t	*srcarray;
2512 	ip6_t		*ip6h;
2513 	ip6_hbh_t	*ip6hbh;
2514 	struct ip6_opt_router	*ip6router;
2515 	size_t		size, optlen, padlen, icmpsize, rsize;
2516 	int		i, numrec, more_src_cnt;
2517 	mrec_t		*rp, *cur_reclist;
2518 	mrec_t		*next_reclist = reclist;
2519 	boolean_t	morepkts;
2520 
2521 	/* If there aren't any records, there's nothing to send */
2522 	if (reclist == NULL)
2523 		return;
2524 
2525 	ASSERT(ill->ill_isv6);
2526 	ASSERT(RW_LOCK_HELD(&ill->ill_mcast_lock));
2527 
2528 	/*
2529 	 * Total option length (optlen + padlen) must be a multiple of
2530 	 * 8 bytes.  We assume here that optlen <= 8, so the total option
2531 	 * length will be 8.  Assert this in case anything ever changes.
2532 	 */
2533 	optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router);
2534 	ASSERT(optlen <= 8);
2535 	padlen = 8 - optlen;
2536 nextpkt:
2537 	icmpsize = sizeof (mld2r_t);
2538 	size = IPV6_HDR_LEN + optlen + padlen + icmpsize;
2539 	morepkts = B_FALSE;
2540 	more_src_cnt = 0;
2541 	for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL;
2542 	    rp = rp->mrec_next, numrec++) {
2543 		rsize = sizeof (mld2mar_t) +
2544 		    (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t));
2545 		if (size + rsize > ill->ill_mtu) {
2546 			if (rp == cur_reclist) {
2547 				/*
2548 				 * If the first mrec we looked at is too big
2549 				 * to fit in a single packet (i.e the source
2550 				 * list is too big), we must either truncate
2551 				 * the list (if TO_EX or IS_EX), or send
2552 				 * multiple reports for the same group (all
2553 				 * other types).
2554 				 */
2555 				int srcspace, srcsperpkt;
2556 				srcspace = ill->ill_mtu -
2557 				    (size + sizeof (mld2mar_t));
2558 
2559 				/*
2560 				 * Skip if there's not even enough room in
2561 				 * a single packet to send something useful.
2562 				 */
2563 				if (srcspace <= sizeof (in6_addr_t))
2564 					continue;
2565 
2566 				srcsperpkt = srcspace / sizeof (in6_addr_t);
2567 				/*
2568 				 * Increment icmpsize and size, because we will
2569 				 * be sending a record for the mrec we're
2570 				 * looking at now.
2571 				 */
2572 				rsize = sizeof (mld2mar_t) +
2573 				    (srcsperpkt * sizeof (in6_addr_t));
2574 				icmpsize += rsize;
2575 				size += rsize;
2576 				if (rp->mrec_type == MODE_IS_EXCLUDE ||
2577 				    rp->mrec_type == CHANGE_TO_EXCLUDE) {
2578 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2579 					if (rp->mrec_next == NULL) {
2580 						/* no more packets to send */
2581 						break;
2582 					} else {
2583 						/*
2584 						 * more packets, but we're
2585 						 * done with this mrec.
2586 						 */
2587 						next_reclist = rp->mrec_next;
2588 					}
2589 				} else {
2590 					more_src_cnt = rp->mrec_srcs.sl_numsrc
2591 					    - srcsperpkt;
2592 					rp->mrec_srcs.sl_numsrc = srcsperpkt;
2593 					/*
2594 					 * We'll fix up this mrec (remove the
2595 					 * srcs we've already sent) before
2596 					 * returning to nextpkt above.
2597 					 */
2598 					next_reclist = rp;
2599 				}
2600 			} else {
2601 				next_reclist = rp;
2602 			}
2603 			morepkts = B_TRUE;
2604 			break;
2605 		}
2606 		icmpsize += rsize;
2607 		size += rsize;
2608 	}
2609 
2610 	mp = allocb(size, BPRI_HI);
2611 	if (mp == NULL)
2612 		goto free_reclist;
2613 	bzero(mp->b_rptr, size);
2614 	mp->b_wptr = mp->b_rptr + size;
2615 
2616 	ip6h = (ip6_t *)mp->b_rptr;
2617 	ip6hbh = (ip6_hbh_t *)&(ip6h[1]);
2618 	ip6router = (struct ip6_opt_router *)&(ip6hbh[1]);
2619 	mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen);
2620 	mld2mar = (mld2mar_t *)&(mld2r[1]);
2621 
2622 	ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW;
2623 	ip6h->ip6_plen = htons(optlen + padlen + icmpsize);
2624 	ip6h->ip6_nxt = IPPROTO_HOPOPTS;
2625 	ip6h->ip6_hops = MLD_HOP_LIMIT;
2626 	ip6h->ip6_dst = ipv6_all_v2rtrs_mcast;
2627 	ip6h->ip6_src = ipv6_all_zeros;
2628 
2629 	ip6hbh->ip6h_nxt = IPPROTO_ICMPV6;
2630 	/*
2631 	 * ip6h_len is the number of 8-byte words, not including the first
2632 	 * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0.
2633 	 */
2634 	ip6hbh->ip6h_len = 0;
2635 
2636 	ip6router->ip6or_type = IP6OPT_ROUTER_ALERT;
2637 	ip6router->ip6or_len = 2;
2638 	ip6router->ip6or_value[0] = 0;
2639 	ip6router->ip6or_value[1] = IP6_ALERT_MLD;
2640 
2641 	mld2r->mld2r_type = MLD_V2_LISTENER_REPORT;
2642 	mld2r->mld2r_nummar = htons(numrec);
2643 	/*
2644 	 * Prepare for the checksum by putting icmp length in the icmp
2645 	 * checksum field. The checksum is calculated in ip_output_simple.
2646 	 */
2647 	mld2r->mld2r_cksum = htons(icmpsize);
2648 
2649 	for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) {
2650 		mld2mar->mld2mar_type = rp->mrec_type;
2651 		mld2mar->mld2mar_auxlen = 0;
2652 		mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc);
2653 		mld2mar->mld2mar_group = rp->mrec_group;
2654 		srcarray = (in6_addr_t *)&(mld2mar[1]);
2655 
2656 		for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++)
2657 			srcarray[i] = rp->mrec_srcs.sl_addr[i];
2658 
2659 		mld2mar = (mld2mar_t *)&(srcarray[i]);
2660 	}
2661 
2662 	ill_mcast_queue(ill, mp);
2663 
2664 	if (morepkts) {
2665 		if (more_src_cnt > 0) {
2666 			int index, mvsize;
2667 			slist_t *sl = &next_reclist->mrec_srcs;
2668 			index = sl->sl_numsrc;
2669 			mvsize = more_src_cnt * sizeof (in6_addr_t);
2670 			(void) memmove(&sl->sl_addr[0], &sl->sl_addr[index],
2671 			    mvsize);
2672 			sl->sl_numsrc = more_src_cnt;
2673 		}
2674 		goto nextpkt;
2675 	}
2676 
2677 free_reclist:
2678 	while (reclist != NULL) {
2679 		rp = reclist->mrec_next;
2680 		mi_free(reclist);
2681 		reclist = rp;
2682 	}
2683 }
2684 
2685 static mrec_t *
2686 mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist,
2687     mrec_t *next)
2688 {
2689 	mrec_t *rp;
2690 	int i;
2691 
2692 	if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) &&
2693 	    SLIST_IS_EMPTY(srclist))
2694 		return (next);
2695 
2696 	rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI);
2697 	if (rp == NULL)
2698 		return (next);
2699 
2700 	rp->mrec_next = next;
2701 	rp->mrec_type = type;
2702 	rp->mrec_auxlen = 0;
2703 	rp->mrec_group = *grp;
2704 	if (srclist == NULL) {
2705 		rp->mrec_srcs.sl_numsrc = 0;
2706 	} else {
2707 		rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc;
2708 		for (i = 0; i < srclist->sl_numsrc; i++)
2709 			rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i];
2710 	}
2711 
2712 	return (rp);
2713 }
2714 
2715 /*
2716  * Set up initial retransmit state.  If memory cannot be allocated for
2717  * the source lists, simply create as much state as is possible; memory
2718  * allocation failures are considered one type of transient error that
2719  * the retransmissions are designed to overcome (and if they aren't
2720  * transient, there are bigger problems than failing to notify the
2721  * router about multicast group membership state changes).
2722  */
2723 static void
2724 mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
2725     slist_t *flist)
2726 {
2727 	/*
2728 	 * There are only three possibilities for rtype:
2729 	 *	New join, transition from INCLUDE {} to INCLUDE {flist}
2730 	 *	  => rtype is ALLOW_NEW_SOURCES
2731 	 *	New join, transition from INCLUDE {} to EXCLUDE {flist}
2732 	 *	  => rtype is CHANGE_TO_EXCLUDE
2733 	 *	State change that involves a filter mode change
2734 	 *	  => rtype is either CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
2735 	 */
2736 	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
2737 	    rtype == ALLOW_NEW_SOURCES);
2738 
2739 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2740 
2741 	switch (rtype) {
2742 	case CHANGE_TO_EXCLUDE:
2743 		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
2744 		CLEAR_SLIST(rtxp->rtx_allow);
2745 		COPY_SLIST(flist, rtxp->rtx_block);
2746 		break;
2747 	case ALLOW_NEW_SOURCES:
2748 	case CHANGE_TO_INCLUDE:
2749 		rtxp->rtx_fmode_cnt =
2750 		    rtype == ALLOW_NEW_SOURCES ? 0 : ill->ill_mcast_rv;
2751 		CLEAR_SLIST(rtxp->rtx_block);
2752 		COPY_SLIST(flist, rtxp->rtx_allow);
2753 		break;
2754 	}
2755 }
2756 
2757 /*
2758  * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
2759  * RFC 3376 section 5.1, covers three cases:
2760  *	* The current state change is a filter mode change
2761  *		Set filter mode retransmit counter; set retransmit allow or
2762  *		block list to new source list as appropriate, and clear the
2763  *		retransmit list that was not set; send TO_IN or TO_EX with
2764  *		new source list.
2765  *	* The current state change is a source list change, but the filter
2766  *	  mode retransmit counter is > 0
2767  *		Decrement filter mode retransmit counter; set retransmit
2768  *		allow or block list to  new source list as appropriate,
2769  *		and clear the retransmit list that was not set; send TO_IN
2770  *		or TO_EX with new source list.
2771  *	* The current state change is a source list change, and the filter
2772  *	  mode retransmit counter is 0.
2773  *		Merge existing rtx allow and block lists with new state:
2774  *		  rtx_allow = (new allow + rtx_allow) - new block
2775  *		  rtx_block = (new block + rtx_block) - new allow
2776  *		Send ALLOW and BLOCK records for new retransmit lists;
2777  *		decrement retransmit counter.
2778  *
2779  * As is the case for mcast_init_rtx(), memory allocation failures are
2780  * acceptable; we just create as much state as we can.
2781  */
2782 static mrec_t *
2783 mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
2784 {
2785 	ill_t *ill;
2786 	rtx_state_t *rtxp = &ilm->ilm_rtx;
2787 	mcast_record_t txtype;
2788 	mrec_t *rp, *rpnext, *rtnmrec;
2789 	boolean_t ovf;
2790 
2791 	ill = ilm->ilm_ill;
2792 
2793 	if (mreclist == NULL)
2794 		return (mreclist);
2795 
2796 	/*
2797 	 * A filter mode change is indicated by a single mrec, which is
2798 	 * either TO_IN or TO_EX.  In this case, we just need to set new
2799 	 * retransmit state as if this were an initial join.  There is
2800 	 * no change to the mrec list.
2801 	 */
2802 	if (mreclist->mrec_type == CHANGE_TO_INCLUDE ||
2803 	    mreclist->mrec_type == CHANGE_TO_EXCLUDE) {
2804 		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
2805 		    &mreclist->mrec_srcs);
2806 		return (mreclist);
2807 	}
2808 
2809 	/*
2810 	 * Only the source list has changed
2811 	 */
2812 	rtxp->rtx_cnt = ill->ill_mcast_rv;
2813 	if (rtxp->rtx_fmode_cnt > 0) {
2814 		/* but we're still sending filter mode change reports */
2815 		rtxp->rtx_fmode_cnt--;
2816 		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
2817 			CLEAR_SLIST(rtxp->rtx_block);
2818 			COPY_SLIST(flist, rtxp->rtx_allow);
2819 			txtype = CHANGE_TO_INCLUDE;
2820 		} else {
2821 			CLEAR_SLIST(rtxp->rtx_allow);
2822 			COPY_SLIST(flist, rtxp->rtx_block);
2823 			txtype = CHANGE_TO_EXCLUDE;
2824 		}
2825 		/* overwrite first mrec with new info */
2826 		mreclist->mrec_type = txtype;
2827 		l_copy(flist, &mreclist->mrec_srcs);
2828 		/* then free any remaining mrecs */
2829 		for (rp = mreclist->mrec_next; rp != NULL; rp = rpnext) {
2830 			rpnext = rp->mrec_next;
2831 			mi_free(rp);
2832 		}
2833 		mreclist->mrec_next = NULL;
2834 		rtnmrec = mreclist;
2835 	} else {
2836 		mrec_t *allow_mrec, *block_mrec;
2837 		/*
2838 		 * Just send the source change reports; but we need to
2839 		 * recalculate the ALLOW and BLOCK lists based on previous
2840 		 * state and new changes.
2841 		 */
2842 		rtnmrec = mreclist;
2843 		allow_mrec = block_mrec = NULL;
2844 		for (rp = mreclist; rp != NULL; rp = rp->mrec_next) {
2845 			ASSERT(rp->mrec_type == ALLOW_NEW_SOURCES ||
2846 			    rp->mrec_type == BLOCK_OLD_SOURCES);
2847 			if (rp->mrec_type == ALLOW_NEW_SOURCES)
2848 				allow_mrec = rp;
2849 			else
2850 				block_mrec = rp;
2851 		}
2852 		/*
2853 		 * Perform calculations:
2854 		 *   new_allow = mrec_allow + (rtx_allow - mrec_block)
2855 		 *   new_block = mrec_block + (rtx_block - mrec_allow)
2856 		 *
2857 		 * Each calc requires two steps, for example:
2858 		 *   rtx_allow = rtx_allow - mrec_block;
2859 		 *   new_allow = mrec_allow + rtx_allow;
2860 		 *
2861 		 * Store results in mrec lists, and then copy into rtx lists.
2862 		 * We do it in this order in case the rtx list hasn't been
2863 		 * alloc'd yet; if it hasn't and our alloc fails, that's okay,
2864 		 * Overflows are also okay.
2865 		 */
2866 		if (block_mrec != NULL) {
2867 			l_difference_in_a(rtxp->rtx_allow,
2868 			    &block_mrec->mrec_srcs);
2869 		}
2870 		if (allow_mrec != NULL) {
2871 			l_difference_in_a(rtxp->rtx_block,
2872 			    &allow_mrec->mrec_srcs);
2873 			l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow,
2874 			    &ovf);
2875 		}
2876 		if (block_mrec != NULL) {
2877 			l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block,
2878 			    &ovf);
2879 			COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
2880 		} else {
2881 			rtnmrec = mcast_bldmrec(BLOCK_OLD_SOURCES,
2882 			    &ilm->ilm_v6addr, rtxp->rtx_block, allow_mrec);
2883 		}
2884 		if (allow_mrec != NULL) {
2885 			COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
2886 		} else {
2887 			rtnmrec = mcast_bldmrec(ALLOW_NEW_SOURCES,
2888 			    &ilm->ilm_v6addr, rtxp->rtx_allow, block_mrec);
2889 		}
2890 	}
2891 
2892 	return (rtnmrec);
2893 }
2894