/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1990 Mentat Inc. */ /* * Internet Group Management Protocol (IGMP) routines. * Multicast Listener Discovery Protocol (MLD) routines. * * Written by Steve Deering, Stanford, May 1988. * Modified by Rosen Sharma, Stanford, Aug 1994. * Modified by Bill Fenner, Xerox PARC, Feb. 1995. 
* * MULTICAST 3.5.1.1 */ #include <sys/types.h> #include <sys/stream.h> #include <sys/stropts.h> #include <sys/strlog.h> #include <sys/strsun.h> #include <sys/systm.h> #include <sys/ddi.h> #include <sys/sunddi.h> #include <sys/cmn_err.h> #include <sys/atomic.h> #include <sys/zone.h> #include <sys/param.h> #include <sys/socket.h> #include <inet/ipclassifier.h> #include <net/if.h> #include <net/route.h> #include <netinet/in.h> #include <netinet/igmp_var.h> #include <netinet/ip6.h> #include <netinet/icmp6.h> #include <inet/common.h> #include <inet/mi.h> #include <inet/nd.h> #include <inet/ip.h> #include <inet/ip6.h> #include <inet/ip_multi.h> #include <inet/ip_listutils.h> #include <netinet/igmp.h> #include <inet/ip_if.h> #include <net/pfkeyv2.h> #include <inet/ipsec_info.h> static uint_t igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill); static uint_t igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen); static uint_t mld_query_in(mld_hdr_t *mldh, ill_t *ill); static uint_t mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen); static void igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr); static void mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr); static void igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist); static void mldv2_sendrpt(ill_t *ill, mrec_t *reclist); static mrec_t *mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist, mrec_t *next); static void mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype, slist_t *flist); static mrec_t *mcast_merge_rtx(ilm_t *ilm, mrec_t *rp, slist_t *flist); /* * Macros used to do timer len conversions. Timer values are always * stored and passed to the timer functions as milliseconds; but the * default values and values from the wire may not be. * * And yes, it's obscure, but decisecond is easier to abbreviate than * "tenths of a second". 
*/
#define	DSEC_TO_MSEC(dsec)	((dsec) * 100)
#define	SEC_TO_MSEC(sec)	((sec) * 1000)

/*
 * A running timer (scheduled thru timeout) can be cancelled if another
 * timer with a shorter timeout value is scheduled before it has timed
 * out.  When the shorter timer expires, the original timer is updated
 * to account for the time elapsed while the shorter timer ran; but this
 * does not take into account the amount of time already spent in timeout
 * state before being preempted by the shorter timer, that is the time
 * interval between time scheduled to time cancelled.  This can cause
 * delays in sending out multicast membership reports.  To resolve this
 * problem, wallclock time (absolute time) is used instead of deltas
 * (relative time) to track timers.
 *
 * The MACRO below gets the lbolt value, used for proper timer scheduling
 * and firing. Therefore multicast membership reports are sent on time.
 * The timer does not fire exactly at the time it was scheduled to fire;
 * a difference of a few milliseconds is observed. An offset is used
 * to take care of the difference.
 */
#define	CURRENT_MSTIME	((uint_t)TICK_TO_MSEC(ddi_get_lbolt()))
#define	CURRENT_OFFSET	(999)

/*
 * igmp_start_timers:
 * The first multicast join will trigger the igmp timers / mld timers
 * The unit for next is milliseconds.
 *
 * Arms (or re-arms) the per-stack IGMP timeout so that it fires no later
 * than 'next' ms from now.  'next' must be neither 0 nor INFINITY (asserted).
 * All state touched here lives in *ipst and is protected by
 * ips_igmp_timer_lock; ips_igmp_timer_setter_active ensures that only one
 * thread at a time is in the middle of (re)scheduling the timeout.
 */
void
igmp_start_timers(unsigned next, ip_stack_t *ipst)
{
	int	time_left;	/* ticks until the pending timeout fires */
	int	ret;		/* result of untimeout() */

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&ipst->ips_igmp_timer_lock);

	if (ipst->ips_igmp_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		ipst->ips_igmp_time_to_next =
		    MIN(ipst->ips_igmp_time_to_next, next);
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	} else {
		ipst->ips_igmp_timer_setter_active = B_TRUE;
	}
	if (ipst->ips_igmp_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		ipst->ips_igmp_time_to_next = next;
		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
		/* Remember when we armed it so time_left can be derived. */
		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
		ipst->ips_igmp_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'igmp_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = ipst->ips_igmp_timer_scheduled_last +
	    MSEC_TO_TICK(ipst->ips_igmp_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* The pending timeout already fires soon enough. */
		ipst->ips_igmp_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_igmp_timer_lock);
		return;
	}

	/*
	 * The timer lock is dropped around untimeout().  setter_active is
	 * still B_TRUE, so concurrent callers only update
	 * ips_igmp_time_to_next and return.
	 */
	mutex_exit(&ipst->ips_igmp_timer_lock);
	ret = untimeout(ipst->ips_igmp_timeout_id);
	mutex_enter(&ipst->ips_igmp_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/*
		 * Nothing was cancelled; the id is expected to have been
		 * cleared already (presumably by the handler that ran to
		 * completion -- confirm against igmp_timeout_handler).
		 */
		ASSERT(ipst->ips_igmp_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_igmp_timeout_id != 0);
		ipst->ips_igmp_timeout_id = 0;
	}
	if (ipst->ips_igmp_time_to_next != 0) {
		/* Fold in any value recorded by setters that raced with us. */
		ipst->ips_igmp_time_to_next =
		    MIN(ipst->ips_igmp_time_to_next, next);
		ipst->ips_igmp_timeout_id = timeout(igmp_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_igmp_time_to_next));
		ipst->ips_igmp_timer_scheduled_last = ddi_get_lbolt();
	}
	ipst->ips_igmp_timer_setter_active = B_FALSE;
	mutex_exit(&ipst->ips_igmp_timer_lock);
}

/*
 * mld_start_timers:
 * The unit for next is milliseconds.
*/
/*
 * Arms (or re-arms) the per-stack MLD timeout so that it fires no later
 * than 'next' ms from now.  'next' must be neither 0 nor INFINITY (asserted).
 * State is protected by ips_mld_timer_lock, and
 * ips_mld_timer_setter_active serializes the threads that (re)schedule
 * the timeout.
 */
void
mld_start_timers(unsigned next, ip_stack_t *ipst)
{
	int	time_left;	/* ticks until the pending timeout fires */
	int	ret;		/* result of untimeout() */

	ASSERT(next != 0 && next != INFINITY);

	mutex_enter(&ipst->ips_mld_timer_lock);
	if (ipst->ips_mld_timer_setter_active) {
		/*
		 * Serialize timer setters, one at a time. If the
		 * timer is currently being set by someone,
		 * just record the next time when it has to be
		 * invoked and return. The current setter will
		 * take care.
		 */
		ipst->ips_mld_time_to_next =
		    MIN(ipst->ips_mld_time_to_next, next);
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	} else {
		ipst->ips_mld_timer_setter_active = B_TRUE;
	}
	if (ipst->ips_mld_timeout_id == 0) {
		/*
		 * The timer is inactive. We need to start a timer
		 */
		ipst->ips_mld_time_to_next = next;
		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
		/* Remember when we armed it so time_left can be derived. */
		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
		ipst->ips_mld_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	}

	/*
	 * The timer was scheduled sometime back for firing in
	 * 'mld_time_to_next' ms and is active. We need to
	 * reschedule the timeout if the new 'next' will happen
	 * earlier than the currently scheduled timeout
	 */
	time_left = ipst->ips_mld_timer_scheduled_last +
	    MSEC_TO_TICK(ipst->ips_mld_time_to_next) - ddi_get_lbolt();
	if (time_left < MSEC_TO_TICK(next)) {
		/* The pending timeout already fires soon enough. */
		ipst->ips_mld_timer_setter_active = B_FALSE;
		mutex_exit(&ipst->ips_mld_timer_lock);
		return;
	}

	/*
	 * The timer lock is dropped around untimeout().  setter_active is
	 * still B_TRUE, so concurrent callers only update
	 * ips_mld_time_to_next and return.
	 */
	mutex_exit(&ipst->ips_mld_timer_lock);
	ret = untimeout(ipst->ips_mld_timeout_id);
	mutex_enter(&ipst->ips_mld_timer_lock);
	/*
	 * The timeout was cancelled, or the timeout handler
	 * completed, while we were blocked in the untimeout.
	 * No other thread could have set the timer meanwhile
	 * since we serialized all the timer setters. Thus
	 * no timer is currently active nor executing nor will
	 * any timer fire in the future. We start the timer now
	 * if needed.
	 */
	if (ret == -1) {
		/*
		 * Nothing was cancelled; the id is expected to have been
		 * cleared already (presumably by the handler that ran to
		 * completion -- confirm against mld_timeout_handler).
		 */
		ASSERT(ipst->ips_mld_timeout_id == 0);
	} else {
		ASSERT(ipst->ips_mld_timeout_id != 0);
		ipst->ips_mld_timeout_id = 0;
	}
	if (ipst->ips_mld_time_to_next != 0) {
		/* Fold in any value recorded by setters that raced with us. */
		ipst->ips_mld_time_to_next =
		    MIN(ipst->ips_mld_time_to_next, next);
		ipst->ips_mld_timeout_id = timeout(mld_timeout_handler,
		    (void *)ipst, MSEC_TO_TICK(ipst->ips_mld_time_to_next));
		ipst->ips_mld_timer_scheduled_last = ddi_get_lbolt();
	}
	ipst->ips_mld_timer_setter_active = B_FALSE;
	mutex_exit(&ipst->ips_mld_timer_lock);
}

/*
 * igmp_input:
 * Return NULL for a bad packet that is discarded here.
 * Return mp if the message is OK and should be handed to "raw" receivers.
 * Callers of igmp_input() may need to reinitialize variables that were copied
 * from the mblk as this calls pullupmsg().
 */
/* ARGSUSED */
mblk_t *
igmp_input(queue_t *q, mblk_t *mp, ill_t *ill)
{
	igmpa_t 	*igmpa;
	ipha_t		*ipha = (ipha_t *)(mp->b_rptr);
	int		iphlen, igmplen, mblklen;
	ilm_t 		*ilm;
	uint32_t	src, dst;
	uint32_t 	group;
	uint_t		next;
	ipif_t 		*ipif;
	ip_stack_t	 *ipst;

	ASSERT(ill != NULL);
	ASSERT(!ill->ill_isv6);
	ipst = ill->ill_ipst;
	++ipst->ips_igmpstat.igps_rcv_total;

	/*
	 * The first mblk must hold at least one byte (so the header length
	 * can be read) and then the whole IP header.
	 */
	mblklen = MBLKL(mp);
	if (mblklen < 1 || mblklen < (iphlen = IPH_HDR_LENGTH(ipha))) {
		++ipst->ips_igmpstat.igps_rcv_tooshort;
		goto bad_pkt;
	}
	/* IGMP payload length as claimed by the IP header. */
	igmplen = ntohs(ipha->ipha_length) - iphlen;
	/*
	 * Since msg sizes are more variable with v3, just pullup the
	 * whole thing now.
*/ if (MBLKL(mp) < (igmplen + iphlen)) { mblk_t *mp1; if ((mp1 = msgpullup(mp, -1)) == NULL) { ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } freemsg(mp); mp = mp1; ipha = (ipha_t *)(mp->b_rptr); } /* * Validate lengths */ if (igmplen < IGMP_MINLEN) { ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } /* * Validate checksum */ if (IP_CSUM(mp, iphlen, 0)) { ++ipst->ips_igmpstat.igps_rcv_badsum; goto bad_pkt; } igmpa = (igmpa_t *)(&mp->b_rptr[iphlen]); src = ipha->ipha_src; dst = ipha->ipha_dst; if (ip_debug > 1) (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "igmp_input: src 0x%x, dst 0x%x on %s\n", (int)ntohl(src), (int)ntohl(dst), ill->ill_name); switch (igmpa->igmpa_type) { case IGMP_MEMBERSHIP_QUERY: /* * packet length differentiates between v1/v2 and v3 * v1/v2 should be exactly 8 octets long; v3 is >= 12 */ if ((igmplen == IGMP_MINLEN) || (ipst->ips_igmp_max_version <= IGMP_V2_ROUTER)) { next = igmp_query_in(ipha, igmpa, ill); } else if (igmplen >= IGMP_V3_QUERY_MINLEN) { next = igmpv3_query_in((igmp3qa_t *)igmpa, ill, igmplen); } else { ++ipst->ips_igmpstat.igps_rcv_tooshort; goto bad_pkt; } if (next == 0) goto bad_pkt; if (next != INFINITY) igmp_start_timers(next, ipst); break; case IGMP_V1_MEMBERSHIP_REPORT: case IGMP_V2_MEMBERSHIP_REPORT: /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * generated by us are looped back since we could potentially * be a multicast router, so discard reports sourced by me. 
*/ mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (ipif->ipif_lcl_addr == src) { if (ip_debug > 1) { (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "igmp_input: we are only " "member src 0x%x ipif_local 0x%x", (int)ntohl(src), (int) ntohl(ipif->ipif_lcl_addr)); } mutex_exit(&ill->ill_lock); return (mp); } } mutex_exit(&ill->ill_lock); ++ipst->ips_igmpstat.igps_rcv_reports; group = igmpa->igmpa_group; if (!CLASSD(group)) { ++ipst->ips_igmpstat.igps_rcv_badreports; goto bad_pkt; } /* * KLUDGE: if the IP source address of the report has an * unspecified (i.e., zero) subnet number, as is allowed for * a booting host, replace it with the correct subnet number * so that a process-level multicast routing demon can * determine which subnet it arrived from. This is necessary * to compensate for the lack of any way for a process to * determine the arrival interface of an incoming packet. * * Requires that a copy of *this* message it passed up * to the raw interface which is done by our caller. */ if ((src & htonl(0xFF000000U)) == 0) { /* Minimum net mask */ /* Pick the first ipif on this ill */ mutex_enter(&ill->ill_lock); src = ill->ill_ipif->ipif_subnet; mutex_exit(&ill->ill_lock); ip1dbg(("igmp_input: changed src to 0x%x\n", (int)ntohl(src))); ipha->ipha_src = src; } /* * If we belong to the group being reported, and * we are a 'Delaying member' in the RFC terminology, * stop our timer for that group and 'clear flag' i.e. * mark as IGMP_OTHERMEMBER. Do this for all logical * interfaces on the given physical interface. 
*/ mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { ilm = ilm_lookup_ipif(ipif, group); if (ilm != NULL) { ++ipst->ips_igmpstat.igps_rcv_ourreports; ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } } /* for */ mutex_exit(&ill->ill_lock); break; case IGMP_V3_MEMBERSHIP_REPORT: /* * Currently nothing to do here; IGMP router is not * implemented in ip, and v3 hosts don't pay attention * to membership reports. */ break; } /* * Pass all valid IGMP packets up to any process(es) listening * on a raw IGMP socket. Do not free the packet. */ return (mp); bad_pkt: freemsg(mp); return (NULL); } static uint_t igmp_query_in(ipha_t *ipha, igmpa_t *igmpa, ill_t *ill) { ilm_t *ilm; int timer; uint_t next, current; ip_stack_t *ipst; ipst = ill->ill_ipst; ++ipst->ips_igmpstat.igps_rcv_queries; /* * In the IGMPv2 specification, there are 3 states and a flag. * * In Non-Member state, we simply don't have a membership record. * In Delaying Member state, our timer is running (ilm->ilm_timer * < INFINITY). In Idle Member state, our timer is not running * (ilm->ilm_timer == INFINITY). * * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if * we have heard a report from another member, or IGMP_IREPORTEDLAST * if I sent the last report. */ if ((igmpa->igmpa_code == 0) || (ipst->ips_igmp_max_version == IGMP_V1_ROUTER)) { /* * Query from an old router. * Remember that the querier on this interface is old, * and set the timer to the value in RFC 1112. 
*/ mutex_enter(&ill->ill_lock); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type != IGMP_V1_ROUTER) { ip1dbg(("Received IGMPv1 Query on %s, switching mode " "to IGMP_V1_ROUTER\n", ill->ill_name)); atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = IGMP_V1_ROUTER; } mutex_exit(&ill->ill_lock); timer = SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY); if (ipha->ipha_dst != htonl(INADDR_ALLHOSTS_GROUP) || igmpa->igmpa_group != 0) { ++ipst->ips_igmpstat.igps_rcv_badqueries; return (0); } } else { in_addr_t group; /* * Query from a new router * Simply do a validity check */ group = igmpa->igmpa_group; if (group != 0 && (!CLASSD(group))) { ++ipst->ips_igmpstat.igps_rcv_badqueries; return (0); } /* * Switch interface state to v2 on receipt of a v2 query * ONLY IF current state is v3. Let things be if current * state if v1 but do reset the v2-querier-present timer. */ mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type == IGMP_V3_ROUTER) { ip1dbg(("Received IGMPv2 Query on %s, switching mode " "to IGMP_V2_ROUTER", ill->ill_name)); atomic_add_16(&ill->ill_ifptr->illif_mcast_v2, 1); ill->ill_mcast_type = IGMP_V2_ROUTER; } ill->ill_mcast_v2_time = 0; ill->ill_mcast_v2_tset = 1; mutex_exit(&ill->ill_lock); timer = DSEC_TO_MSEC((int)igmpa->igmpa_code); } if (ip_debug > 1) { mutex_enter(&ill->ill_lock); (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "igmp_input: TIMER = igmp_code %d igmp_type 0x%x", (int)ntohs(igmpa->igmpa_code), (int)ntohs(igmpa->igmpa_type)); mutex_exit(&ill->ill_lock); } /* * -Start the timers in all of our membership records * for the physical interface on which the query * arrived, excluding those that belong to the "all * hosts" group (224.0.0.1). * * -Restart any timer that is already running but has * a value longer than the requested timeout. * * -Use the value specified in the query message as * the maximum timeout. 
*/ next = (unsigned)INFINITY; mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { /* * A multicast router joins INADDR_ANY address * to enable promiscuous reception of all * mcasts from the interface. This INADDR_ANY * is stored in the ilm_v6addr as V6 unspec addr */ if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)) continue; if (ilm->ilm_addr == htonl(INADDR_ANY)) continue; if (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP) && (igmpa->igmpa_group == 0) || (igmpa->igmpa_group == ilm->ilm_addr)) { if (ilm->ilm_timer > timer) { MCAST_RANDOM_DELAY(ilm->ilm_timer, timer); if (ilm->ilm_timer < next) next = ilm->ilm_timer; ilm->ilm_timer += current; } } } mutex_exit(&ill->ill_lock); return (next); } static uint_t igmpv3_query_in(igmp3qa_t *igmp3qa, ill_t *ill, int igmplen) { uint_t i, next, mrd, qqi, timer, delay, numsrc; uint_t current; ilm_t *ilm; ipaddr_t *src_array; uint8_t qrv; ip_stack_t *ipst; ipst = ill->ill_ipst; /* make sure numsrc matches packet size */ numsrc = ntohs(igmp3qa->igmp3qa_numsrc); if (igmplen < IGMP_V3_QUERY_MINLEN + (numsrc * sizeof (ipaddr_t))) { ++ipst->ips_igmpstat.igps_rcv_tooshort; return (0); } src_array = (ipaddr_t *)&igmp3qa[1]; ++ipst->ips_igmpstat.igps_rcv_queries; if ((mrd = (uint_t)igmp3qa->igmp3qa_mxrc) >= IGMP_V3_MAXRT_FPMIN) { uint_t hdrval, mant, exp; hdrval = (uint_t)igmp3qa->igmp3qa_mxrc; mant = hdrval & IGMP_V3_MAXRT_MANT_MASK; exp = (hdrval & IGMP_V3_MAXRT_EXP_MASK) >> 4; mrd = (mant | 0x10) << (exp + 3); } if (mrd == 0) mrd = MCAST_DEF_QUERY_RESP_INTERVAL; timer = DSEC_TO_MSEC(mrd); MCAST_RANDOM_DELAY(delay, timer); next = (unsigned)INFINITY; current = CURRENT_MSTIME; if ((qrv = igmp3qa->igmp3qa_sqrv & IGMP_V3_RV_MASK) == 0) ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; else ill->ill_mcast_rv = qrv; if ((qqi = (uint_t)igmp3qa->igmp3qa_qqic) >= IGMP_V3_QQI_FPMIN) { uint_t hdrval, mant, exp; hdrval = (uint_t)igmp3qa->igmp3qa_qqic; mant = hdrval & IGMP_V3_QQI_MANT_MASK; exp = (hdrval 
& IGMP_V3_QQI_EXP_MASK) >> 4; qqi = (mant | 0x10) << (exp + 3); } ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi; /* * If we have a pending general query response that's scheduled * sooner than the delay we calculated for this response, then * no action is required (RFC3376 section 5.2 rule 1) */ mutex_enter(&ill->ill_lock); if (ill->ill_global_timer < (current + delay)) { mutex_exit(&ill->ill_lock); return (next); } mutex_exit(&ill->ill_lock); /* * Now take action depending upon query type: * general, group specific, or group/source specific. */ if ((numsrc == 0) && (igmp3qa->igmp3qa_group == INADDR_ANY)) { /* * general query * We know global timer is either not running or is * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]). */ mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; mutex_exit(&ill->ill_lock); next = delay; } else { /* group or group/source specific query */ mutex_enter(&ill->ill_lock); for (ilm = ill->ill_ilm; ilm; ilm = ilm->ilm_next) { if (!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr) || (ilm->ilm_addr == htonl(INADDR_ANY)) || (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) || (igmp3qa->igmp3qa_group != ilm->ilm_addr)) continue; /* * If the query is group specific or we have a * pending group specific query, the response is * group specific (pending sources list should be * empty). Otherwise, need to update the pending * sources list for the group and source specific * response. */ if (numsrc == 0 || (ilm->ilm_timer < INFINITY && SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) { group_query: FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } else { boolean_t overflow; slist_t *pktl; if (numsrc > MAX_FILTER_SIZE || (ilm->ilm_pendsrcs == NULL && (ilm->ilm_pendsrcs = l_alloc()) == NULL)) { /* * We've been sent more sources than * we can deal with; or we can't deal * with a source list at all. Revert * to a group specific query. 
*/ goto group_query; } if ((pktl = l_alloc()) == NULL) goto group_query; pktl->sl_numsrc = numsrc; for (i = 0; i < numsrc; i++) IN6_IPADDR_TO_V4MAPPED(src_array[i], &(pktl->sl_addr[i])); l_union_in_a(ilm->ilm_pendsrcs, pktl, &overflow); l_free(pktl); if (overflow) goto group_query; } ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ? INFINITY : (ilm->ilm_timer - current); /* choose soonest timer */ ilm->ilm_timer = MIN(ilm->ilm_timer, delay); if (ilm->ilm_timer < next) next = ilm->ilm_timer; ilm->ilm_timer += current; } mutex_exit(&ill->ill_lock); } return (next); } void igmp_joingroup(ilm_t *ilm) { uint_t timer; ill_t *ill; ip_stack_t *ipst = ilm->ilm_ipst; ill = ilm->ilm_ipif->ipif_ill; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(ilm->ilm_ill == NULL && !ilm->ilm_ipif->ipif_isv6); mutex_enter(&ill->ill_lock); if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; mutex_exit(&ill->ill_lock); } else { ip1dbg(("Querier mode %d, sending report, group %x\n", ill->ill_mcast_type, htonl(ilm->ilm_addr))); if (ill->ill_mcast_type == IGMP_V1_ROUTER) { mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0); mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V2_ROUTER) { mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0); mutex_enter(&ill->ill_lock); } else if (ill->ill_mcast_type == IGMP_V3_ROUTER) { mrec_t *rp; mcast_record_t rtype; /* * The possible state changes we need to handle here: * Old State New State Report * * INCLUDE(0) INCLUDE(X) ALLOW(X),BLOCK(0) * INCLUDE(0) EXCLUDE(X) TO_EX(X) * * No need to send the BLOCK(0) report; ALLOW(X) * is enough. */ rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ? ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); mutex_exit(&ill->ill_lock); igmpv3_sendrpt(ilm->ilm_ipif, rp); mutex_enter(&ill->ill_lock); /* * Set up retransmission state. 
Timer is set below, * for both v3 and older versions. */ mcast_init_rtx(ill, &ilm->ilm_rtx, rtype, ilm->ilm_filter); } /* Set the ilm timer value */ MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer, SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY)); timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; mutex_exit(&ill->ill_lock); /* * To avoid deadlock, we defer igmp_start_timers() to * ipsq_exit(). See the comment in ipsq_exit() for details. */ mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(timer, ipst->ips_igmp_deferred_next); mutex_exit(&ipst->ips_igmp_timer_lock); } if (ip_debug > 1) { (void) mi_strlog(ilm->ilm_ipif->ipif_ill->ill_rq, 1, SL_TRACE, "igmp_joingroup: multicast_type %d timer %d", (ilm->ilm_ipif->ipif_ill->ill_mcast_type), (int)ntohl(timer)); } } void mld_joingroup(ilm_t *ilm) { uint_t timer; ill_t *ill; ip_stack_t *ipst = ilm->ilm_ipst; ill = ilm->ilm_ill; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(ilm->ilm_ipif == NULL && ill->ill_isv6); mutex_enter(&ill->ill_lock); if (IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr)) { ilm->ilm_rtx.rtx_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; mutex_exit(&ill->ill_lock); } else { if (ill->ill_mcast_type == MLD_V1_ROUTER) { mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); mutex_enter(&ill->ill_lock); } else { mrec_t *rp; mcast_record_t rtype; /* * The possible state changes we need to handle here: * Old State New State Report * * INCLUDE(0) INCLUDE(X) ALLOW(X),BLOCK(0) * INCLUDE(0) EXCLUDE(X) TO_EX(X) * * No need to send the BLOCK(0) report; ALLOW(X) * is enough */ rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ? ALLOW_NEW_SOURCES : CHANGE_TO_EXCLUDE; rp = mcast_bldmrec(rtype, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); mutex_enter(&ill->ill_lock); /* * Set up retransmission state. Timer is set below, * for both v2 and v1. 
*/ mcast_init_rtx(ill, &ilm->ilm_rtx, rtype, ilm->ilm_filter); } /* Set the ilm timer value */ ASSERT(ill->ill_mcast_type != MLD_V2_ROUTER || ilm->ilm_rtx.rtx_cnt > 0); MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer, SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY)); timer = ilm->ilm_rtx.rtx_timer; ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; ilm->ilm_state = IGMP_IREPORTEDLAST; mutex_exit(&ill->ill_lock); /* * To avoid deadlock, we defer mld_start_timers() to * ipsq_exit(). See the comment in ipsq_exit() for details. */ mutex_enter(&ipst->ips_mld_timer_lock); ipst->ips_mld_deferred_next = MIN(timer, ipst->ips_mld_deferred_next); mutex_exit(&ipst->ips_mld_timer_lock); } if (ip_debug > 1) { (void) mi_strlog(ilm->ilm_ill->ill_rq, 1, SL_TRACE, "mld_joingroup: multicast_type %d timer %d", (ilm->ilm_ill->ill_mcast_type), (int)ntohl(timer)); } } void igmp_leavegroup(ilm_t *ilm) { ill_t *ill = ilm->ilm_ipif->ipif_ill; ASSERT(ilm->ilm_ill == NULL); ASSERT(!ill->ill_isv6); mutex_enter(&ill->ill_lock); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == IGMP_V2_ROUTER && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { mutex_exit(&ill->ill_lock); igmp_sendpkt(ilm, IGMP_V2_LEAVE_GROUP, (htonl(INADDR_ALLRTRS_GROUP))); return; } else if ((ill->ill_mcast_type == IGMP_V3_ROUTER) && (ilm->ilm_addr != htonl(INADDR_ALLHOSTS_GROUP))) { mrec_t *rp; /* * The possible state changes we need to handle here: * Old State New State Report * * INCLUDE(X) INCLUDE(0) ALLOW(0),BLOCK(X) * EXCLUDE(X) INCLUDE(0) TO_IN(0) * * No need to send the ALLOW(0) report; BLOCK(X) is enough */ if (ilm->ilm_fmode == MODE_IS_INCLUDE) { rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); } else { rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } mutex_exit(&ill->ill_lock); igmpv3_sendrpt(ilm->ilm_ipif, rp); return; } mutex_exit(&ill->ill_lock); } void mld_leavegroup(ilm_t *ilm) { ill_t *ill = ilm->ilm_ill; ASSERT(ilm->ilm_ipif == NULL); ASSERT(ill->ill_isv6); 
mutex_enter(&ill->ill_lock); if (ilm->ilm_state == IGMP_IREPORTEDLAST && ill->ill_mcast_type == MLD_V1_ROUTER && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REDUCTION, &ipv6_all_rtrs_mcast); return; } else if ((ill->ill_mcast_type == MLD_V2_ROUTER) && (!IN6_ARE_ADDR_EQUAL(&ipv6_all_hosts_mcast, &ilm->ilm_v6addr))) { mrec_t *rp; /* * The possible state changes we need to handle here: * Old State New State Report * * INCLUDE(X) INCLUDE(0) ALLOW(0),BLOCK(X) * EXCLUDE(X) INCLUDE(0) TO_IN(0) * * No need to send the ALLOW(0) report; BLOCK(X) is enough */ if (ilm->ilm_fmode == MODE_IS_INCLUDE) { rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr, ilm->ilm_filter, NULL); } else { rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, NULL, NULL); } mutex_exit(&ill->ill_lock); mldv2_sendrpt(ill, rp); return; } mutex_exit(&ill->ill_lock); } void igmp_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { ill_t *ill; mrec_t *rp; ip_stack_t *ipst = ilm->ilm_ipst; ASSERT(ilm != NULL); /* state change reports should only be sent if the router is v3 */ if (ilm->ilm_ipif->ipif_ill->ill_mcast_type != IGMP_V3_ROUTER) return; if (ilm->ilm_ill == NULL) { ASSERT(ilm->ilm_ipif != NULL); ill = ilm->ilm_ipif->ipif_ill; } else { ill = ilm->ilm_ill; } mutex_enter(&ill->ill_lock); /* * Compare existing(old) state with the new state and prepare * State Change Report, according to the rules in RFC 3376: * * Old State New State State Change Report * * INCLUDE(A) INCLUDE(B) ALLOW(B-A),BLOCK(A-B) * EXCLUDE(A) EXCLUDE(B) ALLOW(A-B),BLOCK(B-A) * INCLUDE(A) EXCLUDE(B) TO_EX(B) * EXCLUDE(A) INCLUDE(B) TO_IN(B) */ if (ilm->ilm_fmode == fmode) { slist_t *a_minus_b = NULL, *b_minus_a = NULL; slist_t *allow, *block; if (((a_minus_b = l_alloc()) == NULL) || ((b_minus_a = l_alloc()) == NULL)) { l_free(a_minus_b); if (ilm->ilm_fmode == MODE_IS_INCLUDE) goto send_to_ex; else goto send_to_in; } 
l_difference(ilm->ilm_filter, flist, a_minus_b); l_difference(flist, ilm->ilm_filter, b_minus_a); if (ilm->ilm_fmode == MODE_IS_INCLUDE) { allow = b_minus_a; block = a_minus_b; } else { allow = a_minus_b; block = b_minus_a; } rp = NULL; if (!SLIST_IS_EMPTY(allow)) rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr, allow, rp); if (!SLIST_IS_EMPTY(block)) rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr, block, rp); l_free(a_minus_b); l_free(b_minus_a); } else if (ilm->ilm_fmode == MODE_IS_INCLUDE) { send_to_ex: rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist, NULL); } else { send_to_in: rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist, NULL); } /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently * running, start it (need to do a delayed start of the timer as * we're currently in the sq). */ rp = mcast_merge_rtx(ilm, rp, flist); if (ilm->ilm_rtx.rtx_timer == INFINITY) { MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer, SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY)); mutex_enter(&ipst->ips_igmp_timer_lock); ipst->ips_igmp_deferred_next = MIN(ipst->ips_igmp_deferred_next, ilm->ilm_rtx.rtx_timer); ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME; mutex_exit(&ipst->ips_igmp_timer_lock); } mutex_exit(&ill->ill_lock); igmpv3_sendrpt(ilm->ilm_ipif, rp); } void mld_statechange(ilm_t *ilm, mcast_record_t fmode, slist_t *flist) { ill_t *ill; mrec_t *rp = NULL; ip_stack_t *ipst = ilm->ilm_ipst; ASSERT(ilm != NULL); ill = ilm->ilm_ill; /* only need to send if we have an mldv2-capable router */ mutex_enter(&ill->ill_lock); if (ill->ill_mcast_type != MLD_V2_ROUTER) { mutex_exit(&ill->ill_lock); return; } /* * Compare existing (old) state with the new state passed in * and send appropriate MLDv2 State Change Report. 
* * Old State New State State Change Report * * INCLUDE(A) INCLUDE(B) ALLOW(B-A),BLOCK(A-B) * EXCLUDE(A) EXCLUDE(B) ALLOW(A-B),BLOCK(B-A) * INCLUDE(A) EXCLUDE(B) TO_EX(B) * EXCLUDE(A) INCLUDE(B) TO_IN(B) */ if (ilm->ilm_fmode == fmode) { slist_t *a_minus_b = NULL, *b_minus_a = NULL; slist_t *allow, *block; if (((a_minus_b = l_alloc()) == NULL) || ((b_minus_a = l_alloc()) == NULL)) { l_free(a_minus_b); if (ilm->ilm_fmode == MODE_IS_INCLUDE) goto send_to_ex; else goto send_to_in; } l_difference(ilm->ilm_filter, flist, a_minus_b); l_difference(flist, ilm->ilm_filter, b_minus_a); if (ilm->ilm_fmode == MODE_IS_INCLUDE) { allow = b_minus_a; block = a_minus_b; } else { allow = a_minus_b; block = b_minus_a; } if (!SLIST_IS_EMPTY(allow)) rp = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr, allow, rp); if (!SLIST_IS_EMPTY(block)) rp = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr, block, rp); l_free(a_minus_b); l_free(b_minus_a); } else if (ilm->ilm_fmode == MODE_IS_INCLUDE) { send_to_ex: rp = mcast_bldmrec(CHANGE_TO_EXCLUDE, &ilm->ilm_v6addr, flist, NULL); } else { send_to_in: rp = mcast_bldmrec(CHANGE_TO_INCLUDE, &ilm->ilm_v6addr, flist, NULL); } /* * Need to set up retransmission state; merge the new info with the * current state (which may be null). If the timer is not currently * running, start it (need to do a deferred start of the timer as * we're currently in the sq). 
*/
	rp = mcast_merge_rtx(ilm, rp, flist);
	ASSERT(ilm->ilm_rtx.rtx_cnt > 0);
	if (ilm->ilm_rtx.rtx_timer == INFINITY) {
		MCAST_RANDOM_DELAY(ilm->ilm_rtx.rtx_timer,
		    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
		mutex_enter(&ipst->ips_mld_timer_lock);
		ipst->ips_mld_deferred_next =
		    MIN(ipst->ips_mld_deferred_next, ilm->ilm_rtx.rtx_timer);
		ilm->ilm_rtx.rtx_timer += CURRENT_MSTIME;
		mutex_exit(&ipst->ips_mld_timer_lock);
	}

	mutex_exit(&ill->ill_lock);
	mldv2_sendrpt(ill, rp);
}

/*
 * igmp_timeout_handler_per_ill:
 *	Runs the IGMP timers kept on one ill and sends whatever reports
 *	have come due.  Three timers are examined, in this order:
 *	  - ill_global_timer (the interface-wide response timer),
 *	  - each ilm's ilm_timer (the per-group response timer),
 *	  - each ilm's ilm_rtx.rtx_timer (the per-group retransmit timer).
 *
 *	A timer is treated as expired when it is due no later than
 *	CURRENT_OFFSET past the time sampled on entry (CURRENT_MSTIME).
 *
 *	The caller must be writer on the ill (asserted).  ill_lock is taken
 *	here and held throughout, except that it is dropped around every
 *	igmp_sendpkt()/igmpv3_sendrpt() call and re-taken afterwards.
 *
 *	Returns the shortest time remaining until any timer on this ill
 *	that is still pending (or that was re-armed here), in the same
 *	units as CURRENT_MSTIME, or INFINITY if there is none.
 */
uint_t
igmp_timeout_handler_per_ill(ill_t *ill)
{
	uint_t next = INFINITY, current;
	ilm_t *ilm;
	ipif_t *ipif;
	mrec_t *rp = NULL;
	mrec_t *rtxrp = NULL;
	rtx_state_t *rtxp;
	mcast_record_t rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* Sample the clock once; every comparison below uses this value. */
	current = CURRENT_MSTIME;

	/* First check the global timer on this interface */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v3 general
		 * query), need to skip the all hosts addr (224.0.0.1), per
		 * RFC 3376 section 5.
		 */
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (ilm->ilm_addr == htonl(INADDR_ALLHOSTS_GROUP))
				continue;
			ASSERT(ilm->ilm_ipif != NULL);
			/*
			 * Prepend a current-state record for this group to
			 * the list being accumulated on the ilm's ipif.
			 */
			ilm->ilm_ipif->ipif_igmp_rpt =
			    mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, ilm->ilm_ipif->ipif_igmp_rpt);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/*
		 * We've built per-ipif mrec lists; walk the ill's ipif list
		 * and send a report for each ipif that has an mrec list.
		 */
		for (ipif = ill->ill_ipif; ipif != NULL;
		    ipif = ipif->ipif_next) {
			if (ipif->ipif_igmp_rpt == NULL)
				continue;
			/* ill_lock is not held across the send. */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ipif, ipif->ipif_igmp_rpt);
			mutex_enter(&ill->ill_lock);
			/* mrec list was freed by igmpv3_sendrpt() */
			ipif->ipif_igmp_rpt = NULL;
		}
	} else {
		/* Global timer still pending: it bounds the return value. */
		if ((ill->ill_global_timer - current) < next)
			next = ill->ill_global_timer - current;
	}

per_ilm_timer:
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		/* No response timer running: only the rtx timer matters. */
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
			/* Not yet due; just account for it in 'next'. */
			if ((ilm->ilm_timer - current) < next)
				next = ilm->ilm_timer - current;

			if (ip_debug > 1) {
				/*
				 * NOTE(review): ntohl() on a time difference
				 * looks unintended; it only affects this
				 * trace output.
				 */
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr %d "
				    "typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer - current),
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				/* An empty result means nothing to report. */
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Either the pending request is just group-
				 * specific, or we couldn't get the resources
				 * (rsp) to build a source-specific reply.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
			/*
			 * NOTE(review): this send uses the ill's first ipif,
			 * whereas the other igmpv3_sendrpt() calls in this
			 * function use the ilm's/walked ipif -- confirm
			 * that is intended.
			 */
			mutex_exit(&ill->ill_lock);
			igmpv3_sendrpt(ill->ill_ipif, rp);
			mutex_enter(&ill->ill_lock);
			/* rp is reset so the next ilm starts a fresh list. */
			rp = NULL;
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
			/* Retransmit timer not yet due. */
			if ((rtxp->rtx_timer - current) < next)
				next = rtxp->rtx_timer - current;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		/* With a v1 or v2 router the retransmission is a plain report. */
		if (ill->ill_mcast_type == IGMP_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V1_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		} else if (ill->ill_mcast_type == IGMP_V2_ROUTER) {
			mutex_exit(&ill->ill_lock);
			igmp_sendpkt(ilm, IGMP_V2_MEMBERSHIP_REPORT, 0);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * IGMPv3.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/*
			 * More retransmissions remain: pick a new random
			 * delay, fold it into 'next' while it is still
			 * relative, then convert it to an absolute time.
			 */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(IGMP_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			rtxp->rtx_timer += current;
		} else {
			/* Last retransmission: drop the saved source lists. */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
		mutex_exit(&ill->ill_lock);
		igmpv3_sendrpt(ilm->ilm_ipif, rtxrp);
		mutex_enter(&ill->ill_lock);
		rtxrp = NULL;
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}

/*
 * igmp_timeout_handler:
 *	Called when there are timeout events, every next * TMEOUT_INTERVAL (tick).
 *	Returns number of ticks to next event (or 0 if none).
 *
 * As part of multicast join and leave igmp we may need to send out an
 * igmp request. The igmp related state variables in the ilm are protected
 * by ill_lock. A single global igmp timer is used to track igmp timeouts.
 * igmp_timer_lock protects the global igmp_timeout_id. igmp_start_timers
 * starts the igmp timer if needed. It serializes multiple threads trying to
 * simultaneously start the timer using the igmp_timer_setter_active flag.
 *
 * igmp_input() receives igmp queries and responds to the queries
 * in a delayed fashion by posting a timer i.e. it calls igmp_start_timers().
 * Later the igmp_timer fires, the timeout handler igmp_timerout_handler()
 * performs the action exclusively after entering each ill's ipsq as writer.
 * The actual igmp timeout handler needs to run in the ipsq since it has to
 * access the ilm's and we don't want another exclusive operation like
 * say an IPMP failover to be simultaneously moving the ilms from one ill to
 * another.
 *
 * The igmp_slowtimeo() function is called thru another timer.
* igmp_slowtimeout_lock protects the igmp_slowtimeout_id
 */
/*
 * Implementation notes for igmp_timeout_handler():
 *	arg is the ip_stack_t whose IGMP timer fired.  The handler records
 *	itself as the running timer thread, walks every IPv4 ill in the
 *	stack, runs igmp_timeout_handler_per_ill() on each one it can enter
 *	as writer, and finally re-arms the timer (via igmp_start_timers())
 *	with the smallest value any ill returned, if one was returned.
 */
void
igmp_timeout_handler(void *arg)
{
	ill_t	*ill;
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;
	ip_stack_t *ipst = arg;

	ASSERT(arg != NULL);
	/* Mark this thread as the one running the timer; reset bookkeeping. */
	mutex_enter(&ipst->ips_igmp_timer_lock);
	ASSERT(ipst->ips_igmp_timeout_id != 0);
	ipst->ips_igmp_timer_thread = curthread;
	ipst->ips_igmp_timer_scheduled_last = 0;
	ipst->ips_igmp_time_to_next = 0;
	mutex_exit(&ipst->ips_igmp_timer_lock);

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V4(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(!ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		/* ill_g_lock is dropped while entering the ipsq. */
		rw_exit(&ipst->ips_ill_g_lock);
		success = ipsq_enter(ill, B_TRUE, NEW_OP);
		if (success) {
			next = igmp_timeout_handler_per_ill(ill);
			if (next < global_next)
				global_next = next;
			ipsq_exit(ill->ill_phyint->phyint_ipsq);
		}
		/* Re-take the walk lock before releasing the waiter hold. */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* This firing is finished: clear the timeout id and thread marker. */
	mutex_enter(&ipst->ips_igmp_timer_lock);
	ASSERT(ipst->ips_igmp_timeout_id != 0);
	ipst->ips_igmp_timeout_id = 0;
	ipst->ips_igmp_timer_thread = NULL;
	mutex_exit(&ipst->ips_igmp_timer_lock);

	/* Re-arm only if some ill still has a timer pending. */
	if (global_next != INFINITY)
		igmp_start_timers(global_next, ipst);
}

/*
 * mld_timeout_handler_per_ill:
 *	MLD counterpart of igmp_timeout_handler_per_ill().  Examines the
 *	ill's global timer, then each ilm's response timer and retransmit
 *	timer, sending MLDv1 reports immediately and accumulating MLDv2
 *	records that are sent at the end.
 *
 *	The caller must be writer on the ill (asserted).  ill_lock is taken
 *	here; it is dropped around each mld_sendpkt()/mldv2_sendrpt() call.
 *
 *	Returns the shortest time remaining until any timer on this ill
 *	that is still pending (or was re-armed here), or INFINITY if none.
 */
/* ARGSUSED */
uint_t
mld_timeout_handler_per_ill(ill_t *ill)
{
	ilm_t 	*ilm;
	uint_t	next = INFINITY, current;
	mrec_t	*rp, *rtxrp;
	rtx_state_t *rtxp;
	mcast_record_t	rtype;

	ASSERT(IAM_WRITER_ILL(ill));

	mutex_enter(&ill->ill_lock);

	/* Sample the clock once; every comparison below uses this value. */
	current = CURRENT_MSTIME;
	/*
	 * First check the global timer on this interface; the global timer
	 * is not used for MLDv1, so if it's set we can assume we're v2.
	 */
	if (ill->ill_global_timer == INFINITY)
		goto per_ilm_timer;
	if (ill->ill_global_timer <= (current + CURRENT_OFFSET)) {
		ill->ill_global_timer = INFINITY;
		/*
		 * Send report for each group on this interface.
		 * Since we just set the global timer (received a v2 general
		 * query), need to skip the all hosts addr (ff02::1), per
		 * RFC 3810 section 6.
		 */
		rp = NULL;
		for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
			if (IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr,
			    &ipv6_all_hosts_mcast))
				continue;
			rp = mcast_bldmrec(ilm->ilm_fmode, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rp);
			/*
			 * Since we're sending a report on this group, okay
			 * to delete pending group-specific timers.  Note
			 * that group-specific retransmit timers still need
			 * to be checked in the per_ilm_timer for-loop.
			 */
			ilm->ilm_timer = INFINITY;
			ilm->ilm_state = IGMP_IREPORTEDLAST;
			FREE_SLIST(ilm->ilm_pendsrcs);
			ilm->ilm_pendsrcs = NULL;
		}
		/* ill_lock is not held across the send. */
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mutex_enter(&ill->ill_lock);
	} else {
		/* Global timer still pending: it bounds the return value. */
		if ((ill->ill_global_timer - current) < next)
			next = ill->ill_global_timer - current;
	}

per_ilm_timer:
	/* Start two fresh record lists: query responses and retransmits. */
	rp = rtxrp = NULL;
	for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) {
		if (ilm->ilm_timer == INFINITY)
			goto per_ilm_rtxtimer;

		if (ilm->ilm_timer > (current + CURRENT_OFFSET)) {
			/* Not yet due; just account for it in 'next'. */
			if ((ilm->ilm_timer - current) < next)
				next = ilm->ilm_timer - current;

			if (ip_debug > 1) {
				/*
				 * NOTE(review): ntohl() on a time difference
				 * looks unintended; it only affects this
				 * trace output.
				 */
				(void) mi_strlog(ill->ill_rq, 1, SL_TRACE,
				    "igmp_timo_hlr 2: ilm_timr"
				    " %d typ %d nxt %d",
				    (int)ntohl(ilm->ilm_timer - current),
				    (ill->ill_mcast_type), next);
			}

			goto per_ilm_rtxtimer;
		}

		/* the timer has expired, need to take action */
		ilm->ilm_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
		} else {
			slist_t *rsp;
			if (!SLIST_IS_EMPTY(ilm->ilm_pendsrcs) &&
			    (rsp = l_alloc()) != NULL) {
				/*
				 * Contents of reply depend on pending
				 * requested source list.
				 */
				if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
					l_intersection(ilm->ilm_filter,
					    ilm->ilm_pendsrcs, rsp);
				} else {
					l_difference(ilm->ilm_pendsrcs,
					    ilm->ilm_filter, rsp);
				}
				FREE_SLIST(ilm->ilm_pendsrcs);
				ilm->ilm_pendsrcs = NULL;
				/* An empty result means nothing to report. */
				if (!SLIST_IS_EMPTY(rsp))
					rp = mcast_bldmrec(MODE_IS_INCLUDE,
					    &ilm->ilm_v6addr, rsp, rp);
				FREE_SLIST(rsp);
			} else {
				/*
				 * Pending request has no source list, or rsp
				 * could not be allocated: report the group's
				 * current filter state instead.
				 */
				rp = mcast_bldmrec(ilm->ilm_fmode,
				    &ilm->ilm_v6addr, ilm->ilm_filter, rp);
			}
		}

per_ilm_rtxtimer:
		rtxp = &ilm->ilm_rtx;

		if (rtxp->rtx_timer == INFINITY)
			continue;
		if (rtxp->rtx_timer > (current + CURRENT_OFFSET)) {
			/* Retransmit timer not yet due. */
			if ((rtxp->rtx_timer - current) < next)
				next = rtxp->rtx_timer - current;
			continue;
		}

		rtxp->rtx_timer = INFINITY;
		ilm->ilm_state = IGMP_IREPORTEDLAST;
		/* With a v1 router the retransmission is a plain report. */
		if (ill->ill_mcast_type == MLD_V1_ROUTER) {
			mutex_exit(&ill->ill_lock);
			mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL);
			mutex_enter(&ill->ill_lock);
			continue;
		}

		/*
		 * The retransmit timer has popped, and our router is
		 * MLDv2.  We have to delve into the retransmit state
		 * stored in the ilm.
		 *
		 * Decrement the retransmit count.  If the fmode rtx
		 * count is active, decrement it, and send a filter
		 * mode change report with the ilm's source list.
		 * Otherwise, send a source list change report with
		 * the current retransmit lists.
		 */
		ASSERT(rtxp->rtx_cnt > 0);
		ASSERT(rtxp->rtx_cnt >= rtxp->rtx_fmode_cnt);
		rtxp->rtx_cnt--;
		if (rtxp->rtx_fmode_cnt > 0) {
			rtxp->rtx_fmode_cnt--;
			rtype = (ilm->ilm_fmode == MODE_IS_INCLUDE) ?
			    CHANGE_TO_INCLUDE : CHANGE_TO_EXCLUDE;
			rtxrp = mcast_bldmrec(rtype, &ilm->ilm_v6addr,
			    ilm->ilm_filter, rtxrp);
		} else {
			rtxrp = mcast_bldmrec(ALLOW_NEW_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_allow, rtxrp);
			rtxrp = mcast_bldmrec(BLOCK_OLD_SOURCES,
			    &ilm->ilm_v6addr, rtxp->rtx_block, rtxrp);
		}
		if (rtxp->rtx_cnt > 0) {
			/*
			 * More retransmissions remain: pick a new random
			 * delay, fold it into 'next' while it is still
			 * relative, then convert it to an absolute time.
			 */
			MCAST_RANDOM_DELAY(rtxp->rtx_timer,
			    SEC_TO_MSEC(ICMP6_MAX_HOST_REPORT_DELAY));
			if (rtxp->rtx_timer < next)
				next = rtxp->rtx_timer;
			rtxp->rtx_timer += current;
		} else {
			/* Last retransmission: drop the saved source lists. */
			CLEAR_SLIST(rtxp->rtx_allow);
			CLEAR_SLIST(rtxp->rtx_block);
		}
	}

	/*
	 * For an MLDv2 router, send the two accumulated record lists now,
	 * after releasing ill_lock.
	 */
	if (ill->ill_mcast_type == MLD_V2_ROUTER) {
		mutex_exit(&ill->ill_lock);
		mldv2_sendrpt(ill, rp);
		mldv2_sendrpt(ill, rtxrp);
		return (next);
	}

	mutex_exit(&ill->ill_lock);

	return (next);
}

/*
 * mld_timeout_handler:
 *	Timer callback for the MLD timer of the ip_stack_t passed as arg.
 *	Walks every IPv6 ill in the stack, runs
 *	mld_timeout_handler_per_ill() on each one it can enter as writer,
 *	then re-arms the timer through mld_start_timers() with the smallest
 *	value returned, if any ill still has a timer pending.
 *	MT issues are same as igmp_timeout_handler
 */
void
mld_timeout_handler(void *arg)
{
	ill_t	*ill;
	uint_t  global_next = INFINITY;
	uint_t  next;
	ill_walk_context_t ctx;
	boolean_t success;
	ip_stack_t *ipst = arg;

	ASSERT(arg != NULL);
	/* Mark this thread as the one running the timer; reset bookkeeping. */
	mutex_enter(&ipst->ips_mld_timer_lock);
	ASSERT(ipst->ips_mld_timeout_id != 0);
	ipst->ips_mld_timer_thread = curthread;
	ipst->ips_mld_timer_scheduled_last = 0;
	ipst->ips_mld_time_to_next = 0;
	mutex_exit(&ipst->ips_mld_timer_lock);

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = ILL_START_WALK_V6(&ctx, ipst);
	for (; ill != NULL; ill = ill_next(&ctx, ill)) {
		ASSERT(ill->ill_isv6);
		/*
		 * We may not be able to refhold the ill if the ill/ipif
		 * is changing. But we need to make sure that the ill will
		 * not vanish. So we just bump up the ill_waiter count.
		 */
		if (!ill_waiter_inc(ill))
			continue;
		/* ill_g_lock is dropped while entering the ipsq. */
		rw_exit(&ipst->ips_ill_g_lock);
		success = ipsq_enter(ill, B_TRUE, NEW_OP);
		if (success) {
			next = mld_timeout_handler_per_ill(ill);
			if (next < global_next)
				global_next = next;
			ipsq_exit(ill->ill_phyint->phyint_ipsq);
		}
		/* Re-take the walk lock before releasing the waiter hold. */
		rw_enter(&ipst->ips_ill_g_lock, RW_READER);
		ill_waiter_dcr(ill);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/* This firing is finished: clear the timeout id and thread marker. */
	mutex_enter(&ipst->ips_mld_timer_lock);
	ASSERT(ipst->ips_mld_timeout_id != 0);
	ipst->ips_mld_timeout_id = 0;
	ipst->ips_mld_timer_thread = NULL;
	mutex_exit(&ipst->ips_mld_timer_lock);

	/* Re-arm only if some ill still has a timer pending. */
	if (global_next != INFINITY)
		mld_start_timers(global_next, ipst);
}

/*
 * Calculate the Older Version Querier Present timeout value, in number
 * of slowtimo intervals, for the given ill.
 */
#define	OVQP(ill) \
	((1000 * (((ill)->ill_mcast_rv * (ill)->ill_mcast_qi) \
	+ MCAST_QUERY_RESP_INTERVAL)) / MCAST_SLOWTIMO_INTERVAL)

/*
 * igmp_slowtimo:
 * - Advances the per-ill "older version querier present" counters and
 *   switches an ill back to a newer IGMP version once the corresponding
 *   counter reaches OVQP(ill), i.e. if we didn't hear from the older
 *   version router for that long.
 * - Reschedules itself via timeout().
 * Check for ips_igmp_max_version ensures that we don't revert to a higher
 * IGMP version than configured.
 * arg is the ip_stack_t to operate on.
 */
void
igmp_slowtimo(void *arg)
{
	ill_t	*ill;
	ill_if_t *ifp;
	avl_tree_t *avl_tree;
	ip_stack_t *ipst = (ip_stack_t *)arg;

	ASSERT(arg != NULL);
	/* Hold the ill_g_lock so that we can safely walk the ill list */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);

	/*
	 * The ill_if_t list is circular, hence the odd loop parameters.
	 *
	 * We can't use the ILL_START_WALK and ill_next() wrappers for this
	 * walk, as we need to check the illif_mcast_* fields in the ill_if_t
	 * structure (allowing us to skip if none of the instances have timers
	 * running).
	 */
	for (ifp = IP_V4_ILL_G_LIST(ipst);
	    ifp != (ill_if_t *)&IP_V4_ILL_G_LIST(ipst);
	    ifp = ifp->illif_next) {
		/*
		 * illif_mcast_v[12] are set using atomics. If an ill hears
		 * a V1 or V2 query now and we miss seeing the count now,
		 * we will see it the next time igmp_slowtimo is called.
		 */
		if (ifp->illif_mcast_v1 == 0 && ifp->illif_mcast_v2 == 0)
			continue;

		avl_tree = &ifp->illif_avl_by_ppa;
		for (ill = avl_first(avl_tree); ill != NULL;
		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
			mutex_enter(&ill->ill_lock);
			/* Age whichever compatibility timers are running. */
			if (ill->ill_mcast_v1_tset == 1)
				ill->ill_mcast_v1_time++;
			if (ill->ill_mcast_v2_tset == 1)
				ill->ill_mcast_v2_time++;
			if ((ill->ill_mcast_type == IGMP_V1_ROUTER) &&
			    (ipst->ips_igmp_max_version >= IGMP_V2_ROUTER) &&
			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
				/*
				 * V1 timer ran out.  Go to V2 if a V2 timer
				 * is running or V2 is the configured
				 * maximum; otherwise go straight to V3.
				 */
				if ((ill->ill_mcast_v2_tset > 0) ||
				    (ipst->ips_igmp_max_version ==
				    IGMP_V2_ROUTER)) {
					ip1dbg(("V1 query timer "
					    "expired on %s; switching "
					    "mode to IGMP_V2\n",
					    ill->ill_name));
					ill->ill_mcast_type =
					    IGMP_V2_ROUTER;
				} else {
					ip1dbg(("V1 query timer "
					    "expired on %s; switching "
					    "mode to IGMP_V3\n",
					    ill->ill_name));
					ill->ill_mcast_type =
					    IGMP_V3_ROUTER;
				}
				ill->ill_mcast_v1_time = 0;
				ill->ill_mcast_v1_tset = 0;
				atomic_add_16(&ifp->illif_mcast_v1, -1);
			}
			/*
			 * Not an else: an ill that was just moved to V2
			 * above is evaluated here in the same pass.
			 */
			if ((ill->ill_mcast_type == IGMP_V2_ROUTER) &&
			    (ipst->ips_igmp_max_version >= IGMP_V3_ROUTER) &&
			    (ill->ill_mcast_v2_time >= OVQP(ill))) {
				ip1dbg(("V2 query timer expired on "
				    "%s; switching mode to IGMP_V3\n",
				    ill->ill_name));
				ill->ill_mcast_type = IGMP_V3_ROUTER;
				ill->ill_mcast_v2_time = 0;
				ill->ill_mcast_v2_tset = 0;
				atomic_add_16(&ifp->illif_mcast_v2, -1);
			}
			mutex_exit(&ill->ill_lock);
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
	/* Schedule the next run, recording the id under its lock. */
	mutex_enter(&ipst->ips_igmp_slowtimeout_lock);
	ipst->ips_igmp_slowtimeout_id = timeout(igmp_slowtimo, (void *)ipst,
	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
	mutex_exit(&ipst->ips_igmp_slowtimeout_lock);
}

/*
 * mld_slowtimo:
 * - Resets to newer version if we didn't hear from the older version router
 *   in MLD_AGE_THRESHOLD seconds.
 * - Restarts slowtimeout.
 * Check for ips_mld_max_version ensures that we don't revert to a higher
 * IGMP version than configured.
*/
/* ARGSUSED */
void
mld_slowtimo(void *arg)
{
	ill_t *ill;
	ill_if_t *ifp;
	avl_tree_t *avl_tree;
	ip_stack_t *ipst = (ip_stack_t *)arg;

	ASSERT(arg != NULL);
	/* See comments in igmp_slowtimo() above... */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	for (ifp = IP_V6_ILL_G_LIST(ipst);
	    ifp != (ill_if_t *)&IP_V6_ILL_G_LIST(ipst);
	    ifp = ifp->illif_next) {
		/* Skip this ill_if_t when its v1 counter is zero. */
		if (ifp->illif_mcast_v1 == 0)
			continue;

		avl_tree = &ifp->illif_avl_by_ppa;
		for (ill = avl_first(avl_tree); ill != NULL;
		    ill = avl_walk(avl_tree, ill, AVL_AFTER)) {
			mutex_enter(&ill->ill_lock);
			/* Age the v1 compatibility timer if it is running. */
			if (ill->ill_mcast_v1_tset == 1)
				ill->ill_mcast_v1_time++;
			if ((ill->ill_mcast_type == MLD_V1_ROUTER) &&
			    (ipst->ips_mld_max_version >= MLD_V2_ROUTER) &&
			    (ill->ill_mcast_v1_time >= OVQP(ill))) {
				ip1dbg(("MLD query timer expired on"
				    " %s; switching mode to MLD_V2\n",
				    ill->ill_name));
				ill->ill_mcast_type = MLD_V2_ROUTER;
				ill->ill_mcast_v1_time = 0;
				ill->ill_mcast_v1_tset = 0;
				atomic_add_16(&ifp->illif_mcast_v1, -1);
			}
			mutex_exit(&ill->ill_lock);
		}
	}
	rw_exit(&ipst->ips_ill_g_lock);
	/* Schedule the next run, recording the id under its lock. */
	mutex_enter(&ipst->ips_mld_slowtimeout_lock);
	ipst->ips_mld_slowtimeout_id = timeout(mld_slowtimo, (void *)ipst,
	    MSEC_TO_TICK(MCAST_SLOWTIMO_INTERVAL));
	mutex_exit(&ipst->ips_mld_slowtimeout_lock);
}

/*
 * igmp_sendpkt:
 *	Builds and sends a single IGMP message of the given type for the
 *	group held in ilm (ilm->ilm_addr).  The message is an M_CTL
 *	ipsec_out block followed by an IPv4 header carrying a Router Alert
 *	option and the IGMP header.  The destination is addr if non-zero,
 *	otherwise the group itself; the source is the ilm's ipif source
 *	address.  If either allocation fails nothing is sent.
 *
 *	This will send to ip_wput like icmp_inbound.
 *	Note that the lower ill (on which the membership is kept) is used
 *	as an upper ill to pass in the multicast parameters.
 */
static void
igmp_sendpkt(ilm_t *ilm, uchar_t type, ipaddr_t addr)
{
	mblk_t	*mp;
	igmpa_t	*igmpa;
	uint8_t *rtralert;
	ipha_t	*ipha;
	int	hdrlen = sizeof (ipha_t) + RTRALERT_LEN;
	size_t	size  = hdrlen + sizeof (igmpa_t);
	ipif_t 	*ipif = ilm->ilm_ipif;
	ill_t 	*ill  = ipif->ipif_ill;	/* Will be the "lower" ill */
	mblk_t	*first_mp;
	ipsec_out_t *io;
	zoneid_t zoneid;
	ip_stack_t *ipst = ill->ill_ipst;

	/*
	 * We need to make sure this packet goes out on an ipif. If
	 * there is some global policy match in ip_wput_ire, we need
	 * to get to the right interface after IPSEC processing.
	 * To make sure this multicast packet goes out on the right
	 * interface, we attach an ipsec_out and initialize ill_index
	 * like we did in ip_wput. To make sure that this packet does
	 * not get forwarded on other interfaces or looped back, we
	 * set ipsec_out_dontroute to B_TRUE and ipsec_out_multicast_loop
	 * to B_FALSE.
	 *
	 * We also need to make sure that this does not get load balanced
	 * if it hits ip_newroute_ipif. So, we initialize ipsec_out_attach_if
	 * here. If it gets load balanced, switches supporting igmp snooping
	 * will send the packet that it receives for this multicast group
	 * to the interface that we are sending on. As we have joined the
	 * multicast group on this ill, by sending the packet out on this
	 * ill, we receive all the packets back on this ill.
	 */
	first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI);
	if (first_mp == NULL)
		return;

	first_mp->b_datap->db_type = M_CTL;
	first_mp->b_wptr += sizeof (ipsec_info_t);
	bzero(first_mp->b_rptr, sizeof (ipsec_info_t));
	/* ipsec_out_secure is B_FALSE now */
	io = (ipsec_out_t *)first_mp->b_rptr;
	io->ipsec_out_type = IPSEC_OUT;
	io->ipsec_out_len = sizeof (ipsec_out_t);
	io->ipsec_out_use_global_policy = B_TRUE;
	io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex;
	io->ipsec_out_attach_if = B_TRUE;
	io->ipsec_out_multicast_loop = B_FALSE;
	io->ipsec_out_dontroute = B_TRUE;
	/* A membership that belongs to all zones is sent as the global zone. */
	if ((zoneid = ilm->ilm_zoneid) == ALL_ZONES)
		zoneid = GLOBAL_ZONEID;
	io->ipsec_out_zoneid = zoneid;
	io->ipsec_out_ns = ipst->ips_netstack;	/* No netstack_hold */

	mp = allocb(size, BPRI_HI);
	if (mp == NULL) {
		/* Don't leak the ipsec_out block allocated above. */
		freemsg(first_mp);
		return;
	}
	mp->b_wptr = mp->b_rptr + size;
	first_mp->b_cont = mp;

	/* Layout within mp: IP header, Router Alert option, IGMP header. */
	ipha = (ipha_t *)mp->b_rptr;
	rtralert = (uint8_t *)&(ipha[1]);
	igmpa = (igmpa_t *)&(rtralert[RTRALERT_LEN]);
	igmpa->igmpa_type   = type;
	igmpa->igmpa_code   = 0;
	igmpa->igmpa_group  = ilm->ilm_addr;
	/* Checksum is computed from offset hdrlen with the field zeroed. */
	igmpa->igmpa_cksum  = 0;
	igmpa->igmpa_cksum  = IP_CSUM(mp, hdrlen, 0);

	rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT;
	rtralert[1] = RTRALERT_LEN;
	rtralert[2] = 0;
	rtralert[3] = 0;

	ipha->ipha_version_and_hdr_length = (IP_VERSION << 4)
	    | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS);
	ipha->ipha_type_of_service 	= 0;
	ipha->ipha_length = htons(size);
	ipha->ipha_ident = 0;
	ipha->ipha_fragment_offset_and_flags = 0;
	ipha->ipha_ttl 		= IGMP_TTL;
	ipha->ipha_protocol 	= IPPROTO_IGMP;
	ipha->ipha_hdr_checksum 	= 0;
	/* An explicit destination wins; otherwise send to the group. */
	ipha->ipha_dst 		= addr ? addr : igmpa->igmpa_group;
	ipha->ipha_src 		= ipif->ipif_src_addr;
	/*
	 * Request loopback of the report if we are acting as a multicast
	 * router, so that the process-level routing daemon can hear it.
	 */
	/*
	 * This will run multiple times for the same group if there are members
	 * on the same group for multiple ipif's on the same ill. The
	 * igmp_input code will suppress this due to the loopback thus we
	 * always loopback membership report.
	 */
	ASSERT(ill->ill_rq != NULL);
	ip_multicast_loopback(ill->ill_rq, ill, first_mp, 0, ilm->ilm_zoneid);

	ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid);

	++ipst->ips_igmpstat.igps_snd_reports;
}

/*
 * Sends an IGMP_V3_MEMBERSHIP_REPORT message out the ill associated
 * with the passed-in ipif.  The report will contain one group record
 * for each element of reclist.  If this causes packet length to
 * exceed ipif->ipif_ill->ill_max_frag, multiple reports are sent.
 * reclist is assumed to be made up of buffers allocated by mcast_bldmrec(),
 * and those buffers are freed here.
*/ static void igmpv3_sendrpt(ipif_t *ipif, mrec_t *reclist) { ipsec_out_t *io; igmp3ra_t *igmp3ra; grphdra_t *grphdr; mblk_t *first_mp, *mp; ipha_t *ipha; uint8_t *rtralert; ipaddr_t *src_array; int i, j, numrec, more_src_cnt; size_t hdrsize, size, rsize; ill_t *ill = ipif->ipif_ill; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; zoneid_t zoneid; ip_stack_t *ipst = ill->ill_ipst; /* if there aren't any records, there's nothing to send */ if (reclist == NULL) return; hdrsize = sizeof (ipha_t) + RTRALERT_LEN; nextpkt: size = hdrsize + sizeof (igmp3ra_t); morepkts = B_FALSE; more_src_cnt = 0; cur_reclist = next_reclist; numrec = 0; for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) { rsize = sizeof (grphdra_t) + (rp->mrec_srcs.sl_numsrc * sizeof (ipaddr_t)); if (size + rsize > ill->ill_max_frag) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big * to fit in a single packet (i.e the source * list is too big), we must either truncate * the list (if TO_EX or IS_EX), or send * multiple reports for the same group (all * other types). */ int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (grphdra_t)); srcsperpkt = srcspace / sizeof (ipaddr_t); /* * Increment size and numrec, because we will * be sending a record for the mrec we're * looking at now. */ size += sizeof (grphdra_t) + (srcsperpkt * sizeof (ipaddr_t)); numrec++; if (rp->mrec_type == MODE_IS_EXCLUDE || rp->mrec_type == CHANGE_TO_EXCLUDE) { rp->mrec_srcs.sl_numsrc = srcsperpkt; if (rp->mrec_next == NULL) { /* no more packets to send */ break; } else { /* * more packets, but we're * done with this mrec. */ next_reclist = rp->mrec_next; } } else { more_src_cnt = rp->mrec_srcs.sl_numsrc - srcsperpkt; rp->mrec_srcs.sl_numsrc = srcsperpkt; /* * We'll fix up this mrec (remove the * srcs we've already sent) before * returning to nextpkt above. 
*/ next_reclist = rp; } } else { next_reclist = rp; } morepkts = B_TRUE; break; } size += rsize; numrec++; } /* * See comments in igmp_sendpkt() about initializing for ipsec and * load balancing requirements. */ first_mp = allocb(sizeof (ipsec_info_t), BPRI_HI); if (first_mp == NULL) goto free_reclist; first_mp->b_datap->db_type = M_CTL; first_mp->b_wptr += sizeof (ipsec_info_t); bzero(first_mp->b_rptr, sizeof (ipsec_info_t)); /* ipsec_out_secure is B_FALSE now */ io = (ipsec_out_t *)first_mp->b_rptr; io->ipsec_out_type = IPSEC_OUT; io->ipsec_out_len = sizeof (ipsec_out_t); io->ipsec_out_use_global_policy = B_TRUE; io->ipsec_out_ill_index = ill->ill_phyint->phyint_ifindex; io->ipsec_out_attach_if = B_TRUE; io->ipsec_out_multicast_loop = B_FALSE; io->ipsec_out_dontroute = B_TRUE; if ((zoneid = ipif->ipif_zoneid) == ALL_ZONES) zoneid = GLOBAL_ZONEID; io->ipsec_out_zoneid = zoneid; mp = allocb(size, BPRI_HI); if (mp == NULL) { freemsg(first_mp); goto free_reclist; } bzero((char *)mp->b_rptr, size); mp->b_wptr = (uchar_t *)(mp->b_rptr + size); first_mp->b_cont = mp; ipha = (ipha_t *)mp->b_rptr; rtralert = (uint8_t *)&(ipha[1]); igmp3ra = (igmp3ra_t *)&(rtralert[RTRALERT_LEN]); grphdr = (grphdra_t *)&(igmp3ra[1]); rp = cur_reclist; for (i = 0; i < numrec; i++) { grphdr->grphdra_type = rp->mrec_type; grphdr->grphdra_numsrc = htons(rp->mrec_srcs.sl_numsrc); grphdr->grphdra_group = V4_PART_OF_V6(rp->mrec_group); src_array = (ipaddr_t *)&(grphdr[1]); for (j = 0; j < rp->mrec_srcs.sl_numsrc; j++) src_array[j] = V4_PART_OF_V6(rp->mrec_srcs.sl_addr[j]); grphdr = (grphdra_t *)&(src_array[j]); rp = rp->mrec_next; } igmp3ra->igmp3ra_type = IGMP_V3_MEMBERSHIP_REPORT; igmp3ra->igmp3ra_numrec = htons(numrec); igmp3ra->igmp3ra_cksum = IP_CSUM(mp, hdrsize, 0); rtralert[0] = IPOPT_COPY | IPOPT_RTRALERT; rtralert[1] = RTRALERT_LEN; rtralert[2] = 0; rtralert[3] = 0; ipha->ipha_version_and_hdr_length = IP_VERSION << 4 | (IP_SIMPLE_HDR_LENGTH_IN_WORDS + RTRALERT_LEN_IN_WORDS); 
ipha->ipha_type_of_service = IPTOS_PREC_INTERNETCONTROL; ipha->ipha_length = htons(size); ipha->ipha_ttl = IGMP_TTL; ipha->ipha_protocol = IPPROTO_IGMP; ipha->ipha_dst = htonl(INADDR_ALLRPTS_GROUP); ipha->ipha_src = ipif->ipif_src_addr; /* * Request loopback of the report if we are acting as a multicast * router, so that the process-level routing daemon can hear it. * * This will run multiple times for the same group if there are * members on the same group for multiple ipifs on the same ill. * The igmp_input code will suppress this due to the loopback; * thus we always loopback membership report. */ ASSERT(ill->ill_rq != NULL); ip_multicast_loopback(ill->ill_rq, ill, mp, 0, ipif->ipif_zoneid); ip_wput_multicast(ill->ill_wq, first_mp, ipif, zoneid); ++ipst->ips_igmpstat.igps_snd_reports; if (morepkts) { if (more_src_cnt > 0) { int index, mvsize; slist_t *sl = &next_reclist->mrec_srcs; index = sl->sl_numsrc; mvsize = more_src_cnt * sizeof (in6_addr_t); (void) memmove(&sl->sl_addr[0], &sl->sl_addr[index], mvsize); sl->sl_numsrc = more_src_cnt; } goto nextpkt; } free_reclist: while (reclist != NULL) { rp = reclist->mrec_next; mi_free(reclist); reclist = rp; } } /* * mld_input: */ /* ARGSUSED */ void mld_input(queue_t *q, mblk_t *mp, ill_t *ill) { ip6_t *ip6h = (ip6_t *)(mp->b_rptr); mld_hdr_t *mldh; ilm_t *ilm; ipif_t *ipif; uint16_t hdr_length, exthdr_length; in6_addr_t *v6group_ptr, *lcladdr_ptr; uint_t next; int mldlen; ip_stack_t *ipst = ill->ill_ipst; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembTotal); /* Make sure the src address of the packet is link-local */ if (!(IN6_IS_ADDR_LINKLOCAL(&ip6h->ip6_src))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); return; } if (ip6h->ip6_hlim != 1) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpBadHoplimit); freemsg(mp); return; } /* Get to the icmp header part */ if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { hdr_length = ip_hdr_length_v6(mp, ip6h); exthdr_length = hdr_length - IPV6_HDR_LEN; } else { hdr_length = 
IPV6_HDR_LEN; exthdr_length = 0; } mldlen = ntohs(ip6h->ip6_plen) - exthdr_length; /* An MLD packet must at least be 24 octets to be valid */ if (mldlen < MLD_MINLEN) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); return; } mldh = (mld_hdr_t *)(&mp->b_rptr[hdr_length]); switch (mldh->mld_type) { case MLD_LISTENER_QUERY: /* * packet length differentiates between v1 and v2. v1 * query should be exactly 24 octets long; v2 is >= 28. */ if ((mldlen == MLD_MINLEN) || (ipst->ips_mld_max_version < MLD_V2_ROUTER)) { next = mld_query_in(mldh, ill); } else if (mldlen >= MLD_V2_QUERY_MINLEN) { next = mldv2_query_in((mld2q_t *)mldh, ill, mldlen); } else { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); freemsg(mp); return; } if (next == 0) { freemsg(mp); return; } if (next != INFINITY) mld_start_timers(next, ipst); break; case MLD_LISTENER_REPORT: { ASSERT(ill->ill_ipif != NULL); /* * For fast leave to work, we have to know that we are the * last person to send a report for this group. Reports * generated by us are looped back since we could potentially * be a multicast router, so discard reports sourced by me. 
*/ lcladdr_ptr = &(ill->ill_ipif->ipif_v6subnet); mutex_enter(&ill->ill_lock); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, lcladdr_ptr)) { if (ip_debug > 1) { char buf1[INET6_ADDRSTRLEN]; char buf2[INET6_ADDRSTRLEN]; (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "mld_input: we are only " "member src %s ipif_local %s", inet_ntop(AF_INET6, lcladdr_ptr, buf1, sizeof (buf1)), inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, buf2, sizeof (buf2))); } mutex_exit(&ill->ill_lock); freemsg(mp); return; } } mutex_exit(&ill->ill_lock); BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembResponses); v6group_ptr = &mldh->mld_addr; if (!IN6_IS_ADDR_MULTICAST(v6group_ptr)) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadReports); freemsg(mp); return; } /* * If we belong to the group being reported, and we are a * 'Delaying member' per the RFC terminology, stop our timer * for that group and 'clear flag' i.e. mark ilm_state as * IGMP_OTHERMEMBER. With zones, there can be multiple group * membership entries for the same group address (one per zone) * so we need to walk the ill_ilm list. */ mutex_enter(&ill->ill_lock); for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, v6group_ptr)) continue; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembOurReports); ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_OTHERMEMBER; } mutex_exit(&ill->ill_lock); break; } case MLD_LISTENER_REDUCTION: BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembReductions); break; } /* * All MLD packets have already been passed up to any * process(es) listening on a ICMP6 raw socket. This * has been accomplished in ip_deliver_local_v6 prior to * this function call. 
It is assumed that the multicast daemon * will have a SOCK_RAW IPPROTO_ICMPV6 (and presumbly use the * ICMP6_FILTER socket option to only receive the MLD messages) * Thus we can free the MLD message block here */ freemsg(mp); } /* * Handles an MLDv1 Listener Query. Returns 0 on error, or the appropriate * (non-zero, unsigned) timer value to be set on success. */ static uint_t mld_query_in(mld_hdr_t *mldh, ill_t *ill) { ilm_t *ilm; int timer; uint_t next, current; in6_addr_t *v6group; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); /* * In the MLD specification, there are 3 states and a flag. * * In Non-Listener state, we simply don't have a membership record. * In Delaying state, our timer is running (ilm->ilm_timer < INFINITY) * In Idle Member state, our timer is not running (ilm->ilm_timer == * INFINITY) * * The flag is ilm->ilm_state, it is set to IGMP_OTHERMEMBER if * we have heard a report from another member, or IGMP_IREPORTEDLAST * if I sent the last report. */ v6group = &mldh->mld_addr; if (!(IN6_IS_ADDR_UNSPECIFIED(v6group)) && ((!IN6_IS_ADDR_MULTICAST(v6group)))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembBadQueries); return (0); } /* Need to do compatibility mode checking */ mutex_enter(&ill->ill_lock); ill->ill_mcast_v1_time = 0; ill->ill_mcast_v1_tset = 1; if (ill->ill_mcast_type == MLD_V2_ROUTER) { ip1dbg(("Received MLDv1 Query on %s, switching mode to " "MLD_V1_ROUTER\n", ill->ill_name)); atomic_add_16(&ill->ill_ifptr->illif_mcast_v1, 1); ill->ill_mcast_type = MLD_V1_ROUTER; } mutex_exit(&ill->ill_lock); timer = (int)ntohs(mldh->mld_maxdelay); if (ip_debug > 1) { (void) mi_strlog(ill->ill_rq, 1, SL_TRACE, "mld_input: TIMER = mld_maxdelay %d mld_type 0x%x", timer, (int)mldh->mld_type); } /* * -Start the timers in all of our membership records for * the physical interface on which the query arrived, * excl: * 1. those that belong to the "all hosts" group, * 2. those with 0 scope, or 1 node-local scope. 
* * -Restart any timer that is already running but has a value * longer that the requested timeout. * -Use the value specified in the query message as the * maximum timeout. */ next = INFINITY; mutex_enter(&ill->ill_lock); current = CURRENT_MSTIME; for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { ASSERT(!IN6_IS_ADDR_V4MAPPED(&ilm->ilm_v6addr)); if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr)) continue; if ((!IN6_ARE_ADDR_EQUAL(&ilm->ilm_v6addr, &ipv6_all_hosts_mcast)) && (IN6_IS_ADDR_UNSPECIFIED(v6group)) || (IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr))) { if (timer == 0) { /* Respond immediately */ ilm->ilm_timer = INFINITY; ilm->ilm_state = IGMP_IREPORTEDLAST; mutex_exit(&ill->ill_lock); mld_sendpkt(ilm, MLD_LISTENER_REPORT, NULL); mutex_enter(&ill->ill_lock); break; } if (ilm->ilm_timer > timer) { MCAST_RANDOM_DELAY(ilm->ilm_timer, timer); if (ilm->ilm_timer < next) next = ilm->ilm_timer; ilm->ilm_timer += current; } break; } } mutex_exit(&ill->ill_lock); return (next); } /* * Handles an MLDv2 Listener Query. On error, returns 0; on success, * returns the appropriate (non-zero, unsigned) timer value (which may * be INFINITY) to be set. 
*/ static uint_t mldv2_query_in(mld2q_t *mld2q, ill_t *ill, int mldlen) { ilm_t *ilm; in6_addr_t *v6group, *src_array; uint_t next, numsrc, i, mrd, delay, qqi, current; uint8_t qrv; v6group = &mld2q->mld2q_addr; numsrc = ntohs(mld2q->mld2q_numsrc); /* make sure numsrc matches packet size */ if (mldlen < MLD_V2_QUERY_MINLEN + (numsrc * sizeof (in6_addr_t))) { BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInErrors); return (0); } src_array = (in6_addr_t *)&mld2q[1]; BUMP_MIB(ill->ill_icmp6_mib, ipv6IfIcmpInGroupMembQueries); /* extract Maximum Response Delay from code in header */ mrd = ntohs(mld2q->mld2q_mxrc); if (mrd >= MLD_V2_MAXRT_FPMIN) { uint_t hdrval, mant, exp; hdrval = mrd; mant = hdrval & MLD_V2_MAXRT_MANT_MASK; exp = (hdrval & MLD_V2_MAXRT_EXP_MASK) >> 12; mrd = (mant | 0x1000) << (exp + 3); } if (mrd == 0) mrd = DSEC_TO_MSEC(MCAST_DEF_QUERY_RESP_INTERVAL); MCAST_RANDOM_DELAY(delay, mrd); next = (unsigned)INFINITY; current = CURRENT_MSTIME; if ((qrv = mld2q->mld2q_sqrv & MLD_V2_RV_MASK) == 0) ill->ill_mcast_rv = MCAST_DEF_ROBUSTNESS; else ill->ill_mcast_rv = qrv; if ((qqi = (uint_t)mld2q->mld2q_qqic) >= MLD_V2_QQI_FPMIN) { uint_t mant, exp; mant = qqi & MLD_V2_QQI_MANT_MASK; exp = (qqi & MLD_V2_QQI_EXP_MASK) >> 12; qqi = (mant | 0x10) << (exp + 3); } ill->ill_mcast_qi = (qqi == 0) ? MCAST_DEF_QUERY_INTERVAL : qqi; /* * If we have a pending general query response that's scheduled * sooner than the delay we calculated for this response, then * no action is required (MLDv2 draft section 6.2 rule 1) */ mutex_enter(&ill->ill_lock); if (ill->ill_global_timer < (current + delay)) { mutex_exit(&ill->ill_lock); return (next); } mutex_exit(&ill->ill_lock); /* * Now take action depending on query type: general, * group specific, or group/source specific. 
*/ if ((numsrc == 0) && IN6_IS_ADDR_UNSPECIFIED(v6group)) { /* * general query * We know global timer is either not running or is * greater than our calculated delay, so reset it to * our delay (random value in range [0, response time]) */ mutex_enter(&ill->ill_lock); ill->ill_global_timer = current + delay; mutex_exit(&ill->ill_lock); next = delay; } else { /* group or group/source specific query */ mutex_enter(&ill->ill_lock); for (ilm = ill->ill_ilm; ilm != NULL; ilm = ilm->ilm_next) { if (IN6_IS_ADDR_UNSPECIFIED(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_NODELOCAL(&ilm->ilm_v6addr) || IN6_IS_ADDR_MC_RESERVED(&ilm->ilm_v6addr) || !IN6_ARE_ADDR_EQUAL(v6group, &ilm->ilm_v6addr)) continue; /* * If the query is group specific or we have a * pending group specific query, the response is * group specific (pending sources list should be * empty). Otherwise, need to update the pending * sources list for the group and source specific * response. */ if (numsrc == 0 || (ilm->ilm_timer < INFINITY && SLIST_IS_EMPTY(ilm->ilm_pendsrcs))) { group_query: FREE_SLIST(ilm->ilm_pendsrcs); ilm->ilm_pendsrcs = NULL; } else { boolean_t overflow; slist_t *pktl; if (numsrc > MAX_FILTER_SIZE || (ilm->ilm_pendsrcs == NULL && (ilm->ilm_pendsrcs = l_alloc()) == NULL)) { /* * We've been sent more sources than * we can deal with; or we can't deal * with a source list at all. Revert * to a group specific query. */ goto group_query; } if ((pktl = l_alloc()) == NULL) goto group_query; pktl->sl_numsrc = numsrc; for (i = 0; i < numsrc; i++) pktl->sl_addr[i] = src_array[i]; l_union_in_a(ilm->ilm_pendsrcs, pktl, &overflow); l_free(pktl); if (overflow) goto group_query; } ilm->ilm_timer = (ilm->ilm_timer == INFINITY) ? 
INFINITY : (ilm->ilm_timer - current); /* set timer to soonest value */ ilm->ilm_timer = MIN(ilm->ilm_timer, delay); if (ilm->ilm_timer < next) next = ilm->ilm_timer; ilm->ilm_timer += current; break; } mutex_exit(&ill->ill_lock); } return (next); } /* * Send MLDv1 response packet with hoplimit 1 */ static void mld_sendpkt(ilm_t *ilm, uchar_t type, const in6_addr_t *v6addr) { mblk_t *mp; mld_hdr_t *mldh; ip6_t *ip6h; ip6_hbh_t *ip6hbh; struct ip6_opt_router *ip6router; size_t size = IPV6_HDR_LEN + sizeof (mld_hdr_t); ill_t *ill = ilm->ilm_ill; /* Will be the "lower" ill */ ipif_t *ipif; ip6i_t *ip6i; /* * We need to place a router alert option in this packet. The length * of the options must be a multiple of 8. The hbh option header is 2 * bytes followed by the 4 byte router alert option. That leaves * 2 bytes of pad for a total of 8 bytes. */ const int router_alert_length = 8; ASSERT(ill->ill_isv6); /* * We need to make sure that this packet does not get load balanced. * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and * ip_newroute_ipif_v6 knows how to handle such packets. * If it gets load balanced, switches supporting MLD snooping * (in the future) will send the packet that it receives for this * multicast group to the interface that we are sending on. As we have * joined the multicast group on this ill, by sending the packet out * on this ill, we receive all the packets back on this ill. */ size += sizeof (ip6i_t) + router_alert_length; mp = allocb(size, BPRI_HI); if (mp == NULL) return; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)&ip6i[1]; ip6hbh = (struct ip6_hbh *)&ip6h[1]; ip6router = (struct ip6_opt_router *)&ip6hbh[1]; /* * A zero is a pad option of length 1. 
The bzero of the whole packet * above will pad between ip6router and mld. */ mldh = (mld_hdr_t *)((uint8_t *)ip6hbh + router_alert_length); mldh->mld_type = type; mldh->mld_addr = ilm->ilm_v6addr; ip6router->ip6or_type = IP6OPT_ROUTER_ALERT; ip6router->ip6or_len = 2; ip6router->ip6or_value[0] = 0; ip6router->ip6or_value[1] = IP6_ALERT_MLD; ip6hbh->ip6h_nxt = IPPROTO_ICMPV6; ip6hbh->ip6h_len = 0; ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(sizeof (*mldh) + router_alert_length); ip6h->ip6_nxt = IPPROTO_HOPOPTS; ip6h->ip6_hops = MLD_HOP_LIMIT; if (v6addr == NULL) ip6h->ip6_dst = ilm->ilm_v6addr; else ip6h->ip6_dst = *v6addr; /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ if (ipif_lookup_zoneid(ill, ilm->ilm_zoneid, IPIF_UP, &ipif)) { ip6h->ip6_src = ipif->ipif_v6src_addr; ipif_refrele(ipif); } else { /* Otherwise, use IPv6 default address selection. */ ip6h->ip6_src = ipv6_all_zeros; } /* * Prepare for checksum by putting icmp length in the icmp * checksum field. The checksum is calculated in ip_wput_v6. */ mldh->mld_cksum = htons(sizeof (*mldh)); /* * ip_wput will automatically loopback the multicast packet to * the conn if multicast loopback is enabled. * The MIB stats corresponding to this outgoing MLD packet * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 * ->icmp_update_out_mib_v6 function call. */ (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); } /* * Sends an MLD_V2_LISTENER_REPORT message out the passed-in ill. The * report will contain one multicast address record for each element of * reclist. If this causes packet length to exceed ill->ill_max_frag, * multiple reports are sent. reclist is assumed to be made up of * buffers allocated by mcast_bldmrec(), and those buffers are freed here. 
*/ static void mldv2_sendrpt(ill_t *ill, mrec_t *reclist) { mblk_t *mp; mld2r_t *mld2r; mld2mar_t *mld2mar; in6_addr_t *srcarray; ip6_t *ip6h; ip6_hbh_t *ip6hbh; ip6i_t *ip6i; struct ip6_opt_router *ip6router; size_t size, optlen, padlen, icmpsize, rsize; ipif_t *ipif; int i, numrec, more_src_cnt; mrec_t *rp, *cur_reclist; mrec_t *next_reclist = reclist; boolean_t morepkts; /* If there aren't any records, there's nothing to send */ if (reclist == NULL) return; ASSERT(ill->ill_isv6); /* * Total option length (optlen + padlen) must be a multiple of * 8 bytes. We assume here that optlen <= 8, so the total option * length will be 8. Assert this in case anything ever changes. */ optlen = sizeof (ip6_hbh_t) + sizeof (struct ip6_opt_router); ASSERT(optlen <= 8); padlen = 8 - optlen; nextpkt: icmpsize = sizeof (mld2r_t); size = IPV6_HDR_LEN + optlen + padlen + icmpsize; morepkts = B_FALSE; more_src_cnt = 0; for (rp = cur_reclist = next_reclist, numrec = 0; rp != NULL; rp = rp->mrec_next, numrec++) { rsize = sizeof (mld2mar_t) + (rp->mrec_srcs.sl_numsrc * sizeof (in6_addr_t)); if (size + rsize > ill->ill_max_frag) { if (rp == cur_reclist) { /* * If the first mrec we looked at is too big * to fit in a single packet (i.e the source * list is too big), we must either truncate * the list (if TO_EX or IS_EX), or send * multiple reports for the same group (all * other types). */ int srcspace, srcsperpkt; srcspace = ill->ill_max_frag - (size + sizeof (mld2mar_t)); srcsperpkt = srcspace / sizeof (in6_addr_t); /* * Increment icmpsize and size, because we will * be sending a record for the mrec we're * looking at now. */ rsize = sizeof (mld2mar_t) + (srcsperpkt * sizeof (in6_addr_t)); icmpsize += rsize; size += rsize; if (rp->mrec_type == MODE_IS_EXCLUDE || rp->mrec_type == CHANGE_TO_EXCLUDE) { rp->mrec_srcs.sl_numsrc = srcsperpkt; if (rp->mrec_next == NULL) { /* no more packets to send */ break; } else { /* * more packets, but we're * done with this mrec. 
*/ next_reclist = rp->mrec_next; } } else { more_src_cnt = rp->mrec_srcs.sl_numsrc - srcsperpkt; rp->mrec_srcs.sl_numsrc = srcsperpkt; /* * We'll fix up this mrec (remove the * srcs we've already sent) before * returning to nextpkt above. */ next_reclist = rp; } } else { next_reclist = rp; } morepkts = B_TRUE; break; } icmpsize += rsize; size += rsize; } /* * We need to make sure that this packet does not get load balanced. * So, we allocate an ip6i_t and set ATTACH_IF. ip_wput_v6 and * ip_newroute_ipif_v6 know how to handle such packets. * If it gets load balanced, switches supporting MLD snooping * (in the future) will send the packet that it receives for this * multicast group to the interface that we are sending on. As we have * joined the multicast group on this ill, by sending the packet out * on this ill, we receive all the packets back on this ill. */ size += sizeof (ip6i_t); mp = allocb(size, BPRI_HI); if (mp == NULL) goto free_reclist; bzero(mp->b_rptr, size); mp->b_wptr = mp->b_rptr + size; ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; ip6i->ip6i_flags = IP6I_ATTACH_IF; ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)&(ip6i[1]); ip6hbh = (ip6_hbh_t *)&(ip6h[1]); ip6router = (struct ip6_opt_router *)&(ip6hbh[1]); mld2r = (mld2r_t *)((uint8_t *)ip6hbh + optlen + padlen); mld2mar = (mld2mar_t *)&(mld2r[1]); ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(optlen + padlen + icmpsize); ip6h->ip6_nxt = IPPROTO_HOPOPTS; ip6h->ip6_hops = MLD_HOP_LIMIT; ip6h->ip6_dst = ipv6_all_v2rtrs_mcast; /* ipif returned by ipif_lookup_zoneid is link-local (if present) */ if (ipif_lookup_zoneid(ill, ALL_ZONES, IPIF_UP, &ipif)) { ip6h->ip6_src = ipif->ipif_v6src_addr; ipif_refrele(ipif); } else { /* otherwise, use IPv6 default address selection. 
*/ ip6h->ip6_src = ipv6_all_zeros; } ip6hbh->ip6h_nxt = IPPROTO_ICMPV6; /* * ip6h_len is the number of 8-byte words, not including the first * 8 bytes; we've assumed optlen + padlen == 8 bytes; hence len = 0. */ ip6hbh->ip6h_len = 0; ip6router->ip6or_type = IP6OPT_ROUTER_ALERT; ip6router->ip6or_len = 2; ip6router->ip6or_value[0] = 0; ip6router->ip6or_value[1] = IP6_ALERT_MLD; mld2r->mld2r_type = MLD_V2_LISTENER_REPORT; mld2r->mld2r_nummar = htons(numrec); /* * Prepare for the checksum by putting icmp length in the icmp * checksum field. The checksum is calculated in ip_wput_v6. */ mld2r->mld2r_cksum = htons(icmpsize); for (rp = cur_reclist; rp != NULL; rp = rp->mrec_next) { mld2mar->mld2mar_type = rp->mrec_type; mld2mar->mld2mar_auxlen = 0; mld2mar->mld2mar_numsrc = htons(rp->mrec_srcs.sl_numsrc); mld2mar->mld2mar_group = rp->mrec_group; srcarray = (in6_addr_t *)&(mld2mar[1]); for (i = 0; i < rp->mrec_srcs.sl_numsrc; i++) srcarray[i] = rp->mrec_srcs.sl_addr[i]; mld2mar = (mld2mar_t *)&(srcarray[i]); } /* * ip_wput will automatically loopback the multicast packet to * the conn if multicast loopback is enabled. * The MIB stats corresponding to this outgoing MLD packet * will be accounted for in ip_wput->ip_wput_v6->ip_wput_ire_v6 * ->icmp_update_out_mib_v6 function call. 
*/ (void) ip_output_v6(NULL, mp, ill->ill_wq, IP_WPUT); if (morepkts) { if (more_src_cnt > 0) { int index, mvsize; slist_t *sl = &next_reclist->mrec_srcs; index = sl->sl_numsrc; mvsize = more_src_cnt * sizeof (in6_addr_t); (void) memmove(&sl->sl_addr[0], &sl->sl_addr[index], mvsize); sl->sl_numsrc = more_src_cnt; } goto nextpkt; } free_reclist: while (reclist != NULL) { rp = reclist->mrec_next; mi_free(reclist); reclist = rp; } } static mrec_t * mcast_bldmrec(mcast_record_t type, in6_addr_t *grp, slist_t *srclist, mrec_t *next) { mrec_t *rp; int i; if ((type == ALLOW_NEW_SOURCES || type == BLOCK_OLD_SOURCES) && SLIST_IS_EMPTY(srclist)) return (next); rp = (mrec_t *)mi_alloc(sizeof (mrec_t), BPRI_HI); if (rp == NULL) return (next); rp->mrec_next = next; rp->mrec_type = type; rp->mrec_auxlen = 0; rp->mrec_group = *grp; if (srclist == NULL) { rp->mrec_srcs.sl_numsrc = 0; } else { rp->mrec_srcs.sl_numsrc = srclist->sl_numsrc; for (i = 0; i < srclist->sl_numsrc; i++) rp->mrec_srcs.sl_addr[i] = srclist->sl_addr[i]; } return (rp); } /* * Set up initial retransmit state. If memory cannot be allocated for * the source lists, simply create as much state as is possible; memory * allocation failures are considered one type of transient error that * the retransmissions are designed to overcome (and if they aren't * transient, there are bigger problems than failing to notify the * router about multicast group membership state changes). 
*/
static void
mcast_init_rtx(ill_t *ill, rtx_state_t *rtxp, mcast_record_t rtype,
    slist_t *flist)
{
	/*
	 * rtype can only be one of three things:
	 *
	 *	ALLOW_NEW_SOURCES
	 *	    new join, going from INCLUDE {} to INCLUDE {flist}
	 *	CHANGE_TO_EXCLUDE
	 *	    new join, going from INCLUDE {} to EXCLUDE {flist}
	 *	CHANGE_TO_INCLUDE or CHANGE_TO_EXCLUDE
	 *	    a state change that involves a filter mode change
	 */
	ASSERT(rtype == CHANGE_TO_EXCLUDE || rtype == CHANGE_TO_INCLUDE ||
	    rtype == ALLOW_NEW_SOURCES);

	/* Every report is retransmitted robustness-variable times. */
	rtxp->rtx_cnt = ill->ill_mcast_rv;

	if (rtype == CHANGE_TO_EXCLUDE) {
		/* flist becomes the block list; nothing is allowed. */
		rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
		CLEAR_SLIST(rtxp->rtx_allow);
		COPY_SLIST(flist, rtxp->rtx_block);
	} else if (rtype == ALLOW_NEW_SOURCES || rtype == CHANGE_TO_INCLUDE) {
		/*
		 * flist becomes the allow list; nothing is blocked.  Only
		 * a real filter mode change arms the filter mode counter.
		 */
		if (rtype == ALLOW_NEW_SOURCES)
			rtxp->rtx_fmode_cnt = 0;
		else
			rtxp->rtx_fmode_cnt = ill->ill_mcast_rv;
		CLEAR_SLIST(rtxp->rtx_block);
		COPY_SLIST(flist, rtxp->rtx_allow);
	}
}

/*
 * The basic strategy here, as extrapolated from RFC 3810 section 6.1 and
 * RFC 3376 section 5.1, covers three cases:
 *
 * The current state change is a filter mode change
 *	Set filter mode retransmit counter; set retransmit allow or
 *	block list to new source list as appropriate, and clear the
 *	retransmit list that was not set; send TO_IN or TO_EX with
 *	new source list.
 *
 * The current state change is a source list change, but the filter
 * mode retransmit counter is > 0
 *	Decrement filter mode retransmit counter; set retransmit
 *	allow or block list to new source list as appropriate,
 *	and clear the retransmit list that was not set; send TO_IN
 *	or TO_EX with new source list.
 *
 * The current state change is a source list change, and the filter
 * mode retransmit counter is 0.
 *	Merge existing rtx allow and block lists with new state:
 *	  rtx_allow = (new allow + rtx_allow) - new block
 *	  rtx_block = (new block + rtx_block) - new allow
 *	Send ALLOW and BLOCK records for new retransmit lists;
 *	decrement retransmit counter.
*
 * As is the case for mcast_init_rtx(), memory allocation failures are
 * acceptable; we just create as much state as we can.
 */
static mrec_t *
mcast_merge_rtx(ilm_t *ilm, mrec_t *mreclist, slist_t *flist)
{
	rtx_state_t	*rtxp = &ilm->ilm_rtx;
	ill_t		*ill;
	mrec_t		*cur, *following;
	mrec_t		*allow_mrec = NULL;
	mrec_t		*block_mrec = NULL;
	mrec_t		*result;
	boolean_t	ovf;

	/* Use the ilm's own ill if it has one, else its ipif's ill. */
	if (ilm->ilm_ill != NULL)
		ill = ilm->ilm_ill;
	else
		ill = ilm->ilm_ipif->ipif_ill;

	/* No records means no state change to merge. */
	if (mreclist == NULL)
		return (NULL);

	/*
	 * A lone TO_IN or TO_EX record signals a filter mode change.
	 * Retransmit state is then set up exactly as for an initial join
	 * and the record list is handed back untouched.
	 */
	switch (mreclist->mrec_type) {
	case CHANGE_TO_INCLUDE:
	case CHANGE_TO_EXCLUDE:
		mcast_init_rtx(ill, rtxp, mreclist->mrec_type,
		    &mreclist->mrec_srcs);
		return (mreclist);
	default:
		break;
	}

	/* From here on only the source list has changed. */
	rtxp->rtx_cnt = ill->ill_mcast_rv;

	if (rtxp->rtx_fmode_cnt > 0) {
		mcast_record_t txtype;

		/*
		 * Filter mode change reports are still being sent, so
		 * this change is reported as one more of those, carrying
		 * the complete new filter list.
		 */
		rtxp->rtx_fmode_cnt--;
		if (ilm->ilm_fmode == MODE_IS_INCLUDE) {
			CLEAR_SLIST(rtxp->rtx_block);
			COPY_SLIST(flist, rtxp->rtx_allow);
			txtype = CHANGE_TO_INCLUDE;
		} else {
			CLEAR_SLIST(rtxp->rtx_allow);
			COPY_SLIST(flist, rtxp->rtx_block);
			txtype = CHANGE_TO_EXCLUDE;
		}

		/* Recycle the head record for the new report ... */
		mreclist->mrec_type = txtype;
		l_copy(flist, &mreclist->mrec_srcs);

		/* ... and discard everything chained behind it. */
		cur = mreclist->mrec_next;
		while (cur != NULL) {
			following = cur->mrec_next;
			mi_free(cur);
			cur = following;
		}
		mreclist->mrec_next = NULL;

		return (mreclist);
	}

	/*
	 * Plain source change reports.  The ALLOW and BLOCK lists have to
	 * be recomputed from the previous retransmit state plus the new
	 * changes.  Start by locating the ALLOW and BLOCK records.
	 */
	result = mreclist;
	for (cur = mreclist; cur != NULL; cur = cur->mrec_next) {
		ASSERT(cur->mrec_type == ALLOW_NEW_SOURCES ||
		    cur->mrec_type == BLOCK_OLD_SOURCES);
		if (cur->mrec_type == ALLOW_NEW_SOURCES)
			allow_mrec = cur;
		else
			block_mrec = cur;
	}

	/*
	 * The target is:
	 *	new_allow = mrec_allow + (rtx_allow - mrec_block)
	 *	new_block = mrec_block + (rtx_block - mrec_allow)
	 *
	 * Each line takes two steps, for instance:
	 *	rtx_allow = rtx_allow - mrec_block;
	 *	new_allow = mrec_allow + rtx_allow;
	 *
	 * Results are built in the mrec lists first and copied into the
	 * rtx lists afterwards.  That order matters because an rtx list
	 * may not have been allocated yet; if that allocation fails it is
	 * acceptable, and so are overflows.
	 */
	if (block_mrec != NULL) {
		l_difference_in_a(rtxp->rtx_allow, &block_mrec->mrec_srcs);
	}
	if (allow_mrec != NULL) {
		l_difference_in_a(rtxp->rtx_block, &allow_mrec->mrec_srcs);
		l_union_in_a(&allow_mrec->mrec_srcs, rtxp->rtx_allow, &ovf);
	}

	if (block_mrec == NULL) {
		/* no BLOCK record arrived; build one from rtx_block */
		result = mcast_bldmrec(BLOCK_OLD_SOURCES, &ilm->ilm_v6addr,
		    rtxp->rtx_block, allow_mrec);
	} else {
		l_union_in_a(&block_mrec->mrec_srcs, rtxp->rtx_block, &ovf);
		COPY_SLIST(&block_mrec->mrec_srcs, rtxp->rtx_block);
	}

	if (allow_mrec == NULL) {
		/* no ALLOW record arrived; build one from rtx_allow */
		result = mcast_bldmrec(ALLOW_NEW_SOURCES, &ilm->ilm_v6addr,
		    rtxp->rtx_allow, block_mrec);
	} else {
		COPY_SLIST(&allow_mrec->mrec_srcs, rtxp->rtx_allow);
	}

	return (result);
}