xref: /freebsd/sys/netinet/in_mcast.c (revision 5608fd23c27fa1e8ee595d7b678cbfd35d657fbe)
1 /*-
2  * Copyright (c) 2007-2009 Bruce Simpson.
3  * Copyright (c) 2005 Robert N. M. Watson.
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  * 3. The name of the author may not be used to endorse or promote
15  *    products derived from this software without specific prior written
16  *    permission.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 /*
32  * IPv4 multicast socket, group, and socket option processing module.
33  */
34 
35 #include <sys/cdefs.h>
36 __FBSDID("$FreeBSD$");
37 
38 #include <sys/param.h>
39 #include <sys/systm.h>
40 #include <sys/kernel.h>
41 #include <sys/malloc.h>
42 #include <sys/mbuf.h>
43 #include <sys/protosw.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/protosw.h>
47 #include <sys/sysctl.h>
48 #include <sys/ktr.h>
49 #include <sys/taskqueue.h>
50 #include <sys/tree.h>
51 
52 #include <net/if.h>
53 #include <net/if_var.h>
54 #include <net/if_dl.h>
55 #include <net/route.h>
56 #include <net/vnet.h>
57 
58 #include <netinet/in.h>
59 #include <netinet/in_systm.h>
60 #include <netinet/in_pcb.h>
61 #include <netinet/in_var.h>
62 #include <netinet/ip_var.h>
63 #include <netinet/igmp_var.h>
64 
65 #ifndef KTR_IGMPV3
66 #define KTR_IGMPV3 KTR_INET
67 #endif
68 
69 #ifndef __SOCKUNION_DECLARED
70 union sockunion {
71 	struct sockaddr_storage	ss;
72 	struct sockaddr		sa;
73 	struct sockaddr_dl	sdl;
74 	struct sockaddr_in	sin;
75 };
76 typedef union sockunion sockunion_t;
77 #define __SOCKUNION_DECLARED
78 #endif /* __SOCKUNION_DECLARED */
79 
80 static MALLOC_DEFINE(M_INMFILTER, "in_mfilter",
81     "IPv4 multicast PCB-layer source filter");
82 static MALLOC_DEFINE(M_IPMADDR, "in_multi", "IPv4 multicast group");
83 static MALLOC_DEFINE(M_IPMOPTS, "ip_moptions", "IPv4 multicast options");
84 static MALLOC_DEFINE(M_IPMSOURCE, "ip_msource",
85     "IPv4 multicast IGMP-layer source filter");
86 
87 /*
88  * Locking:
89  * - Lock order is: Giant, INP_WLOCK, IN_MULTI_LOCK, IGMP_LOCK, IF_ADDR_LOCK.
90  * - The IF_ADDR_LOCK is implicitly taken by inm_lookup() earlier, however
91  *   it can be taken by code in net/if.c also.
92  * - ip_moptions and in_mfilter are covered by the INP_WLOCK.
93  *
94  * struct in_multi is covered by IN_MULTI_LOCK. There isn't strictly
95  * any need for in_multi itself to be virtualized -- it is bound to an ifp
96  * anyway no matter what happens.
97  */
98 struct mtx in_multi_mtx;
99 MTX_SYSINIT(in_multi_mtx, &in_multi_mtx, "in_multi_mtx", MTX_DEF);
100 
101 /*
102  * Functions with non-static linkage defined in this file should be
103  * declared in in_var.h:
104  *  imo_multi_filter()
105  *  in_addmulti()
106  *  in_delmulti()
107  *  in_joingroup()
108  *  in_joingroup_locked()
109  *  in_leavegroup()
110  *  in_leavegroup_locked()
111  * and ip_var.h:
112  *  inp_freemoptions()
113  *  inp_getmoptions()
114  *  inp_setmoptions()
115  *
116  * XXX: Both carp and pf need to use the legacy (*,G) KPIs in_addmulti()
117  * and in_delmulti().
118  */
119 static void	imf_commit(struct in_mfilter *);
120 static int	imf_get_source(struct in_mfilter *imf,
121 		    const struct sockaddr_in *psin,
122 		    struct in_msource **);
123 static struct in_msource *
124 		imf_graft(struct in_mfilter *, const uint8_t,
125 		    const struct sockaddr_in *);
126 static void	imf_leave(struct in_mfilter *);
127 static int	imf_prune(struct in_mfilter *, const struct sockaddr_in *);
128 static void	imf_purge(struct in_mfilter *);
129 static void	imf_rollback(struct in_mfilter *);
130 static void	imf_reap(struct in_mfilter *);
131 static int	imo_grow(struct ip_moptions *);
132 static size_t	imo_match_group(const struct ip_moptions *,
133 		    const struct ifnet *, const struct sockaddr *);
134 static struct in_msource *
135 		imo_match_source(const struct ip_moptions *, const size_t,
136 		    const struct sockaddr *);
137 static void	ims_merge(struct ip_msource *ims,
138 		    const struct in_msource *lims, const int rollback);
139 static int	in_getmulti(struct ifnet *, const struct in_addr *,
140 		    struct in_multi **);
141 static int	inm_get_source(struct in_multi *inm, const in_addr_t haddr,
142 		    const int noalloc, struct ip_msource **pims);
143 #ifdef KTR
144 static int	inm_is_ifp_detached(const struct in_multi *);
145 #endif
146 static int	inm_merge(struct in_multi *, /*const*/ struct in_mfilter *);
147 static void	inm_purge(struct in_multi *);
148 static void	inm_reap(struct in_multi *);
149 static struct ip_moptions *
150 		inp_findmoptions(struct inpcb *);
151 static void	inp_freemoptions_internal(struct ip_moptions *);
152 static void	inp_gcmoptions(void *, int);
153 static int	inp_get_source_filters(struct inpcb *, struct sockopt *);
154 static int	inp_join_group(struct inpcb *, struct sockopt *);
155 static int	inp_leave_group(struct inpcb *, struct sockopt *);
156 static struct ifnet *
157 		inp_lookup_mcast_ifp(const struct inpcb *,
158 		    const struct sockaddr_in *, const struct in_addr);
159 static int	inp_block_unblock_source(struct inpcb *, struct sockopt *);
160 static int	inp_set_multicast_if(struct inpcb *, struct sockopt *);
161 static int	inp_set_source_filters(struct inpcb *, struct sockopt *);
162 static int	sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS);
163 
164 static SYSCTL_NODE(_net_inet_ip, OID_AUTO, mcast, CTLFLAG_RW, 0,
165     "IPv4 multicast");
166 
167 static u_long in_mcast_maxgrpsrc = IP_MAX_GROUP_SRC_FILTER;
168 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxgrpsrc,
169     CTLFLAG_RWTUN, &in_mcast_maxgrpsrc, 0,
170     "Max source filters per group");
171 
172 static u_long in_mcast_maxsocksrc = IP_MAX_SOCK_SRC_FILTER;
173 SYSCTL_ULONG(_net_inet_ip_mcast, OID_AUTO, maxsocksrc,
174     CTLFLAG_RWTUN, &in_mcast_maxsocksrc, 0,
175     "Max source filters per socket");
176 
177 int in_mcast_loop = IP_DEFAULT_MULTICAST_LOOP;
178 SYSCTL_INT(_net_inet_ip_mcast, OID_AUTO, loop, CTLFLAG_RWTUN,
179     &in_mcast_loop, 0, "Loopback multicast datagrams by default");
180 
181 static SYSCTL_NODE(_net_inet_ip_mcast, OID_AUTO, filters,
182     CTLFLAG_RD | CTLFLAG_MPSAFE, sysctl_ip_mcast_filters,
183     "Per-interface stack-wide source filters");
184 
185 static STAILQ_HEAD(, ip_moptions) imo_gc_list =
186     STAILQ_HEAD_INITIALIZER(imo_gc_list);
187 static struct task imo_gc_task = TASK_INITIALIZER(0, inp_gcmoptions, NULL);
188 
189 #ifdef KTR
190 /*
191  * Inline function which wraps assertions for a valid ifp.
192  * The ifnet layer will set the ifma's ifp pointer to NULL if the ifp
193  * is detached.
194  */
195 static int __inline
196 inm_is_ifp_detached(const struct in_multi *inm)
197 {
198 	struct ifnet *ifp;
199 
200 	KASSERT(inm->inm_ifma != NULL, ("%s: no ifma", __func__));
201 	ifp = inm->inm_ifma->ifma_ifp;
202 	if (ifp != NULL) {
203 		/*
204 		 * Sanity check that netinet's notion of ifp is the
205 		 * same as net's.
206 		 */
207 		KASSERT(inm->inm_ifp == ifp, ("%s: bad ifp", __func__));
208 	}
209 
210 	return (ifp == NULL);
211 }
212 #endif
213 
214 /*
215  * Initialize an in_mfilter structure to a known state at t0, t1
216  * with an empty source filter list.
217  */
218 static __inline void
219 imf_init(struct in_mfilter *imf, const int st0, const int st1)
220 {
221 	memset(imf, 0, sizeof(struct in_mfilter));
222 	RB_INIT(&imf->imf_sources);
223 	imf->imf_st[0] = st0;
224 	imf->imf_st[1] = st1;
225 }
226 
227 /*
228  * Function for looking up an in_multi record for an IPv4 multicast address
229  * on a given interface. ifp must be valid. If no record found, return NULL.
230  * The IN_MULTI_LOCK and IF_ADDR_LOCK on ifp must be held.
231  */
232 struct in_multi *
233 inm_lookup_locked(struct ifnet *ifp, const struct in_addr ina)
234 {
235 	struct ifmultiaddr *ifma;
236 	struct in_multi *inm;
237 
238 	IN_MULTI_LOCK_ASSERT();
239 	IF_ADDR_LOCK_ASSERT(ifp);
240 
241 	inm = NULL;
242 	TAILQ_FOREACH(ifma, &((ifp)->if_multiaddrs), ifma_link) {
243 		if (ifma->ifma_addr->sa_family == AF_INET) {
244 			inm = (struct in_multi *)ifma->ifma_protospec;
245 			if (inm->inm_addr.s_addr == ina.s_addr)
246 				break;
247 			inm = NULL;
248 		}
249 	}
250 	return (inm);
251 }
252 
253 /*
254  * Wrapper for inm_lookup_locked().
255  * The IF_ADDR_LOCK will be taken on ifp and released on return.
256  */
257 struct in_multi *
258 inm_lookup(struct ifnet *ifp, const struct in_addr ina)
259 {
260 	struct in_multi *inm;
261 
262 	IN_MULTI_LOCK_ASSERT();
263 	IF_ADDR_RLOCK(ifp);
264 	inm = inm_lookup_locked(ifp, ina);
265 	IF_ADDR_RUNLOCK(ifp);
266 
267 	return (inm);
268 }
269 
270 /*
271  * Resize the ip_moptions vector to the next power-of-two minus 1.
272  * May be called with locks held; do not sleep.
273  */
274 static int
275 imo_grow(struct ip_moptions *imo)
276 {
277 	struct in_multi		**nmships;
278 	struct in_multi		**omships;
279 	struct in_mfilter	 *nmfilters;
280 	struct in_mfilter	 *omfilters;
281 	size_t			  idx;
282 	size_t			  newmax;
283 	size_t			  oldmax;
284 
285 	nmships = NULL;
286 	nmfilters = NULL;
287 	omships = imo->imo_membership;
288 	omfilters = imo->imo_mfilters;
289 	oldmax = imo->imo_max_memberships;
290 	newmax = ((oldmax + 1) * 2) - 1;
291 
292 	if (newmax <= IP_MAX_MEMBERSHIPS) {
293 		nmships = (struct in_multi **)realloc(omships,
294 		    sizeof(struct in_multi *) * newmax, M_IPMOPTS, M_NOWAIT);
295 		nmfilters = (struct in_mfilter *)realloc(omfilters,
296 		    sizeof(struct in_mfilter) * newmax, M_INMFILTER, M_NOWAIT);
297 		if (nmships != NULL && nmfilters != NULL) {
298 			/* Initialize newly allocated source filter heads. */
299 			for (idx = oldmax; idx < newmax; idx++) {
300 				imf_init(&nmfilters[idx], MCAST_UNDEFINED,
301 				    MCAST_EXCLUDE);
302 			}
303 			imo->imo_max_memberships = newmax;
304 			imo->imo_membership = nmships;
305 			imo->imo_mfilters = nmfilters;
306 		}
307 	}
308 
309 	if (nmships == NULL || nmfilters == NULL) {
310 		if (nmships != NULL)
311 			free(nmships, M_IPMOPTS);
312 		if (nmfilters != NULL)
313 			free(nmfilters, M_INMFILTER);
314 		return (ETOOMANYREFS);
315 	}
316 
317 	return (0);
318 }
319 
320 /*
321  * Find an IPv4 multicast group entry for this ip_moptions instance
322  * which matches the specified group, and optionally an interface.
323  * Return its index into the array, or -1 if not found.
324  */
325 static size_t
326 imo_match_group(const struct ip_moptions *imo, const struct ifnet *ifp,
327     const struct sockaddr *group)
328 {
329 	const struct sockaddr_in *gsin;
330 	struct in_multi	**pinm;
331 	int		  idx;
332 	int		  nmships;
333 
334 	gsin = (const struct sockaddr_in *)group;
335 
336 	/* The imo_membership array may be lazy allocated. */
337 	if (imo->imo_membership == NULL || imo->imo_num_memberships == 0)
338 		return (-1);
339 
340 	nmships = imo->imo_num_memberships;
341 	pinm = &imo->imo_membership[0];
342 	for (idx = 0; idx < nmships; idx++, pinm++) {
343 		if (*pinm == NULL)
344 			continue;
345 		if ((ifp == NULL || ((*pinm)->inm_ifp == ifp)) &&
346 		    in_hosteq((*pinm)->inm_addr, gsin->sin_addr)) {
347 			break;
348 		}
349 	}
350 	if (idx >= nmships)
351 		idx = -1;
352 
353 	return (idx);
354 }
355 
356 /*
357  * Find an IPv4 multicast source entry for this imo which matches
358  * the given group index for this socket, and source address.
359  *
360  * NOTE: This does not check if the entry is in-mode, merely if
361  * it exists, which may not be the desired behaviour.
362  */
363 static struct in_msource *
364 imo_match_source(const struct ip_moptions *imo, const size_t gidx,
365     const struct sockaddr *src)
366 {
367 	struct ip_msource	 find;
368 	struct in_mfilter	*imf;
369 	struct ip_msource	*ims;
370 	const sockunion_t	*psa;
371 
372 	KASSERT(src->sa_family == AF_INET, ("%s: !AF_INET", __func__));
373 	KASSERT(gidx != -1 && gidx < imo->imo_num_memberships,
374 	    ("%s: invalid index %d\n", __func__, (int)gidx));
375 
376 	/* The imo_mfilters array may be lazy allocated. */
377 	if (imo->imo_mfilters == NULL)
378 		return (NULL);
379 	imf = &imo->imo_mfilters[gidx];
380 
381 	/* Source trees are keyed in host byte order. */
382 	psa = (const sockunion_t *)src;
383 	find.ims_haddr = ntohl(psa->sin.sin_addr.s_addr);
384 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
385 
386 	return ((struct in_msource *)ims);
387 }
388 
389 /*
390  * Perform filtering for multicast datagrams on a socket by group and source.
391  *
392  * Returns 0 if a datagram should be allowed through, or various error codes
393  * if the socket was not a member of the group, or the source was muted, etc.
394  */
395 int
396 imo_multi_filter(const struct ip_moptions *imo, const struct ifnet *ifp,
397     const struct sockaddr *group, const struct sockaddr *src)
398 {
399 	size_t gidx;
400 	struct in_msource *ims;
401 	int mode;
402 
403 	KASSERT(ifp != NULL, ("%s: null ifp", __func__));
404 
405 	gidx = imo_match_group(imo, ifp, group);
406 	if (gidx == -1)
407 		return (MCAST_NOTGMEMBER);
408 
409 	/*
410 	 * Check if the source was included in an (S,G) join.
411 	 * Allow reception on exclusive memberships by default,
412 	 * reject reception on inclusive memberships by default.
413 	 * Exclude source only if an in-mode exclude filter exists.
414 	 * Include source only if an in-mode include filter exists.
415 	 * NOTE: We are comparing group state here at IGMP t1 (now)
416 	 * with socket-layer t0 (since last downcall).
417 	 */
418 	mode = imo->imo_mfilters[gidx].imf_st[1];
419 	ims = imo_match_source(imo, gidx, src);
420 
421 	if ((ims == NULL && mode == MCAST_INCLUDE) ||
422 	    (ims != NULL && ims->imsl_st[0] != mode))
423 		return (MCAST_NOTSMEMBER);
424 
425 	return (MCAST_PASS);
426 }
427 
428 /*
429  * Find and return a reference to an in_multi record for (ifp, group),
430  * and bump its reference count.
431  * If one does not exist, try to allocate it, and update link-layer multicast
432  * filters on ifp to listen for group.
433  * Assumes the IN_MULTI lock is held across the call.
434  * Return 0 if successful, otherwise return an appropriate error code.
435  */
436 static int
437 in_getmulti(struct ifnet *ifp, const struct in_addr *group,
438     struct in_multi **pinm)
439 {
440 	struct sockaddr_in	 gsin;
441 	struct ifmultiaddr	*ifma;
442 	struct in_ifinfo	*ii;
443 	struct in_multi		*inm;
444 	int error;
445 
446 	IN_MULTI_LOCK_ASSERT();
447 
448 	ii = (struct in_ifinfo *)ifp->if_afdata[AF_INET];
449 
450 	inm = inm_lookup(ifp, *group);
451 	if (inm != NULL) {
452 		/*
453 		 * If we already joined this group, just bump the
454 		 * refcount and return it.
455 		 */
456 		KASSERT(inm->inm_refcount >= 1,
457 		    ("%s: bad refcount %d", __func__, inm->inm_refcount));
458 		++inm->inm_refcount;
459 		*pinm = inm;
460 		return (0);
461 	}
462 
463 	memset(&gsin, 0, sizeof(gsin));
464 	gsin.sin_family = AF_INET;
465 	gsin.sin_len = sizeof(struct sockaddr_in);
466 	gsin.sin_addr = *group;
467 
468 	/*
469 	 * Check if a link-layer group is already associated
470 	 * with this network-layer group on the given ifnet.
471 	 */
472 	error = if_addmulti(ifp, (struct sockaddr *)&gsin, &ifma);
473 	if (error != 0)
474 		return (error);
475 
476 	/* XXX ifma_protospec must be covered by IF_ADDR_LOCK */
477 	IF_ADDR_WLOCK(ifp);
478 
479 	/*
480 	 * If something other than netinet is occupying the link-layer
481 	 * group, print a meaningful error message and back out of
482 	 * the allocation.
483 	 * Otherwise, bump the refcount on the existing network-layer
484 	 * group association and return it.
485 	 */
486 	if (ifma->ifma_protospec != NULL) {
487 		inm = (struct in_multi *)ifma->ifma_protospec;
488 #ifdef INVARIANTS
489 		KASSERT(ifma->ifma_addr != NULL, ("%s: no ifma_addr",
490 		    __func__));
491 		KASSERT(ifma->ifma_addr->sa_family == AF_INET,
492 		    ("%s: ifma not AF_INET", __func__));
493 		KASSERT(inm != NULL, ("%s: no ifma_protospec", __func__));
494 		if (inm->inm_ifma != ifma || inm->inm_ifp != ifp ||
495 		    !in_hosteq(inm->inm_addr, *group))
496 			panic("%s: ifma %p is inconsistent with %p (%s)",
497 			    __func__, ifma, inm, inet_ntoa(*group));
498 #endif
499 		++inm->inm_refcount;
500 		*pinm = inm;
501 		IF_ADDR_WUNLOCK(ifp);
502 		return (0);
503 	}
504 
505 	IF_ADDR_WLOCK_ASSERT(ifp);
506 
507 	/*
508 	 * A new in_multi record is needed; allocate and initialize it.
509 	 * We DO NOT perform an IGMP join as the in_ layer may need to
510 	 * push an initial source list down to IGMP to support SSM.
511 	 *
512 	 * The initial source filter state is INCLUDE, {} as per the RFC.
513 	 */
514 	inm = malloc(sizeof(*inm), M_IPMADDR, M_NOWAIT | M_ZERO);
515 	if (inm == NULL) {
516 		if_delmulti_ifma(ifma);
517 		IF_ADDR_WUNLOCK(ifp);
518 		return (ENOMEM);
519 	}
520 	inm->inm_addr = *group;
521 	inm->inm_ifp = ifp;
522 	inm->inm_igi = ii->ii_igmp;
523 	inm->inm_ifma = ifma;
524 	inm->inm_refcount = 1;
525 	inm->inm_state = IGMP_NOT_MEMBER;
526 
527 	/*
528 	 * Pending state-changes per group are subject to a bounds check.
529 	 */
530 	IFQ_SET_MAXLEN(&inm->inm_scq, IGMP_MAX_STATE_CHANGES);
531 
532 	inm->inm_st[0].iss_fmode = MCAST_UNDEFINED;
533 	inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
534 	RB_INIT(&inm->inm_srcs);
535 
536 	ifma->ifma_protospec = inm;
537 
538 	*pinm = inm;
539 
540 	IF_ADDR_WUNLOCK(ifp);
541 	return (0);
542 }
543 
544 /*
545  * Drop a reference to an in_multi record.
546  *
547  * If the refcount drops to 0, free the in_multi record and
548  * delete the underlying link-layer membership.
549  */
550 void
551 inm_release_locked(struct in_multi *inm)
552 {
553 	struct ifmultiaddr *ifma;
554 
555 	IN_MULTI_LOCK_ASSERT();
556 
557 	CTR2(KTR_IGMPV3, "%s: refcount is %d", __func__, inm->inm_refcount);
558 
559 	if (--inm->inm_refcount > 0) {
560 		CTR2(KTR_IGMPV3, "%s: refcount is now %d", __func__,
561 		    inm->inm_refcount);
562 		return;
563 	}
564 
565 	CTR2(KTR_IGMPV3, "%s: freeing inm %p", __func__, inm);
566 
567 	ifma = inm->inm_ifma;
568 
569 	/* XXX this access is not covered by IF_ADDR_LOCK */
570 	CTR2(KTR_IGMPV3, "%s: purging ifma %p", __func__, ifma);
571 	KASSERT(ifma->ifma_protospec == inm,
572 	    ("%s: ifma_protospec != inm", __func__));
573 	ifma->ifma_protospec = NULL;
574 
575 	inm_purge(inm);
576 
577 	free(inm, M_IPMADDR);
578 
579 	if_delmulti_ifma(ifma);
580 }
581 
582 /*
583  * Clear recorded source entries for a group.
584  * Used by the IGMP code. Caller must hold the IN_MULTI lock.
585  * FIXME: Should reap.
586  */
587 void
588 inm_clear_recorded(struct in_multi *inm)
589 {
590 	struct ip_msource	*ims;
591 
592 	IN_MULTI_LOCK_ASSERT();
593 
594 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
595 		if (ims->ims_stp) {
596 			ims->ims_stp = 0;
597 			--inm->inm_st[1].iss_rec;
598 		}
599 	}
600 	KASSERT(inm->inm_st[1].iss_rec == 0,
601 	    ("%s: iss_rec %d not 0", __func__, inm->inm_st[1].iss_rec));
602 }
603 
604 /*
605  * Record a source as pending for a Source-Group IGMPv3 query.
606  * This lives here as it modifies the shared tree.
607  *
608  * inm is the group descriptor.
609  * naddr is the address of the source to record in network-byte order.
610  *
611  * If the net.inet.igmp.sgalloc sysctl is non-zero, we will
612  * lazy-allocate a source node in response to an SG query.
613  * Otherwise, no allocation is performed. This saves some memory
614  * with the trade-off that the source will not be reported to the
615  * router if joined in the window between the query response and
616  * the group actually being joined on the local host.
617  *
618  * VIMAGE: XXX: Currently the igmp_sgalloc feature has been removed.
619  * This turns off the allocation of a recorded source entry if
620  * the group has not been joined.
621  *
622  * Return 0 if the source didn't exist or was already marked as recorded.
623  * Return 1 if the source was marked as recorded by this function.
624  * Return <0 if any error occured (negated errno code).
625  */
626 int
627 inm_record_source(struct in_multi *inm, const in_addr_t naddr)
628 {
629 	struct ip_msource	 find;
630 	struct ip_msource	*ims, *nims;
631 
632 	IN_MULTI_LOCK_ASSERT();
633 
634 	find.ims_haddr = ntohl(naddr);
635 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
636 	if (ims && ims->ims_stp)
637 		return (0);
638 	if (ims == NULL) {
639 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
640 			return (-ENOSPC);
641 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
642 		    M_NOWAIT | M_ZERO);
643 		if (nims == NULL)
644 			return (-ENOMEM);
645 		nims->ims_haddr = find.ims_haddr;
646 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
647 		++inm->inm_nsrc;
648 		ims = nims;
649 	}
650 
651 	/*
652 	 * Mark the source as recorded and update the recorded
653 	 * source count.
654 	 */
655 	++ims->ims_stp;
656 	++inm->inm_st[1].iss_rec;
657 
658 	return (1);
659 }
660 
661 /*
662  * Return a pointer to an in_msource owned by an in_mfilter,
663  * given its source address.
664  * Lazy-allocate if needed. If this is a new entry its filter state is
665  * undefined at t0.
666  *
667  * imf is the filter set being modified.
668  * haddr is the source address in *host* byte-order.
669  *
670  * SMPng: May be called with locks held; malloc must not block.
671  */
672 static int
673 imf_get_source(struct in_mfilter *imf, const struct sockaddr_in *psin,
674     struct in_msource **plims)
675 {
676 	struct ip_msource	 find;
677 	struct ip_msource	*ims, *nims;
678 	struct in_msource	*lims;
679 	int			 error;
680 
681 	error = 0;
682 	ims = NULL;
683 	lims = NULL;
684 
685 	/* key is host byte order */
686 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
687 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
688 	lims = (struct in_msource *)ims;
689 	if (lims == NULL) {
690 		if (imf->imf_nsrc == in_mcast_maxsocksrc)
691 			return (ENOSPC);
692 		nims = malloc(sizeof(struct in_msource), M_INMFILTER,
693 		    M_NOWAIT | M_ZERO);
694 		if (nims == NULL)
695 			return (ENOMEM);
696 		lims = (struct in_msource *)nims;
697 		lims->ims_haddr = find.ims_haddr;
698 		lims->imsl_st[0] = MCAST_UNDEFINED;
699 		RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
700 		++imf->imf_nsrc;
701 	}
702 
703 	*plims = lims;
704 
705 	return (error);
706 }
707 
708 /*
709  * Graft a source entry into an existing socket-layer filter set,
710  * maintaining any required invariants and checking allocations.
711  *
712  * The source is marked as being in the new filter mode at t1.
713  *
714  * Return the pointer to the new node, otherwise return NULL.
715  */
716 static struct in_msource *
717 imf_graft(struct in_mfilter *imf, const uint8_t st1,
718     const struct sockaddr_in *psin)
719 {
720 	struct ip_msource	*nims;
721 	struct in_msource	*lims;
722 
723 	nims = malloc(sizeof(struct in_msource), M_INMFILTER,
724 	    M_NOWAIT | M_ZERO);
725 	if (nims == NULL)
726 		return (NULL);
727 	lims = (struct in_msource *)nims;
728 	lims->ims_haddr = ntohl(psin->sin_addr.s_addr);
729 	lims->imsl_st[0] = MCAST_UNDEFINED;
730 	lims->imsl_st[1] = st1;
731 	RB_INSERT(ip_msource_tree, &imf->imf_sources, nims);
732 	++imf->imf_nsrc;
733 
734 	return (lims);
735 }
736 
737 /*
738  * Prune a source entry from an existing socket-layer filter set,
739  * maintaining any required invariants and checking allocations.
740  *
741  * The source is marked as being left at t1, it is not freed.
742  *
743  * Return 0 if no error occurred, otherwise return an errno value.
744  */
745 static int
746 imf_prune(struct in_mfilter *imf, const struct sockaddr_in *psin)
747 {
748 	struct ip_msource	 find;
749 	struct ip_msource	*ims;
750 	struct in_msource	*lims;
751 
752 	/* key is host byte order */
753 	find.ims_haddr = ntohl(psin->sin_addr.s_addr);
754 	ims = RB_FIND(ip_msource_tree, &imf->imf_sources, &find);
755 	if (ims == NULL)
756 		return (ENOENT);
757 	lims = (struct in_msource *)ims;
758 	lims->imsl_st[1] = MCAST_UNDEFINED;
759 	return (0);
760 }
761 
762 /*
763  * Revert socket-layer filter set deltas at t1 to t0 state.
764  */
765 static void
766 imf_rollback(struct in_mfilter *imf)
767 {
768 	struct ip_msource	*ims, *tims;
769 	struct in_msource	*lims;
770 
771 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
772 		lims = (struct in_msource *)ims;
773 		if (lims->imsl_st[0] == lims->imsl_st[1]) {
774 			/* no change at t1 */
775 			continue;
776 		} else if (lims->imsl_st[0] != MCAST_UNDEFINED) {
777 			/* revert change to existing source at t1 */
778 			lims->imsl_st[1] = lims->imsl_st[0];
779 		} else {
780 			/* revert source added t1 */
781 			CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
782 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
783 			free(ims, M_INMFILTER);
784 			imf->imf_nsrc--;
785 		}
786 	}
787 	imf->imf_st[1] = imf->imf_st[0];
788 }
789 
790 /*
791  * Mark socket-layer filter set as INCLUDE {} at t1.
792  */
793 static void
794 imf_leave(struct in_mfilter *imf)
795 {
796 	struct ip_msource	*ims;
797 	struct in_msource	*lims;
798 
799 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
800 		lims = (struct in_msource *)ims;
801 		lims->imsl_st[1] = MCAST_UNDEFINED;
802 	}
803 	imf->imf_st[1] = MCAST_INCLUDE;
804 }
805 
806 /*
807  * Mark socket-layer filter set deltas as committed.
808  */
809 static void
810 imf_commit(struct in_mfilter *imf)
811 {
812 	struct ip_msource	*ims;
813 	struct in_msource	*lims;
814 
815 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
816 		lims = (struct in_msource *)ims;
817 		lims->imsl_st[0] = lims->imsl_st[1];
818 	}
819 	imf->imf_st[0] = imf->imf_st[1];
820 }
821 
822 /*
823  * Reap unreferenced sources from socket-layer filter set.
824  */
825 static void
826 imf_reap(struct in_mfilter *imf)
827 {
828 	struct ip_msource	*ims, *tims;
829 	struct in_msource	*lims;
830 
831 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
832 		lims = (struct in_msource *)ims;
833 		if ((lims->imsl_st[0] == MCAST_UNDEFINED) &&
834 		    (lims->imsl_st[1] == MCAST_UNDEFINED)) {
835 			CTR2(KTR_IGMPV3, "%s: free lims %p", __func__, ims);
836 			RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
837 			free(ims, M_INMFILTER);
838 			imf->imf_nsrc--;
839 		}
840 	}
841 }
842 
843 /*
844  * Purge socket-layer filter set.
845  */
846 static void
847 imf_purge(struct in_mfilter *imf)
848 {
849 	struct ip_msource	*ims, *tims;
850 
851 	RB_FOREACH_SAFE(ims, ip_msource_tree, &imf->imf_sources, tims) {
852 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
853 		RB_REMOVE(ip_msource_tree, &imf->imf_sources, ims);
854 		free(ims, M_INMFILTER);
855 		imf->imf_nsrc--;
856 	}
857 	imf->imf_st[0] = imf->imf_st[1] = MCAST_UNDEFINED;
858 	KASSERT(RB_EMPTY(&imf->imf_sources),
859 	    ("%s: imf_sources not empty", __func__));
860 }
861 
862 /*
863  * Look up a source filter entry for a multicast group.
864  *
865  * inm is the group descriptor to work with.
866  * haddr is the host-byte-order IPv4 address to look up.
867  * noalloc may be non-zero to suppress allocation of sources.
868  * *pims will be set to the address of the retrieved or allocated source.
869  *
870  * SMPng: NOTE: may be called with locks held.
871  * Return 0 if successful, otherwise return a non-zero error code.
872  */
873 static int
874 inm_get_source(struct in_multi *inm, const in_addr_t haddr,
875     const int noalloc, struct ip_msource **pims)
876 {
877 	struct ip_msource	 find;
878 	struct ip_msource	*ims, *nims;
879 #ifdef KTR
880 	struct in_addr ia;
881 #endif
882 
883 	find.ims_haddr = haddr;
884 	ims = RB_FIND(ip_msource_tree, &inm->inm_srcs, &find);
885 	if (ims == NULL && !noalloc) {
886 		if (inm->inm_nsrc == in_mcast_maxgrpsrc)
887 			return (ENOSPC);
888 		nims = malloc(sizeof(struct ip_msource), M_IPMSOURCE,
889 		    M_NOWAIT | M_ZERO);
890 		if (nims == NULL)
891 			return (ENOMEM);
892 		nims->ims_haddr = haddr;
893 		RB_INSERT(ip_msource_tree, &inm->inm_srcs, nims);
894 		++inm->inm_nsrc;
895 		ims = nims;
896 #ifdef KTR
897 		ia.s_addr = htonl(haddr);
898 		CTR3(KTR_IGMPV3, "%s: allocated %s as %p", __func__,
899 		    inet_ntoa(ia), ims);
900 #endif
901 	}
902 
903 	*pims = ims;
904 	return (0);
905 }
906 
907 /*
908  * Merge socket-layer source into IGMP-layer source.
909  * If rollback is non-zero, perform the inverse of the merge.
910  */
911 static void
912 ims_merge(struct ip_msource *ims, const struct in_msource *lims,
913     const int rollback)
914 {
915 	int n = rollback ? -1 : 1;
916 #ifdef KTR
917 	struct in_addr ia;
918 
919 	ia.s_addr = htonl(ims->ims_haddr);
920 #endif
921 
922 	if (lims->imsl_st[0] == MCAST_EXCLUDE) {
923 		CTR3(KTR_IGMPV3, "%s: t1 ex -= %d on %s",
924 		    __func__, n, inet_ntoa(ia));
925 		ims->ims_st[1].ex -= n;
926 	} else if (lims->imsl_st[0] == MCAST_INCLUDE) {
927 		CTR3(KTR_IGMPV3, "%s: t1 in -= %d on %s",
928 		    __func__, n, inet_ntoa(ia));
929 		ims->ims_st[1].in -= n;
930 	}
931 
932 	if (lims->imsl_st[1] == MCAST_EXCLUDE) {
933 		CTR3(KTR_IGMPV3, "%s: t1 ex += %d on %s",
934 		    __func__, n, inet_ntoa(ia));
935 		ims->ims_st[1].ex += n;
936 	} else if (lims->imsl_st[1] == MCAST_INCLUDE) {
937 		CTR3(KTR_IGMPV3, "%s: t1 in += %d on %s",
938 		    __func__, n, inet_ntoa(ia));
939 		ims->ims_st[1].in += n;
940 	}
941 }
942 
943 /*
944  * Atomically update the global in_multi state, when a membership's
945  * filter list is being updated in any way.
946  *
947  * imf is the per-inpcb-membership group filter pointer.
948  * A fake imf may be passed for in-kernel consumers.
949  *
950  * XXX This is a candidate for a set-symmetric-difference style loop
951  * which would eliminate the repeated lookup from root of ims nodes,
952  * as they share the same key space.
953  *
954  * If any error occurred this function will back out of refcounts
955  * and return a non-zero value.
956  */
957 static int
958 inm_merge(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
959 {
960 	struct ip_msource	*ims, *nims;
961 	struct in_msource	*lims;
962 	int			 schanged, error;
963 	int			 nsrc0, nsrc1;
964 
965 	schanged = 0;
966 	error = 0;
967 	nsrc1 = nsrc0 = 0;
968 
969 	/*
970 	 * Update the source filters first, as this may fail.
971 	 * Maintain count of in-mode filters at t0, t1. These are
972 	 * used to work out if we transition into ASM mode or not.
973 	 * Maintain a count of source filters whose state was
974 	 * actually modified by this operation.
975 	 */
976 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
977 		lims = (struct in_msource *)ims;
978 		if (lims->imsl_st[0] == imf->imf_st[0]) nsrc0++;
979 		if (lims->imsl_st[1] == imf->imf_st[1]) nsrc1++;
980 		if (lims->imsl_st[0] == lims->imsl_st[1]) continue;
981 		error = inm_get_source(inm, lims->ims_haddr, 0, &nims);
982 		++schanged;
983 		if (error)
984 			break;
985 		ims_merge(nims, lims, 0);
986 	}
987 	if (error) {
988 		struct ip_msource *bims;
989 
990 		RB_FOREACH_REVERSE_FROM(ims, ip_msource_tree, nims) {
991 			lims = (struct in_msource *)ims;
992 			if (lims->imsl_st[0] == lims->imsl_st[1])
993 				continue;
994 			(void)inm_get_source(inm, lims->ims_haddr, 1, &bims);
995 			if (bims == NULL)
996 				continue;
997 			ims_merge(bims, lims, 1);
998 		}
999 		goto out_reap;
1000 	}
1001 
1002 	CTR3(KTR_IGMPV3, "%s: imf filters in-mode: %d at t0, %d at t1",
1003 	    __func__, nsrc0, nsrc1);
1004 
1005 	/* Handle transition between INCLUDE {n} and INCLUDE {} on socket. */
1006 	if (imf->imf_st[0] == imf->imf_st[1] &&
1007 	    imf->imf_st[1] == MCAST_INCLUDE) {
1008 		if (nsrc1 == 0) {
1009 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
1010 			--inm->inm_st[1].iss_in;
1011 		}
1012 	}
1013 
1014 	/* Handle filter mode transition on socket. */
1015 	if (imf->imf_st[0] != imf->imf_st[1]) {
1016 		CTR3(KTR_IGMPV3, "%s: imf transition %d to %d",
1017 		    __func__, imf->imf_st[0], imf->imf_st[1]);
1018 
1019 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
1020 			CTR1(KTR_IGMPV3, "%s: --ex on inm at t1", __func__);
1021 			--inm->inm_st[1].iss_ex;
1022 		} else if (imf->imf_st[0] == MCAST_INCLUDE) {
1023 			CTR1(KTR_IGMPV3, "%s: --in on inm at t1", __func__);
1024 			--inm->inm_st[1].iss_in;
1025 		}
1026 
1027 		if (imf->imf_st[1] == MCAST_EXCLUDE) {
1028 			CTR1(KTR_IGMPV3, "%s: ex++ on inm at t1", __func__);
1029 			inm->inm_st[1].iss_ex++;
1030 		} else if (imf->imf_st[1] == MCAST_INCLUDE && nsrc1 > 0) {
1031 			CTR1(KTR_IGMPV3, "%s: in++ on inm at t1", __func__);
1032 			inm->inm_st[1].iss_in++;
1033 		}
1034 	}
1035 
1036 	/*
1037 	 * Track inm filter state in terms of listener counts.
1038 	 * If there are any exclusive listeners, stack-wide
1039 	 * membership is exclusive.
1040 	 * Otherwise, if only inclusive listeners, stack-wide is inclusive.
1041 	 * If no listeners remain, state is undefined at t1,
1042 	 * and the IGMP lifecycle for this group should finish.
1043 	 */
1044 	if (inm->inm_st[1].iss_ex > 0) {
1045 		CTR1(KTR_IGMPV3, "%s: transition to EX", __func__);
1046 		inm->inm_st[1].iss_fmode = MCAST_EXCLUDE;
1047 	} else if (inm->inm_st[1].iss_in > 0) {
1048 		CTR1(KTR_IGMPV3, "%s: transition to IN", __func__);
1049 		inm->inm_st[1].iss_fmode = MCAST_INCLUDE;
1050 	} else {
1051 		CTR1(KTR_IGMPV3, "%s: transition to UNDEF", __func__);
1052 		inm->inm_st[1].iss_fmode = MCAST_UNDEFINED;
1053 	}
1054 
1055 	/* Decrement ASM listener count on transition out of ASM mode. */
1056 	if (imf->imf_st[0] == MCAST_EXCLUDE && nsrc0 == 0) {
1057 		if ((imf->imf_st[1] != MCAST_EXCLUDE) ||
1058 		    (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 > 0))
1059 			CTR1(KTR_IGMPV3, "%s: --asm on inm at t1", __func__);
1060 			--inm->inm_st[1].iss_asm;
1061 	}
1062 
1063 	/* Increment ASM listener count on transition to ASM mode. */
1064 	if (imf->imf_st[1] == MCAST_EXCLUDE && nsrc1 == 0) {
1065 		CTR1(KTR_IGMPV3, "%s: asm++ on inm at t1", __func__);
1066 		inm->inm_st[1].iss_asm++;
1067 	}
1068 
1069 	CTR3(KTR_IGMPV3, "%s: merged imf %p to inm %p", __func__, imf, inm);
1070 	inm_print(inm);
1071 
1072 out_reap:
1073 	if (schanged > 0) {
1074 		CTR1(KTR_IGMPV3, "%s: sources changed; reaping", __func__);
1075 		inm_reap(inm);
1076 	}
1077 	return (error);
1078 }
1079 
1080 /*
1081  * Mark an in_multi's filter set deltas as committed.
1082  * Called by IGMP after a state change has been enqueued.
1083  */
1084 void
1085 inm_commit(struct in_multi *inm)
1086 {
1087 	struct ip_msource	*ims;
1088 
1089 	CTR2(KTR_IGMPV3, "%s: commit inm %p", __func__, inm);
1090 	CTR1(KTR_IGMPV3, "%s: pre commit:", __func__);
1091 	inm_print(inm);
1092 
1093 	RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
1094 		ims->ims_st[0] = ims->ims_st[1];
1095 	}
1096 	inm->inm_st[0] = inm->inm_st[1];
1097 }
1098 
1099 /*
1100  * Reap unreferenced nodes from an in_multi's filter set.
1101  */
1102 static void
1103 inm_reap(struct in_multi *inm)
1104 {
1105 	struct ip_msource	*ims, *tims;
1106 
1107 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
1108 		if (ims->ims_st[0].ex > 0 || ims->ims_st[0].in > 0 ||
1109 		    ims->ims_st[1].ex > 0 || ims->ims_st[1].in > 0 ||
1110 		    ims->ims_stp != 0)
1111 			continue;
1112 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
1113 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
1114 		free(ims, M_IPMSOURCE);
1115 		inm->inm_nsrc--;
1116 	}
1117 }
1118 
1119 /*
1120  * Purge all source nodes from an in_multi's filter set.
1121  */
1122 static void
1123 inm_purge(struct in_multi *inm)
1124 {
1125 	struct ip_msource	*ims, *tims;
1126 
1127 	RB_FOREACH_SAFE(ims, ip_msource_tree, &inm->inm_srcs, tims) {
1128 		CTR2(KTR_IGMPV3, "%s: free ims %p", __func__, ims);
1129 		RB_REMOVE(ip_msource_tree, &inm->inm_srcs, ims);
1130 		free(ims, M_IPMSOURCE);
1131 		inm->inm_nsrc--;
1132 	}
1133 }
1134 
/*
 * Join a multicast group; unlocked entry point.
 *
 * Takes the IN_MULTI lock around in_joingroup_locked() and passes its
 * return value straight through.
 *
 * SMPng: XXX: in_joingroup() is called from in_control() when Giant
 * is not held. Fortunately, ifp is unlikely to have been detached
 * at this point, so we assume it's OK to recurse.
 */
int
in_joingroup(struct ifnet *ifp, const struct in_addr *gina,
    /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
{
	int rv;

	IN_MULTI_LOCK();
	rv = in_joingroup_locked(ifp, gina, imf, pinm);
	IN_MULTI_UNLOCK();
	return (rv);
}
1154 
1155 /*
1156  * Join a multicast group; real entry point.
1157  *
1158  * Only preserves atomicity at inm level.
1159  * NOTE: imf argument cannot be const due to sys/tree.h limitations.
1160  *
1161  * If the IGMP downcall fails, the group is not joined, and an error
1162  * code is returned.
1163  */
1164 int
1165 in_joingroup_locked(struct ifnet *ifp, const struct in_addr *gina,
1166     /*const*/ struct in_mfilter *imf, struct in_multi **pinm)
1167 {
1168 	struct in_mfilter	 timf;
1169 	struct in_multi		*inm;
1170 	int			 error;
1171 
1172 	IN_MULTI_LOCK_ASSERT();
1173 
1174 	CTR4(KTR_IGMPV3, "%s: join %s on %p(%s))", __func__,
1175 	    inet_ntoa(*gina), ifp, ifp->if_xname);
1176 
1177 	error = 0;
1178 	inm = NULL;
1179 
1180 	/*
1181 	 * If no imf was specified (i.e. kernel consumer),
1182 	 * fake one up and assume it is an ASM join.
1183 	 */
1184 	if (imf == NULL) {
1185 		imf_init(&timf, MCAST_UNDEFINED, MCAST_EXCLUDE);
1186 		imf = &timf;
1187 	}
1188 
1189 	error = in_getmulti(ifp, gina, &inm);
1190 	if (error) {
1191 		CTR1(KTR_IGMPV3, "%s: in_getmulti() failure", __func__);
1192 		return (error);
1193 	}
1194 
1195 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
1196 	error = inm_merge(inm, imf);
1197 	if (error) {
1198 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
1199 		goto out_inm_release;
1200 	}
1201 
1202 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
1203 	error = igmp_change_state(inm);
1204 	if (error) {
1205 		CTR1(KTR_IGMPV3, "%s: failed to update source", __func__);
1206 		goto out_inm_release;
1207 	}
1208 
1209 out_inm_release:
1210 	if (error) {
1211 		CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
1212 		inm_release_locked(inm);
1213 	} else {
1214 		*pinm = inm;
1215 	}
1216 
1217 	return (error);
1218 }
1219 
/*
 * Leave a multicast group; unlocked entry point.
 *
 * Takes the IN_MULTI lock around in_leavegroup_locked() and passes its
 * return value straight through.
 */
int
in_leavegroup(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
{
	int rv;

	IN_MULTI_LOCK();
	rv = in_leavegroup_locked(inm, imf);
	IN_MULTI_UNLOCK();
	return (rv);
}
1234 
1235 /*
1236  * Leave a multicast group; real entry point.
1237  * All source filters will be expunged.
1238  *
1239  * Only preserves atomicity at inm level.
1240  *
1241  * Holding the write lock for the INP which contains imf
1242  * is highly advisable. We can't assert for it as imf does not
1243  * contain a back-pointer to the owning inp.
1244  *
1245  * Note: This is not the same as inm_release(*) as this function also
1246  * makes a state change downcall into IGMP.
1247  */
1248 int
1249 in_leavegroup_locked(struct in_multi *inm, /*const*/ struct in_mfilter *imf)
1250 {
1251 	struct in_mfilter	 timf;
1252 	int			 error;
1253 
1254 	error = 0;
1255 
1256 	IN_MULTI_LOCK_ASSERT();
1257 
1258 	CTR5(KTR_IGMPV3, "%s: leave inm %p, %s/%s, imf %p", __func__,
1259 	    inm, inet_ntoa(inm->inm_addr),
1260 	    (inm_is_ifp_detached(inm) ? "null" : inm->inm_ifp->if_xname),
1261 	    imf);
1262 
1263 	/*
1264 	 * If no imf was specified (i.e. kernel consumer),
1265 	 * fake one up and assume it is an ASM join.
1266 	 */
1267 	if (imf == NULL) {
1268 		imf_init(&timf, MCAST_EXCLUDE, MCAST_UNDEFINED);
1269 		imf = &timf;
1270 	}
1271 
1272 	/*
1273 	 * Begin state merge transaction at IGMP layer.
1274 	 *
1275 	 * As this particular invocation should not cause any memory
1276 	 * to be allocated, and there is no opportunity to roll back
1277 	 * the transaction, it MUST NOT fail.
1278 	 */
1279 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
1280 	error = inm_merge(inm, imf);
1281 	KASSERT(error == 0, ("%s: failed to merge inm state", __func__));
1282 
1283 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
1284 	CURVNET_SET(inm->inm_ifp->if_vnet);
1285 	error = igmp_change_state(inm);
1286 	CURVNET_RESTORE();
1287 	if (error)
1288 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
1289 
1290 	CTR2(KTR_IGMPV3, "%s: dropping ref on %p", __func__, inm);
1291 	inm_release_locked(inm);
1292 
1293 	return (error);
1294 }
1295 
1296 /*#ifndef BURN_BRIDGES*/
1297 /*
1298  * Join an IPv4 multicast group in (*,G) exclusive mode.
1299  * The group must be a 224.0.0.0/24 link-scope group.
1300  * This KPI is for legacy kernel consumers only.
1301  */
1302 struct in_multi *
1303 in_addmulti(struct in_addr *ap, struct ifnet *ifp)
1304 {
1305 	struct in_multi *pinm;
1306 	int error;
1307 
1308 	KASSERT(IN_LOCAL_GROUP(ntohl(ap->s_addr)),
1309 	    ("%s: %s not in 224.0.0.0/24", __func__, inet_ntoa(*ap)));
1310 
1311 	error = in_joingroup(ifp, ap, NULL, &pinm);
1312 	if (error != 0)
1313 		pinm = NULL;
1314 
1315 	return (pinm);
1316 }
1317 
1318 /*
1319  * Leave an IPv4 multicast group, assumed to be in exclusive (*,G) mode.
1320  * This KPI is for legacy kernel consumers only.
1321  */
1322 void
1323 in_delmulti(struct in_multi *inm)
1324 {
1325 
1326 	(void)in_leavegroup(inm, NULL);
1327 }
1328 /*#endif*/
1329 
1330 /*
1331  * Block or unblock an ASM multicast source on an inpcb.
1332  * This implements the delta-based API described in RFC 3678.
1333  *
1334  * The delta-based API applies only to exclusive-mode memberships.
1335  * An IGMP downcall will be performed.
1336  *
1337  * SMPng: NOTE: Must take Giant as a join may create a new ifma.
1338  *
1339  * Return 0 if successful, otherwise return an appropriate error code.
1340  */
1341 static int
1342 inp_block_unblock_source(struct inpcb *inp, struct sockopt *sopt)
1343 {
1344 	struct group_source_req		 gsr;
1345 	sockunion_t			*gsa, *ssa;
1346 	struct ifnet			*ifp;
1347 	struct in_mfilter		*imf;
1348 	struct ip_moptions		*imo;
1349 	struct in_msource		*ims;
1350 	struct in_multi			*inm;
1351 	size_t				 idx;
1352 	uint16_t			 fmode;
1353 	int				 error, doblock;
1354 
1355 	ifp = NULL;
1356 	error = 0;
1357 	doblock = 0;
1358 
1359 	memset(&gsr, 0, sizeof(struct group_source_req));
1360 	gsa = (sockunion_t *)&gsr.gsr_group;
1361 	ssa = (sockunion_t *)&gsr.gsr_source;
1362 
1363 	switch (sopt->sopt_name) {
1364 	case IP_BLOCK_SOURCE:
1365 	case IP_UNBLOCK_SOURCE: {
1366 		struct ip_mreq_source	 mreqs;
1367 
1368 		error = sooptcopyin(sopt, &mreqs,
1369 		    sizeof(struct ip_mreq_source),
1370 		    sizeof(struct ip_mreq_source));
1371 		if (error)
1372 			return (error);
1373 
1374 		gsa->sin.sin_family = AF_INET;
1375 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
1376 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
1377 
1378 		ssa->sin.sin_family = AF_INET;
1379 		ssa->sin.sin_len = sizeof(struct sockaddr_in);
1380 		ssa->sin.sin_addr = mreqs.imr_sourceaddr;
1381 
1382 		if (!in_nullhost(mreqs.imr_interface))
1383 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
1384 
1385 		if (sopt->sopt_name == IP_BLOCK_SOURCE)
1386 			doblock = 1;
1387 
1388 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
1389 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
1390 		break;
1391 	    }
1392 
1393 	case MCAST_BLOCK_SOURCE:
1394 	case MCAST_UNBLOCK_SOURCE:
1395 		error = sooptcopyin(sopt, &gsr,
1396 		    sizeof(struct group_source_req),
1397 		    sizeof(struct group_source_req));
1398 		if (error)
1399 			return (error);
1400 
1401 		if (gsa->sin.sin_family != AF_INET ||
1402 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
1403 			return (EINVAL);
1404 
1405 		if (ssa->sin.sin_family != AF_INET ||
1406 		    ssa->sin.sin_len != sizeof(struct sockaddr_in))
1407 			return (EINVAL);
1408 
1409 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
1410 			return (EADDRNOTAVAIL);
1411 
1412 		ifp = ifnet_byindex(gsr.gsr_interface);
1413 
1414 		if (sopt->sopt_name == MCAST_BLOCK_SOURCE)
1415 			doblock = 1;
1416 		break;
1417 
1418 	default:
1419 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
1420 		    __func__, sopt->sopt_name);
1421 		return (EOPNOTSUPP);
1422 		break;
1423 	}
1424 
1425 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
1426 		return (EINVAL);
1427 
1428 	/*
1429 	 * Check if we are actually a member of this group.
1430 	 */
1431 	imo = inp_findmoptions(inp);
1432 	idx = imo_match_group(imo, ifp, &gsa->sa);
1433 	if (idx == -1 || imo->imo_mfilters == NULL) {
1434 		error = EADDRNOTAVAIL;
1435 		goto out_inp_locked;
1436 	}
1437 
1438 	KASSERT(imo->imo_mfilters != NULL,
1439 	    ("%s: imo_mfilters not allocated", __func__));
1440 	imf = &imo->imo_mfilters[idx];
1441 	inm = imo->imo_membership[idx];
1442 
1443 	/*
1444 	 * Attempting to use the delta-based API on an
1445 	 * non exclusive-mode membership is an error.
1446 	 */
1447 	fmode = imf->imf_st[0];
1448 	if (fmode != MCAST_EXCLUDE) {
1449 		error = EINVAL;
1450 		goto out_inp_locked;
1451 	}
1452 
1453 	/*
1454 	 * Deal with error cases up-front:
1455 	 *  Asked to block, but already blocked; or
1456 	 *  Asked to unblock, but nothing to unblock.
1457 	 * If adding a new block entry, allocate it.
1458 	 */
1459 	ims = imo_match_source(imo, idx, &ssa->sa);
1460 	if ((ims != NULL && doblock) || (ims == NULL && !doblock)) {
1461 		CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
1462 		    inet_ntoa(ssa->sin.sin_addr), doblock ? "" : "not ");
1463 		error = EADDRNOTAVAIL;
1464 		goto out_inp_locked;
1465 	}
1466 
1467 	INP_WLOCK_ASSERT(inp);
1468 
1469 	/*
1470 	 * Begin state merge transaction at socket layer.
1471 	 */
1472 	if (doblock) {
1473 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
1474 		ims = imf_graft(imf, fmode, &ssa->sin);
1475 		if (ims == NULL)
1476 			error = ENOMEM;
1477 	} else {
1478 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
1479 		error = imf_prune(imf, &ssa->sin);
1480 	}
1481 
1482 	if (error) {
1483 		CTR1(KTR_IGMPV3, "%s: merge imf state failed", __func__);
1484 		goto out_imf_rollback;
1485 	}
1486 
1487 	/*
1488 	 * Begin state merge transaction at IGMP layer.
1489 	 */
1490 	IN_MULTI_LOCK();
1491 
1492 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
1493 	error = inm_merge(inm, imf);
1494 	if (error) {
1495 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
1496 		goto out_in_multi_locked;
1497 	}
1498 
1499 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
1500 	error = igmp_change_state(inm);
1501 	if (error)
1502 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
1503 
1504 out_in_multi_locked:
1505 
1506 	IN_MULTI_UNLOCK();
1507 
1508 out_imf_rollback:
1509 	if (error)
1510 		imf_rollback(imf);
1511 	else
1512 		imf_commit(imf);
1513 
1514 	imf_reap(imf);
1515 
1516 out_inp_locked:
1517 	INP_WUNLOCK(inp);
1518 	return (error);
1519 }
1520 
1521 /*
1522  * Given an inpcb, return its multicast options structure pointer.  Accepts
1523  * an unlocked inpcb pointer, but will return it locked.  May sleep.
1524  *
1525  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
1526  * SMPng: NOTE: Returns with the INP write lock held.
1527  */
1528 static struct ip_moptions *
1529 inp_findmoptions(struct inpcb *inp)
1530 {
1531 	struct ip_moptions	 *imo;
1532 	struct in_multi		**immp;
1533 	struct in_mfilter	 *imfp;
1534 	size_t			  idx;
1535 
1536 	INP_WLOCK(inp);
1537 	if (inp->inp_moptions != NULL)
1538 		return (inp->inp_moptions);
1539 
1540 	INP_WUNLOCK(inp);
1541 
1542 	imo = malloc(sizeof(*imo), M_IPMOPTS, M_WAITOK);
1543 	immp = malloc(sizeof(*immp) * IP_MIN_MEMBERSHIPS, M_IPMOPTS,
1544 	    M_WAITOK | M_ZERO);
1545 	imfp = malloc(sizeof(struct in_mfilter) * IP_MIN_MEMBERSHIPS,
1546 	    M_INMFILTER, M_WAITOK);
1547 
1548 	imo->imo_multicast_ifp = NULL;
1549 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
1550 	imo->imo_multicast_vif = -1;
1551 	imo->imo_multicast_ttl = IP_DEFAULT_MULTICAST_TTL;
1552 	imo->imo_multicast_loop = in_mcast_loop;
1553 	imo->imo_num_memberships = 0;
1554 	imo->imo_max_memberships = IP_MIN_MEMBERSHIPS;
1555 	imo->imo_membership = immp;
1556 
1557 	/* Initialize per-group source filters. */
1558 	for (idx = 0; idx < IP_MIN_MEMBERSHIPS; idx++)
1559 		imf_init(&imfp[idx], MCAST_UNDEFINED, MCAST_EXCLUDE);
1560 	imo->imo_mfilters = imfp;
1561 
1562 	INP_WLOCK(inp);
1563 	if (inp->inp_moptions != NULL) {
1564 		free(imfp, M_INMFILTER);
1565 		free(immp, M_IPMOPTS);
1566 		free(imo, M_IPMOPTS);
1567 		return (inp->inp_moptions);
1568 	}
1569 	inp->inp_moptions = imo;
1570 	return (imo);
1571 }
1572 
1573 /*
1574  * Discard the IP multicast options (and source filters).  To minimize
1575  * the amount of work done while holding locks such as the INP's
1576  * pcbinfo lock (which is used in the receive path), the free
1577  * operation is performed asynchronously in a separate task.
1578  *
1579  * SMPng: NOTE: assumes INP write lock is held.
1580  */
1581 void
1582 inp_freemoptions(struct ip_moptions *imo)
1583 {
1584 
1585 	KASSERT(imo != NULL, ("%s: ip_moptions is NULL", __func__));
1586 	IN_MULTI_LOCK();
1587 	STAILQ_INSERT_TAIL(&imo_gc_list, imo, imo_link);
1588 	IN_MULTI_UNLOCK();
1589 	taskqueue_enqueue(taskqueue_thread, &imo_gc_task);
1590 }
1591 
1592 static void
1593 inp_freemoptions_internal(struct ip_moptions *imo)
1594 {
1595 	struct in_mfilter	*imf;
1596 	size_t			 idx, nmships;
1597 
1598 	nmships = imo->imo_num_memberships;
1599 	for (idx = 0; idx < nmships; ++idx) {
1600 		imf = imo->imo_mfilters ? &imo->imo_mfilters[idx] : NULL;
1601 		if (imf)
1602 			imf_leave(imf);
1603 		(void)in_leavegroup(imo->imo_membership[idx], imf);
1604 		if (imf)
1605 			imf_purge(imf);
1606 	}
1607 
1608 	if (imo->imo_mfilters)
1609 		free(imo->imo_mfilters, M_INMFILTER);
1610 	free(imo->imo_membership, M_IPMOPTS);
1611 	free(imo, M_IPMOPTS);
1612 }
1613 
1614 static void
1615 inp_gcmoptions(void *context, int pending)
1616 {
1617 	struct ip_moptions *imo;
1618 
1619 	IN_MULTI_LOCK();
1620 	while (!STAILQ_EMPTY(&imo_gc_list)) {
1621 		imo = STAILQ_FIRST(&imo_gc_list);
1622 		STAILQ_REMOVE_HEAD(&imo_gc_list, imo_link);
1623 		IN_MULTI_UNLOCK();
1624 		inp_freemoptions_internal(imo);
1625 		IN_MULTI_LOCK();
1626 	}
1627 	IN_MULTI_UNLOCK();
1628 }
1629 
1630 /*
1631  * Atomically get source filters on a socket for an IPv4 multicast group.
1632  * Called with INP lock held; returns with lock released.
1633  */
1634 static int
1635 inp_get_source_filters(struct inpcb *inp, struct sockopt *sopt)
1636 {
1637 	struct __msfilterreq	 msfr;
1638 	sockunion_t		*gsa;
1639 	struct ifnet		*ifp;
1640 	struct ip_moptions	*imo;
1641 	struct in_mfilter	*imf;
1642 	struct ip_msource	*ims;
1643 	struct in_msource	*lims;
1644 	struct sockaddr_in	*psin;
1645 	struct sockaddr_storage	*ptss;
1646 	struct sockaddr_storage	*tss;
1647 	int			 error;
1648 	size_t			 idx, nsrcs, ncsrcs;
1649 
1650 	INP_WLOCK_ASSERT(inp);
1651 
1652 	imo = inp->inp_moptions;
1653 	KASSERT(imo != NULL, ("%s: null ip_moptions", __func__));
1654 
1655 	INP_WUNLOCK(inp);
1656 
1657 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
1658 	    sizeof(struct __msfilterreq));
1659 	if (error)
1660 		return (error);
1661 
1662 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
1663 		return (EINVAL);
1664 
1665 	ifp = ifnet_byindex(msfr.msfr_ifindex);
1666 	if (ifp == NULL)
1667 		return (EINVAL);
1668 
1669 	INP_WLOCK(inp);
1670 
1671 	/*
1672 	 * Lookup group on the socket.
1673 	 */
1674 	gsa = (sockunion_t *)&msfr.msfr_group;
1675 	idx = imo_match_group(imo, ifp, &gsa->sa);
1676 	if (idx == -1 || imo->imo_mfilters == NULL) {
1677 		INP_WUNLOCK(inp);
1678 		return (EADDRNOTAVAIL);
1679 	}
1680 	imf = &imo->imo_mfilters[idx];
1681 
1682 	/*
1683 	 * Ignore memberships which are in limbo.
1684 	 */
1685 	if (imf->imf_st[1] == MCAST_UNDEFINED) {
1686 		INP_WUNLOCK(inp);
1687 		return (EAGAIN);
1688 	}
1689 	msfr.msfr_fmode = imf->imf_st[1];
1690 
1691 	/*
1692 	 * If the user specified a buffer, copy out the source filter
1693 	 * entries to userland gracefully.
1694 	 * We only copy out the number of entries which userland
1695 	 * has asked for, but we always tell userland how big the
1696 	 * buffer really needs to be.
1697 	 */
1698 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
1699 		msfr.msfr_nsrcs = in_mcast_maxsocksrc;
1700 	tss = NULL;
1701 	if (msfr.msfr_srcs != NULL && msfr.msfr_nsrcs > 0) {
1702 		tss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
1703 		    M_TEMP, M_NOWAIT | M_ZERO);
1704 		if (tss == NULL) {
1705 			INP_WUNLOCK(inp);
1706 			return (ENOBUFS);
1707 		}
1708 	}
1709 
1710 	/*
1711 	 * Count number of sources in-mode at t0.
1712 	 * If buffer space exists and remains, copy out source entries.
1713 	 */
1714 	nsrcs = msfr.msfr_nsrcs;
1715 	ncsrcs = 0;
1716 	ptss = tss;
1717 	RB_FOREACH(ims, ip_msource_tree, &imf->imf_sources) {
1718 		lims = (struct in_msource *)ims;
1719 		if (lims->imsl_st[0] == MCAST_UNDEFINED ||
1720 		    lims->imsl_st[0] != imf->imf_st[0])
1721 			continue;
1722 		++ncsrcs;
1723 		if (tss != NULL && nsrcs > 0) {
1724 			psin = (struct sockaddr_in *)ptss;
1725 			psin->sin_family = AF_INET;
1726 			psin->sin_len = sizeof(struct sockaddr_in);
1727 			psin->sin_addr.s_addr = htonl(lims->ims_haddr);
1728 			psin->sin_port = 0;
1729 			++ptss;
1730 			--nsrcs;
1731 		}
1732 	}
1733 
1734 	INP_WUNLOCK(inp);
1735 
1736 	if (tss != NULL) {
1737 		error = copyout(tss, msfr.msfr_srcs,
1738 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
1739 		free(tss, M_TEMP);
1740 		if (error)
1741 			return (error);
1742 	}
1743 
1744 	msfr.msfr_nsrcs = ncsrcs;
1745 	error = sooptcopyout(sopt, &msfr, sizeof(struct __msfilterreq));
1746 
1747 	return (error);
1748 }
1749 
1750 /*
1751  * Return the IP multicast options in response to user getsockopt().
1752  */
1753 int
1754 inp_getmoptions(struct inpcb *inp, struct sockopt *sopt)
1755 {
1756 	struct ip_mreqn		 mreqn;
1757 	struct ip_moptions	*imo;
1758 	struct ifnet		*ifp;
1759 	struct in_ifaddr	*ia;
1760 	int			 error, optval;
1761 	u_char			 coptval;
1762 
1763 	INP_WLOCK(inp);
1764 	imo = inp->inp_moptions;
1765 	/*
1766 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
1767 	 * or is a divert socket, reject it.
1768 	 */
1769 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
1770 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
1771 	    inp->inp_socket->so_proto->pr_type != SOCK_DGRAM)) {
1772 		INP_WUNLOCK(inp);
1773 		return (EOPNOTSUPP);
1774 	}
1775 
1776 	error = 0;
1777 	switch (sopt->sopt_name) {
1778 	case IP_MULTICAST_VIF:
1779 		if (imo != NULL)
1780 			optval = imo->imo_multicast_vif;
1781 		else
1782 			optval = -1;
1783 		INP_WUNLOCK(inp);
1784 		error = sooptcopyout(sopt, &optval, sizeof(int));
1785 		break;
1786 
1787 	case IP_MULTICAST_IF:
1788 		memset(&mreqn, 0, sizeof(struct ip_mreqn));
1789 		if (imo != NULL) {
1790 			ifp = imo->imo_multicast_ifp;
1791 			if (!in_nullhost(imo->imo_multicast_addr)) {
1792 				mreqn.imr_address = imo->imo_multicast_addr;
1793 			} else if (ifp != NULL) {
1794 				mreqn.imr_ifindex = ifp->if_index;
1795 				IFP_TO_IA(ifp, ia);
1796 				if (ia != NULL) {
1797 					mreqn.imr_address =
1798 					    IA_SIN(ia)->sin_addr;
1799 					ifa_free(&ia->ia_ifa);
1800 				}
1801 			}
1802 		}
1803 		INP_WUNLOCK(inp);
1804 		if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
1805 			error = sooptcopyout(sopt, &mreqn,
1806 			    sizeof(struct ip_mreqn));
1807 		} else {
1808 			error = sooptcopyout(sopt, &mreqn.imr_address,
1809 			    sizeof(struct in_addr));
1810 		}
1811 		break;
1812 
1813 	case IP_MULTICAST_TTL:
1814 		if (imo == 0)
1815 			optval = coptval = IP_DEFAULT_MULTICAST_TTL;
1816 		else
1817 			optval = coptval = imo->imo_multicast_ttl;
1818 		INP_WUNLOCK(inp);
1819 		if (sopt->sopt_valsize == sizeof(u_char))
1820 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
1821 		else
1822 			error = sooptcopyout(sopt, &optval, sizeof(int));
1823 		break;
1824 
1825 	case IP_MULTICAST_LOOP:
1826 		if (imo == 0)
1827 			optval = coptval = IP_DEFAULT_MULTICAST_LOOP;
1828 		else
1829 			optval = coptval = imo->imo_multicast_loop;
1830 		INP_WUNLOCK(inp);
1831 		if (sopt->sopt_valsize == sizeof(u_char))
1832 			error = sooptcopyout(sopt, &coptval, sizeof(u_char));
1833 		else
1834 			error = sooptcopyout(sopt, &optval, sizeof(int));
1835 		break;
1836 
1837 	case IP_MSFILTER:
1838 		if (imo == NULL) {
1839 			error = EADDRNOTAVAIL;
1840 			INP_WUNLOCK(inp);
1841 		} else {
1842 			error = inp_get_source_filters(inp, sopt);
1843 		}
1844 		break;
1845 
1846 	default:
1847 		INP_WUNLOCK(inp);
1848 		error = ENOPROTOOPT;
1849 		break;
1850 	}
1851 
1852 	INP_UNLOCK_ASSERT(inp);
1853 
1854 	return (error);
1855 }
1856 
1857 /*
1858  * Look up the ifnet to use for a multicast group membership,
1859  * given the IPv4 address of an interface, and the IPv4 group address.
1860  *
1861  * This routine exists to support legacy multicast applications
1862  * which do not understand that multicast memberships are scoped to
1863  * specific physical links in the networking stack, or which need
1864  * to join link-scope groups before IPv4 addresses are configured.
1865  *
1866  * If inp is non-NULL, use this socket's current FIB number for any
1867  * required FIB lookup.
1868  * If ina is INADDR_ANY, look up the group address in the unicast FIB,
1869  * and use its ifp; usually, this points to the default next-hop.
1870  *
1871  * If the FIB lookup fails, attempt to use the first non-loopback
1872  * interface with multicast capability in the system as a
1873  * last resort. The legacy IPv4 ASM API requires that we do
1874  * this in order to allow groups to be joined when the routing
1875  * table has not yet been populated during boot.
1876  *
1877  * Returns NULL if no ifp could be found.
1878  *
1879  * SMPng: TODO: Acquire the appropriate locks for INADDR_TO_IFP.
1880  * FUTURE: Implement IPv4 source-address selection.
1881  */
1882 static struct ifnet *
1883 inp_lookup_mcast_ifp(const struct inpcb *inp,
1884     const struct sockaddr_in *gsin, const struct in_addr ina)
1885 {
1886 	struct ifnet *ifp;
1887 
1888 	KASSERT(gsin->sin_family == AF_INET, ("%s: not AF_INET", __func__));
1889 	KASSERT(IN_MULTICAST(ntohl(gsin->sin_addr.s_addr)),
1890 	    ("%s: not multicast", __func__));
1891 
1892 	ifp = NULL;
1893 	if (!in_nullhost(ina)) {
1894 		INADDR_TO_IFP(ina, ifp);
1895 	} else {
1896 		struct route ro;
1897 
1898 		ro.ro_rt = NULL;
1899 		memcpy(&ro.ro_dst, gsin, sizeof(struct sockaddr_in));
1900 		in_rtalloc_ign(&ro, 0, inp ? inp->inp_inc.inc_fibnum : 0);
1901 		if (ro.ro_rt != NULL) {
1902 			ifp = ro.ro_rt->rt_ifp;
1903 			KASSERT(ifp != NULL, ("%s: null ifp", __func__));
1904 			RTFREE(ro.ro_rt);
1905 		} else {
1906 			struct in_ifaddr *ia;
1907 			struct ifnet *mifp;
1908 
1909 			mifp = NULL;
1910 			IN_IFADDR_RLOCK();
1911 			TAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) {
1912 				mifp = ia->ia_ifp;
1913 				if (!(mifp->if_flags & IFF_LOOPBACK) &&
1914 				     (mifp->if_flags & IFF_MULTICAST)) {
1915 					ifp = mifp;
1916 					break;
1917 				}
1918 			}
1919 			IN_IFADDR_RUNLOCK();
1920 		}
1921 	}
1922 
1923 	return (ifp);
1924 }
1925 
1926 /*
1927  * Join an IPv4 multicast group, possibly with a source.
1928  */
1929 static int
1930 inp_join_group(struct inpcb *inp, struct sockopt *sopt)
1931 {
1932 	struct group_source_req		 gsr;
1933 	sockunion_t			*gsa, *ssa;
1934 	struct ifnet			*ifp;
1935 	struct in_mfilter		*imf;
1936 	struct ip_moptions		*imo;
1937 	struct in_multi			*inm;
1938 	struct in_msource		*lims;
1939 	size_t				 idx;
1940 	int				 error, is_new;
1941 
1942 	ifp = NULL;
1943 	imf = NULL;
1944 	lims = NULL;
1945 	error = 0;
1946 	is_new = 0;
1947 
1948 	memset(&gsr, 0, sizeof(struct group_source_req));
1949 	gsa = (sockunion_t *)&gsr.gsr_group;
1950 	gsa->ss.ss_family = AF_UNSPEC;
1951 	ssa = (sockunion_t *)&gsr.gsr_source;
1952 	ssa->ss.ss_family = AF_UNSPEC;
1953 
1954 	switch (sopt->sopt_name) {
1955 	case IP_ADD_MEMBERSHIP:
1956 	case IP_ADD_SOURCE_MEMBERSHIP: {
1957 		struct ip_mreq_source	 mreqs;
1958 
1959 		if (sopt->sopt_name == IP_ADD_MEMBERSHIP) {
1960 			error = sooptcopyin(sopt, &mreqs,
1961 			    sizeof(struct ip_mreq),
1962 			    sizeof(struct ip_mreq));
1963 			/*
1964 			 * Do argument switcharoo from ip_mreq into
1965 			 * ip_mreq_source to avoid using two instances.
1966 			 */
1967 			mreqs.imr_interface = mreqs.imr_sourceaddr;
1968 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
1969 		} else if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
1970 			error = sooptcopyin(sopt, &mreqs,
1971 			    sizeof(struct ip_mreq_source),
1972 			    sizeof(struct ip_mreq_source));
1973 		}
1974 		if (error)
1975 			return (error);
1976 
1977 		gsa->sin.sin_family = AF_INET;
1978 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
1979 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
1980 
1981 		if (sopt->sopt_name == IP_ADD_SOURCE_MEMBERSHIP) {
1982 			ssa->sin.sin_family = AF_INET;
1983 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
1984 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
1985 		}
1986 
1987 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
1988 			return (EINVAL);
1989 
1990 		ifp = inp_lookup_mcast_ifp(inp, &gsa->sin,
1991 		    mreqs.imr_interface);
1992 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
1993 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
1994 		break;
1995 	}
1996 
1997 	case MCAST_JOIN_GROUP:
1998 	case MCAST_JOIN_SOURCE_GROUP:
1999 		if (sopt->sopt_name == MCAST_JOIN_GROUP) {
2000 			error = sooptcopyin(sopt, &gsr,
2001 			    sizeof(struct group_req),
2002 			    sizeof(struct group_req));
2003 		} else if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
2004 			error = sooptcopyin(sopt, &gsr,
2005 			    sizeof(struct group_source_req),
2006 			    sizeof(struct group_source_req));
2007 		}
2008 		if (error)
2009 			return (error);
2010 
2011 		if (gsa->sin.sin_family != AF_INET ||
2012 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
2013 			return (EINVAL);
2014 
2015 		/*
2016 		 * Overwrite the port field if present, as the sockaddr
2017 		 * being copied in may be matched with a binary comparison.
2018 		 */
2019 		gsa->sin.sin_port = 0;
2020 		if (sopt->sopt_name == MCAST_JOIN_SOURCE_GROUP) {
2021 			if (ssa->sin.sin_family != AF_INET ||
2022 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
2023 				return (EINVAL);
2024 			ssa->sin.sin_port = 0;
2025 		}
2026 
2027 		if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
2028 			return (EINVAL);
2029 
2030 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
2031 			return (EADDRNOTAVAIL);
2032 		ifp = ifnet_byindex(gsr.gsr_interface);
2033 		break;
2034 
2035 	default:
2036 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
2037 		    __func__, sopt->sopt_name);
2038 		return (EOPNOTSUPP);
2039 		break;
2040 	}
2041 
2042 	if (ifp == NULL || (ifp->if_flags & IFF_MULTICAST) == 0)
2043 		return (EADDRNOTAVAIL);
2044 
2045 	imo = inp_findmoptions(inp);
2046 	idx = imo_match_group(imo, ifp, &gsa->sa);
2047 	if (idx == -1) {
2048 		is_new = 1;
2049 	} else {
2050 		inm = imo->imo_membership[idx];
2051 		imf = &imo->imo_mfilters[idx];
2052 		if (ssa->ss.ss_family != AF_UNSPEC) {
2053 			/*
2054 			 * MCAST_JOIN_SOURCE_GROUP on an exclusive membership
2055 			 * is an error. On an existing inclusive membership,
2056 			 * it just adds the source to the filter list.
2057 			 */
2058 			if (imf->imf_st[1] != MCAST_INCLUDE) {
2059 				error = EINVAL;
2060 				goto out_inp_locked;
2061 			}
2062 			/*
2063 			 * Throw out duplicates.
2064 			 *
2065 			 * XXX FIXME: This makes a naive assumption that
2066 			 * even if entries exist for *ssa in this imf,
2067 			 * they will be rejected as dupes, even if they
2068 			 * are not valid in the current mode (in-mode).
2069 			 *
2070 			 * in_msource is transactioned just as for anything
2071 			 * else in SSM -- but note naive use of inm_graft()
2072 			 * below for allocating new filter entries.
2073 			 *
2074 			 * This is only an issue if someone mixes the
2075 			 * full-state SSM API with the delta-based API,
2076 			 * which is discouraged in the relevant RFCs.
2077 			 */
2078 			lims = imo_match_source(imo, idx, &ssa->sa);
2079 			if (lims != NULL /*&&
2080 			    lims->imsl_st[1] == MCAST_INCLUDE*/) {
2081 				error = EADDRNOTAVAIL;
2082 				goto out_inp_locked;
2083 			}
2084 		} else {
2085 			/*
2086 			 * MCAST_JOIN_GROUP on an existing exclusive
2087 			 * membership is an error; return EADDRINUSE
2088 			 * to preserve 4.4BSD API idempotence, and
2089 			 * avoid tedious detour to code below.
2090 			 * NOTE: This is bending RFC 3678 a bit.
2091 			 *
2092 			 * On an existing inclusive membership, this is also
2093 			 * an error; if you want to change filter mode,
2094 			 * you must use the userland API setsourcefilter().
2095 			 * XXX We don't reject this for imf in UNDEFINED
2096 			 * state at t1, because allocation of a filter
2097 			 * is atomic with allocation of a membership.
2098 			 */
2099 			error = EINVAL;
2100 			if (imf->imf_st[1] == MCAST_EXCLUDE)
2101 				error = EADDRINUSE;
2102 			goto out_inp_locked;
2103 		}
2104 	}
2105 
2106 	/*
2107 	 * Begin state merge transaction at socket layer.
2108 	 */
2109 	INP_WLOCK_ASSERT(inp);
2110 
2111 	if (is_new) {
2112 		if (imo->imo_num_memberships == imo->imo_max_memberships) {
2113 			error = imo_grow(imo);
2114 			if (error)
2115 				goto out_inp_locked;
2116 		}
2117 		/*
2118 		 * Allocate the new slot upfront so we can deal with
2119 		 * grafting the new source filter in same code path
2120 		 * as for join-source on existing membership.
2121 		 */
2122 		idx = imo->imo_num_memberships;
2123 		imo->imo_membership[idx] = NULL;
2124 		imo->imo_num_memberships++;
2125 		KASSERT(imo->imo_mfilters != NULL,
2126 		    ("%s: imf_mfilters vector was not allocated", __func__));
2127 		imf = &imo->imo_mfilters[idx];
2128 		KASSERT(RB_EMPTY(&imf->imf_sources),
2129 		    ("%s: imf_sources not empty", __func__));
2130 	}
2131 
2132 	/*
2133 	 * Graft new source into filter list for this inpcb's
2134 	 * membership of the group. The in_multi may not have
2135 	 * been allocated yet if this is a new membership, however,
2136 	 * the in_mfilter slot will be allocated and must be initialized.
2137 	 *
2138 	 * Note: Grafting of exclusive mode filters doesn't happen
2139 	 * in this path.
2140 	 * XXX: Should check for non-NULL lims (node exists but may
2141 	 * not be in-mode) for interop with full-state API.
2142 	 */
2143 	if (ssa->ss.ss_family != AF_UNSPEC) {
2144 		/* Membership starts in IN mode */
2145 		if (is_new) {
2146 			CTR1(KTR_IGMPV3, "%s: new join w/source", __func__);
2147 			imf_init(imf, MCAST_UNDEFINED, MCAST_INCLUDE);
2148 		} else {
2149 			CTR2(KTR_IGMPV3, "%s: %s source", __func__, "allow");
2150 		}
2151 		lims = imf_graft(imf, MCAST_INCLUDE, &ssa->sin);
2152 		if (lims == NULL) {
2153 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
2154 			    __func__);
2155 			error = ENOMEM;
2156 			goto out_imo_free;
2157 		}
2158 	} else {
2159 		/* No address specified; Membership starts in EX mode */
2160 		if (is_new) {
2161 			CTR1(KTR_IGMPV3, "%s: new join w/o source", __func__);
2162 			imf_init(imf, MCAST_UNDEFINED, MCAST_EXCLUDE);
2163 		}
2164 	}
2165 
2166 	/*
2167 	 * Begin state merge transaction at IGMP layer.
2168 	 */
2169 	IN_MULTI_LOCK();
2170 
2171 	if (is_new) {
2172 		error = in_joingroup_locked(ifp, &gsa->sin.sin_addr, imf,
2173 		    &inm);
2174 		if (error) {
2175                         CTR1(KTR_IGMPV3, "%s: in_joingroup_locked failed",
2176                             __func__);
2177                         IN_MULTI_UNLOCK();
2178 			goto out_imo_free;
2179                 }
2180 		imo->imo_membership[idx] = inm;
2181 	} else {
2182 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
2183 		error = inm_merge(inm, imf);
2184 		if (error) {
2185 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
2186 			    __func__);
2187 			goto out_in_multi_locked;
2188 		}
2189 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
2190 		error = igmp_change_state(inm);
2191 		if (error) {
2192 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
2193 			    __func__);
2194 			goto out_in_multi_locked;
2195 		}
2196 	}
2197 
2198 out_in_multi_locked:
2199 
2200 	IN_MULTI_UNLOCK();
2201 
2202 	INP_WLOCK_ASSERT(inp);
2203 	if (error) {
2204 		imf_rollback(imf);
2205 		if (is_new)
2206 			imf_purge(imf);
2207 		else
2208 			imf_reap(imf);
2209 	} else {
2210 		imf_commit(imf);
2211 	}
2212 
2213 out_imo_free:
2214 	if (error && is_new) {
2215 		imo->imo_membership[idx] = NULL;
2216 		--imo->imo_num_memberships;
2217 	}
2218 
2219 out_inp_locked:
2220 	INP_WUNLOCK(inp);
2221 	return (error);
2222 }
2223 
2224 /*
2225  * Leave an IPv4 multicast group on an inpcb, possibly with a source.
2226  */
2227 static int
2228 inp_leave_group(struct inpcb *inp, struct sockopt *sopt)
2229 {
2230 	struct group_source_req		 gsr;
2231 	struct ip_mreq_source		 mreqs;
2232 	sockunion_t			*gsa, *ssa;
2233 	struct ifnet			*ifp;
2234 	struct in_mfilter		*imf;
2235 	struct ip_moptions		*imo;
2236 	struct in_msource		*ims;
2237 	struct in_multi			*inm;
2238 	size_t				 idx;
2239 	int				 error, is_final;
2240 
2241 	ifp = NULL;
2242 	error = 0;
2243 	is_final = 1;
2244 
2245 	memset(&gsr, 0, sizeof(struct group_source_req));
2246 	gsa = (sockunion_t *)&gsr.gsr_group;
2247 	gsa->ss.ss_family = AF_UNSPEC;
2248 	ssa = (sockunion_t *)&gsr.gsr_source;
2249 	ssa->ss.ss_family = AF_UNSPEC;
2250 
2251 	switch (sopt->sopt_name) {
2252 	case IP_DROP_MEMBERSHIP:
2253 	case IP_DROP_SOURCE_MEMBERSHIP:
2254 		if (sopt->sopt_name == IP_DROP_MEMBERSHIP) {
2255 			error = sooptcopyin(sopt, &mreqs,
2256 			    sizeof(struct ip_mreq),
2257 			    sizeof(struct ip_mreq));
2258 			/*
2259 			 * Swap interface and sourceaddr arguments,
2260 			 * as ip_mreq and ip_mreq_source are laid
2261 			 * out differently.
2262 			 */
2263 			mreqs.imr_interface = mreqs.imr_sourceaddr;
2264 			mreqs.imr_sourceaddr.s_addr = INADDR_ANY;
2265 		} else if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
2266 			error = sooptcopyin(sopt, &mreqs,
2267 			    sizeof(struct ip_mreq_source),
2268 			    sizeof(struct ip_mreq_source));
2269 		}
2270 		if (error)
2271 			return (error);
2272 
2273 		gsa->sin.sin_family = AF_INET;
2274 		gsa->sin.sin_len = sizeof(struct sockaddr_in);
2275 		gsa->sin.sin_addr = mreqs.imr_multiaddr;
2276 
2277 		if (sopt->sopt_name == IP_DROP_SOURCE_MEMBERSHIP) {
2278 			ssa->sin.sin_family = AF_INET;
2279 			ssa->sin.sin_len = sizeof(struct sockaddr_in);
2280 			ssa->sin.sin_addr = mreqs.imr_sourceaddr;
2281 		}
2282 
2283 		/*
2284 		 * Attempt to look up hinted ifp from interface address.
2285 		 * Fallthrough with null ifp iff lookup fails, to
2286 		 * preserve 4.4BSD mcast API idempotence.
2287 		 * XXX NOTE WELL: The RFC 3678 API is preferred because
2288 		 * using an IPv4 address as a key is racy.
2289 		 */
2290 		if (!in_nullhost(mreqs.imr_interface))
2291 			INADDR_TO_IFP(mreqs.imr_interface, ifp);
2292 
2293 		CTR3(KTR_IGMPV3, "%s: imr_interface = %s, ifp = %p",
2294 		    __func__, inet_ntoa(mreqs.imr_interface), ifp);
2295 
2296 		break;
2297 
2298 	case MCAST_LEAVE_GROUP:
2299 	case MCAST_LEAVE_SOURCE_GROUP:
2300 		if (sopt->sopt_name == MCAST_LEAVE_GROUP) {
2301 			error = sooptcopyin(sopt, &gsr,
2302 			    sizeof(struct group_req),
2303 			    sizeof(struct group_req));
2304 		} else if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
2305 			error = sooptcopyin(sopt, &gsr,
2306 			    sizeof(struct group_source_req),
2307 			    sizeof(struct group_source_req));
2308 		}
2309 		if (error)
2310 			return (error);
2311 
2312 		if (gsa->sin.sin_family != AF_INET ||
2313 		    gsa->sin.sin_len != sizeof(struct sockaddr_in))
2314 			return (EINVAL);
2315 
2316 		if (sopt->sopt_name == MCAST_LEAVE_SOURCE_GROUP) {
2317 			if (ssa->sin.sin_family != AF_INET ||
2318 			    ssa->sin.sin_len != sizeof(struct sockaddr_in))
2319 				return (EINVAL);
2320 		}
2321 
2322 		if (gsr.gsr_interface == 0 || V_if_index < gsr.gsr_interface)
2323 			return (EADDRNOTAVAIL);
2324 
2325 		ifp = ifnet_byindex(gsr.gsr_interface);
2326 
2327 		if (ifp == NULL)
2328 			return (EADDRNOTAVAIL);
2329 		break;
2330 
2331 	default:
2332 		CTR2(KTR_IGMPV3, "%s: unknown sopt_name %d",
2333 		    __func__, sopt->sopt_name);
2334 		return (EOPNOTSUPP);
2335 		break;
2336 	}
2337 
2338 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
2339 		return (EINVAL);
2340 
2341 	/*
2342 	 * Find the membership in the membership array.
2343 	 */
2344 	imo = inp_findmoptions(inp);
2345 	idx = imo_match_group(imo, ifp, &gsa->sa);
2346 	if (idx == -1) {
2347 		error = EADDRNOTAVAIL;
2348 		goto out_inp_locked;
2349 	}
2350 	inm = imo->imo_membership[idx];
2351 	imf = &imo->imo_mfilters[idx];
2352 
2353 	if (ssa->ss.ss_family != AF_UNSPEC)
2354 		is_final = 0;
2355 
2356 	/*
2357 	 * Begin state merge transaction at socket layer.
2358 	 */
2359 	INP_WLOCK_ASSERT(inp);
2360 
2361 	/*
2362 	 * If we were instructed only to leave a given source, do so.
2363 	 * MCAST_LEAVE_SOURCE_GROUP is only valid for inclusive memberships.
2364 	 */
2365 	if (is_final) {
2366 		imf_leave(imf);
2367 	} else {
2368 		if (imf->imf_st[0] == MCAST_EXCLUDE) {
2369 			error = EADDRNOTAVAIL;
2370 			goto out_inp_locked;
2371 		}
2372 		ims = imo_match_source(imo, idx, &ssa->sa);
2373 		if (ims == NULL) {
2374 			CTR3(KTR_IGMPV3, "%s: source %s %spresent", __func__,
2375 			    inet_ntoa(ssa->sin.sin_addr), "not ");
2376 			error = EADDRNOTAVAIL;
2377 			goto out_inp_locked;
2378 		}
2379 		CTR2(KTR_IGMPV3, "%s: %s source", __func__, "block");
2380 		error = imf_prune(imf, &ssa->sin);
2381 		if (error) {
2382 			CTR1(KTR_IGMPV3, "%s: merge imf state failed",
2383 			    __func__);
2384 			goto out_inp_locked;
2385 		}
2386 	}
2387 
2388 	/*
2389 	 * Begin state merge transaction at IGMP layer.
2390 	 */
2391 	IN_MULTI_LOCK();
2392 
2393 	if (is_final) {
2394 		/*
2395 		 * Give up the multicast address record to which
2396 		 * the membership points.
2397 		 */
2398 		(void)in_leavegroup_locked(inm, imf);
2399 	} else {
2400 		CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
2401 		error = inm_merge(inm, imf);
2402 		if (error) {
2403 			CTR1(KTR_IGMPV3, "%s: failed to merge inm state",
2404 			    __func__);
2405 			goto out_in_multi_locked;
2406 		}
2407 
2408 		CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
2409 		error = igmp_change_state(inm);
2410 		if (error) {
2411 			CTR1(KTR_IGMPV3, "%s: failed igmp downcall",
2412 			    __func__);
2413 		}
2414 	}
2415 
2416 out_in_multi_locked:
2417 
2418 	IN_MULTI_UNLOCK();
2419 
2420 	if (error)
2421 		imf_rollback(imf);
2422 	else
2423 		imf_commit(imf);
2424 
2425 	imf_reap(imf);
2426 
2427 	if (is_final) {
2428 		/* Remove the gap in the membership and filter array. */
2429 		for (++idx; idx < imo->imo_num_memberships; ++idx) {
2430 			imo->imo_membership[idx-1] = imo->imo_membership[idx];
2431 			imo->imo_mfilters[idx-1] = imo->imo_mfilters[idx];
2432 		}
2433 		imo->imo_num_memberships--;
2434 	}
2435 
2436 out_inp_locked:
2437 	INP_WUNLOCK(inp);
2438 	return (error);
2439 }
2440 
2441 /*
2442  * Select the interface for transmitting IPv4 multicast datagrams.
2443  *
2444  * Either an instance of struct in_addr or an instance of struct ip_mreqn
2445  * may be passed to this socket option. An address of INADDR_ANY or an
2446  * interface index of 0 is used to remove a previous selection.
2447  * When no interface is selected, one is chosen for every send.
2448  */
2449 static int
2450 inp_set_multicast_if(struct inpcb *inp, struct sockopt *sopt)
2451 {
2452 	struct in_addr		 addr;
2453 	struct ip_mreqn		 mreqn;
2454 	struct ifnet		*ifp;
2455 	struct ip_moptions	*imo;
2456 	int			 error;
2457 
2458 	if (sopt->sopt_valsize == sizeof(struct ip_mreqn)) {
2459 		/*
2460 		 * An interface index was specified using the
2461 		 * Linux-derived ip_mreqn structure.
2462 		 */
2463 		error = sooptcopyin(sopt, &mreqn, sizeof(struct ip_mreqn),
2464 		    sizeof(struct ip_mreqn));
2465 		if (error)
2466 			return (error);
2467 
2468 		if (mreqn.imr_ifindex < 0 || V_if_index < mreqn.imr_ifindex)
2469 			return (EINVAL);
2470 
2471 		if (mreqn.imr_ifindex == 0) {
2472 			ifp = NULL;
2473 		} else {
2474 			ifp = ifnet_byindex(mreqn.imr_ifindex);
2475 			if (ifp == NULL)
2476 				return (EADDRNOTAVAIL);
2477 		}
2478 	} else {
2479 		/*
2480 		 * An interface was specified by IPv4 address.
2481 		 * This is the traditional BSD usage.
2482 		 */
2483 		error = sooptcopyin(sopt, &addr, sizeof(struct in_addr),
2484 		    sizeof(struct in_addr));
2485 		if (error)
2486 			return (error);
2487 		if (in_nullhost(addr)) {
2488 			ifp = NULL;
2489 		} else {
2490 			INADDR_TO_IFP(addr, ifp);
2491 			if (ifp == NULL)
2492 				return (EADDRNOTAVAIL);
2493 		}
2494 		CTR3(KTR_IGMPV3, "%s: ifp = %p, addr = %s", __func__, ifp,
2495 		    inet_ntoa(addr));
2496 	}
2497 
2498 	/* Reject interfaces which do not support multicast. */
2499 	if (ifp != NULL && (ifp->if_flags & IFF_MULTICAST) == 0)
2500 		return (EOPNOTSUPP);
2501 
2502 	imo = inp_findmoptions(inp);
2503 	imo->imo_multicast_ifp = ifp;
2504 	imo->imo_multicast_addr.s_addr = INADDR_ANY;
2505 	INP_WUNLOCK(inp);
2506 
2507 	return (0);
2508 }
2509 
2510 /*
2511  * Atomically set source filters on a socket for an IPv4 multicast group.
2512  *
2513  * SMPng: NOTE: Potentially calls malloc(M_WAITOK) with Giant held.
2514  */
2515 static int
2516 inp_set_source_filters(struct inpcb *inp, struct sockopt *sopt)
2517 {
2518 	struct __msfilterreq	 msfr;
2519 	sockunion_t		*gsa;
2520 	struct ifnet		*ifp;
2521 	struct in_mfilter	*imf;
2522 	struct ip_moptions	*imo;
2523 	struct in_multi		*inm;
2524 	size_t			 idx;
2525 	int			 error;
2526 
2527 	error = sooptcopyin(sopt, &msfr, sizeof(struct __msfilterreq),
2528 	    sizeof(struct __msfilterreq));
2529 	if (error)
2530 		return (error);
2531 
2532 	if (msfr.msfr_nsrcs > in_mcast_maxsocksrc)
2533 		return (ENOBUFS);
2534 
2535 	if ((msfr.msfr_fmode != MCAST_EXCLUDE &&
2536 	     msfr.msfr_fmode != MCAST_INCLUDE))
2537 		return (EINVAL);
2538 
2539 	if (msfr.msfr_group.ss_family != AF_INET ||
2540 	    msfr.msfr_group.ss_len != sizeof(struct sockaddr_in))
2541 		return (EINVAL);
2542 
2543 	gsa = (sockunion_t *)&msfr.msfr_group;
2544 	if (!IN_MULTICAST(ntohl(gsa->sin.sin_addr.s_addr)))
2545 		return (EINVAL);
2546 
2547 	gsa->sin.sin_port = 0;	/* ignore port */
2548 
2549 	if (msfr.msfr_ifindex == 0 || V_if_index < msfr.msfr_ifindex)
2550 		return (EADDRNOTAVAIL);
2551 
2552 	ifp = ifnet_byindex(msfr.msfr_ifindex);
2553 	if (ifp == NULL)
2554 		return (EADDRNOTAVAIL);
2555 
2556 	/*
2557 	 * Take the INP write lock.
2558 	 * Check if this socket is a member of this group.
2559 	 */
2560 	imo = inp_findmoptions(inp);
2561 	idx = imo_match_group(imo, ifp, &gsa->sa);
2562 	if (idx == -1 || imo->imo_mfilters == NULL) {
2563 		error = EADDRNOTAVAIL;
2564 		goto out_inp_locked;
2565 	}
2566 	inm = imo->imo_membership[idx];
2567 	imf = &imo->imo_mfilters[idx];
2568 
2569 	/*
2570 	 * Begin state merge transaction at socket layer.
2571 	 */
2572 	INP_WLOCK_ASSERT(inp);
2573 
2574 	imf->imf_st[1] = msfr.msfr_fmode;
2575 
2576 	/*
2577 	 * Apply any new source filters, if present.
2578 	 * Make a copy of the user-space source vector so
2579 	 * that we may copy them with a single copyin. This
2580 	 * allows us to deal with page faults up-front.
2581 	 */
2582 	if (msfr.msfr_nsrcs > 0) {
2583 		struct in_msource	*lims;
2584 		struct sockaddr_in	*psin;
2585 		struct sockaddr_storage	*kss, *pkss;
2586 		int			 i;
2587 
2588 		INP_WUNLOCK(inp);
2589 
2590 		CTR2(KTR_IGMPV3, "%s: loading %lu source list entries",
2591 		    __func__, (unsigned long)msfr.msfr_nsrcs);
2592 		kss = malloc(sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs,
2593 		    M_TEMP, M_WAITOK);
2594 		error = copyin(msfr.msfr_srcs, kss,
2595 		    sizeof(struct sockaddr_storage) * msfr.msfr_nsrcs);
2596 		if (error) {
2597 			free(kss, M_TEMP);
2598 			return (error);
2599 		}
2600 
2601 		INP_WLOCK(inp);
2602 
2603 		/*
2604 		 * Mark all source filters as UNDEFINED at t1.
2605 		 * Restore new group filter mode, as imf_leave()
2606 		 * will set it to INCLUDE.
2607 		 */
2608 		imf_leave(imf);
2609 		imf->imf_st[1] = msfr.msfr_fmode;
2610 
2611 		/*
2612 		 * Update socket layer filters at t1, lazy-allocating
2613 		 * new entries. This saves a bunch of memory at the
2614 		 * cost of one RB_FIND() per source entry; duplicate
2615 		 * entries in the msfr_nsrcs vector are ignored.
2616 		 * If we encounter an error, rollback transaction.
2617 		 *
2618 		 * XXX This too could be replaced with a set-symmetric
2619 		 * difference like loop to avoid walking from root
2620 		 * every time, as the key space is common.
2621 		 */
2622 		for (i = 0, pkss = kss; i < msfr.msfr_nsrcs; i++, pkss++) {
2623 			psin = (struct sockaddr_in *)pkss;
2624 			if (psin->sin_family != AF_INET) {
2625 				error = EAFNOSUPPORT;
2626 				break;
2627 			}
2628 			if (psin->sin_len != sizeof(struct sockaddr_in)) {
2629 				error = EINVAL;
2630 				break;
2631 			}
2632 			error = imf_get_source(imf, psin, &lims);
2633 			if (error)
2634 				break;
2635 			lims->imsl_st[1] = imf->imf_st[1];
2636 		}
2637 		free(kss, M_TEMP);
2638 	}
2639 
2640 	if (error)
2641 		goto out_imf_rollback;
2642 
2643 	INP_WLOCK_ASSERT(inp);
2644 	IN_MULTI_LOCK();
2645 
2646 	/*
2647 	 * Begin state merge transaction at IGMP layer.
2648 	 */
2649 	CTR1(KTR_IGMPV3, "%s: merge inm state", __func__);
2650 	error = inm_merge(inm, imf);
2651 	if (error) {
2652 		CTR1(KTR_IGMPV3, "%s: failed to merge inm state", __func__);
2653 		goto out_in_multi_locked;
2654 	}
2655 
2656 	CTR1(KTR_IGMPV3, "%s: doing igmp downcall", __func__);
2657 	error = igmp_change_state(inm);
2658 	if (error)
2659 		CTR1(KTR_IGMPV3, "%s: failed igmp downcall", __func__);
2660 
2661 out_in_multi_locked:
2662 
2663 	IN_MULTI_UNLOCK();
2664 
2665 out_imf_rollback:
2666 	if (error)
2667 		imf_rollback(imf);
2668 	else
2669 		imf_commit(imf);
2670 
2671 	imf_reap(imf);
2672 
2673 out_inp_locked:
2674 	INP_WUNLOCK(inp);
2675 	return (error);
2676 }
2677 
2678 /*
2679  * Set the IP multicast options in response to user setsockopt().
2680  *
2681  * Many of the socket options handled in this function duplicate the
2682  * functionality of socket options in the regular unicast API. However,
2683  * it is not possible to merge the duplicate code, because the idempotence
2684  * of the IPv4 multicast part of the BSD Sockets API must be preserved;
2685  * the effects of these options must be treated as separate and distinct.
2686  *
2687  * SMPng: XXX: Unlocked read of inp_socket believed OK.
2688  * FUTURE: The IP_MULTICAST_VIF option may be eliminated if MROUTING
2689  * is refactored to no longer use vifs.
2690  */
2691 int
2692 inp_setmoptions(struct inpcb *inp, struct sockopt *sopt)
2693 {
2694 	struct ip_moptions	*imo;
2695 	int			 error;
2696 
2697 	error = 0;
2698 
2699 	/*
2700 	 * If socket is neither of type SOCK_RAW or SOCK_DGRAM,
2701 	 * or is a divert socket, reject it.
2702 	 */
2703 	if (inp->inp_socket->so_proto->pr_protocol == IPPROTO_DIVERT ||
2704 	    (inp->inp_socket->so_proto->pr_type != SOCK_RAW &&
2705 	     inp->inp_socket->so_proto->pr_type != SOCK_DGRAM))
2706 		return (EOPNOTSUPP);
2707 
2708 	switch (sopt->sopt_name) {
2709 	case IP_MULTICAST_VIF: {
2710 		int vifi;
2711 		/*
2712 		 * Select a multicast VIF for transmission.
2713 		 * Only useful if multicast forwarding is active.
2714 		 */
2715 		if (legal_vif_num == NULL) {
2716 			error = EOPNOTSUPP;
2717 			break;
2718 		}
2719 		error = sooptcopyin(sopt, &vifi, sizeof(int), sizeof(int));
2720 		if (error)
2721 			break;
2722 		if (!legal_vif_num(vifi) && (vifi != -1)) {
2723 			error = EINVAL;
2724 			break;
2725 		}
2726 		imo = inp_findmoptions(inp);
2727 		imo->imo_multicast_vif = vifi;
2728 		INP_WUNLOCK(inp);
2729 		break;
2730 	}
2731 
2732 	case IP_MULTICAST_IF:
2733 		error = inp_set_multicast_if(inp, sopt);
2734 		break;
2735 
2736 	case IP_MULTICAST_TTL: {
2737 		u_char ttl;
2738 
2739 		/*
2740 		 * Set the IP time-to-live for outgoing multicast packets.
2741 		 * The original multicast API required a char argument,
2742 		 * which is inconsistent with the rest of the socket API.
2743 		 * We allow either a char or an int.
2744 		 */
2745 		if (sopt->sopt_valsize == sizeof(u_char)) {
2746 			error = sooptcopyin(sopt, &ttl, sizeof(u_char),
2747 			    sizeof(u_char));
2748 			if (error)
2749 				break;
2750 		} else {
2751 			u_int ittl;
2752 
2753 			error = sooptcopyin(sopt, &ittl, sizeof(u_int),
2754 			    sizeof(u_int));
2755 			if (error)
2756 				break;
2757 			if (ittl > 255) {
2758 				error = EINVAL;
2759 				break;
2760 			}
2761 			ttl = (u_char)ittl;
2762 		}
2763 		imo = inp_findmoptions(inp);
2764 		imo->imo_multicast_ttl = ttl;
2765 		INP_WUNLOCK(inp);
2766 		break;
2767 	}
2768 
2769 	case IP_MULTICAST_LOOP: {
2770 		u_char loop;
2771 
2772 		/*
2773 		 * Set the loopback flag for outgoing multicast packets.
2774 		 * Must be zero or one.  The original multicast API required a
2775 		 * char argument, which is inconsistent with the rest
2776 		 * of the socket API.  We allow either a char or an int.
2777 		 */
2778 		if (sopt->sopt_valsize == sizeof(u_char)) {
2779 			error = sooptcopyin(sopt, &loop, sizeof(u_char),
2780 			    sizeof(u_char));
2781 			if (error)
2782 				break;
2783 		} else {
2784 			u_int iloop;
2785 
2786 			error = sooptcopyin(sopt, &iloop, sizeof(u_int),
2787 					    sizeof(u_int));
2788 			if (error)
2789 				break;
2790 			loop = (u_char)iloop;
2791 		}
2792 		imo = inp_findmoptions(inp);
2793 		imo->imo_multicast_loop = !!loop;
2794 		INP_WUNLOCK(inp);
2795 		break;
2796 	}
2797 
2798 	case IP_ADD_MEMBERSHIP:
2799 	case IP_ADD_SOURCE_MEMBERSHIP:
2800 	case MCAST_JOIN_GROUP:
2801 	case MCAST_JOIN_SOURCE_GROUP:
2802 		error = inp_join_group(inp, sopt);
2803 		break;
2804 
2805 	case IP_DROP_MEMBERSHIP:
2806 	case IP_DROP_SOURCE_MEMBERSHIP:
2807 	case MCAST_LEAVE_GROUP:
2808 	case MCAST_LEAVE_SOURCE_GROUP:
2809 		error = inp_leave_group(inp, sopt);
2810 		break;
2811 
2812 	case IP_BLOCK_SOURCE:
2813 	case IP_UNBLOCK_SOURCE:
2814 	case MCAST_BLOCK_SOURCE:
2815 	case MCAST_UNBLOCK_SOURCE:
2816 		error = inp_block_unblock_source(inp, sopt);
2817 		break;
2818 
2819 	case IP_MSFILTER:
2820 		error = inp_set_source_filters(inp, sopt);
2821 		break;
2822 
2823 	default:
2824 		error = EOPNOTSUPP;
2825 		break;
2826 	}
2827 
2828 	INP_UNLOCK_ASSERT(inp);
2829 
2830 	return (error);
2831 }
2832 
2833 /*
2834  * Expose IGMP's multicast filter mode and source list(s) to userland,
2835  * keyed by (ifindex, group).
2836  * The filter mode is written out as a uint32_t, followed by
2837  * 0..n of struct in_addr.
2838  * For use by ifmcstat(8).
2839  * SMPng: NOTE: unlocked read of ifindex space.
2840  */
2841 static int
2842 sysctl_ip_mcast_filters(SYSCTL_HANDLER_ARGS)
2843 {
2844 	struct in_addr			 src, group;
2845 	struct ifnet			*ifp;
2846 	struct ifmultiaddr		*ifma;
2847 	struct in_multi			*inm;
2848 	struct ip_msource		*ims;
2849 	int				*name;
2850 	int				 retval;
2851 	u_int				 namelen;
2852 	uint32_t			 fmode, ifindex;
2853 
2854 	name = (int *)arg1;
2855 	namelen = arg2;
2856 
2857 	if (req->newptr != NULL)
2858 		return (EPERM);
2859 
2860 	if (namelen != 2)
2861 		return (EINVAL);
2862 
2863 	ifindex = name[0];
2864 	if (ifindex <= 0 || ifindex > V_if_index) {
2865 		CTR2(KTR_IGMPV3, "%s: ifindex %u out of range",
2866 		    __func__, ifindex);
2867 		return (ENOENT);
2868 	}
2869 
2870 	group.s_addr = name[1];
2871 	if (!IN_MULTICAST(ntohl(group.s_addr))) {
2872 		CTR2(KTR_IGMPV3, "%s: group %s is not multicast",
2873 		    __func__, inet_ntoa(group));
2874 		return (EINVAL);
2875 	}
2876 
2877 	ifp = ifnet_byindex(ifindex);
2878 	if (ifp == NULL) {
2879 		CTR2(KTR_IGMPV3, "%s: no ifp for ifindex %u",
2880 		    __func__, ifindex);
2881 		return (ENOENT);
2882 	}
2883 
2884 	retval = sysctl_wire_old_buffer(req,
2885 	    sizeof(uint32_t) + (in_mcast_maxgrpsrc * sizeof(struct in_addr)));
2886 	if (retval)
2887 		return (retval);
2888 
2889 	IN_MULTI_LOCK();
2890 
2891 	IF_ADDR_RLOCK(ifp);
2892 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2893 		if (ifma->ifma_addr->sa_family != AF_INET ||
2894 		    ifma->ifma_protospec == NULL)
2895 			continue;
2896 		inm = (struct in_multi *)ifma->ifma_protospec;
2897 		if (!in_hosteq(inm->inm_addr, group))
2898 			continue;
2899 		fmode = inm->inm_st[1].iss_fmode;
2900 		retval = SYSCTL_OUT(req, &fmode, sizeof(uint32_t));
2901 		if (retval != 0)
2902 			break;
2903 		RB_FOREACH(ims, ip_msource_tree, &inm->inm_srcs) {
2904 #ifdef KTR
2905 			struct in_addr ina;
2906 			ina.s_addr = htonl(ims->ims_haddr);
2907 			CTR2(KTR_IGMPV3, "%s: visit node %s", __func__,
2908 			    inet_ntoa(ina));
2909 #endif
2910 			/*
2911 			 * Only copy-out sources which are in-mode.
2912 			 */
2913 			if (fmode != ims_get_mode(inm, ims, 1)) {
2914 				CTR1(KTR_IGMPV3, "%s: skip non-in-mode",
2915 				    __func__);
2916 				continue;
2917 			}
2918 			src.s_addr = htonl(ims->ims_haddr);
2919 			retval = SYSCTL_OUT(req, &src, sizeof(struct in_addr));
2920 			if (retval != 0)
2921 				break;
2922 		}
2923 	}
2924 	IF_ADDR_RUNLOCK(ifp);
2925 
2926 	IN_MULTI_UNLOCK();
2927 
2928 	return (retval);
2929 }
2930 
2931 #ifdef KTR
2932 
2933 static const char *inm_modestrs[] = { "un", "in", "ex" };
2934 
2935 static const char *
2936 inm_mode_str(const int mode)
2937 {
2938 
2939 	if (mode >= MCAST_UNDEFINED && mode <= MCAST_EXCLUDE)
2940 		return (inm_modestrs[mode]);
2941 	return ("??");
2942 }
2943 
2944 static const char *inm_statestrs[] = {
2945 	"not-member",
2946 	"silent",
2947 	"idle",
2948 	"lazy",
2949 	"sleeping",
2950 	"awakening",
2951 	"query-pending",
2952 	"sg-query-pending",
2953 	"leaving"
2954 };
2955 
2956 static const char *
2957 inm_state_str(const int state)
2958 {
2959 
2960 	if (state >= IGMP_NOT_MEMBER && state <= IGMP_LEAVING_MEMBER)
2961 		return (inm_statestrs[state]);
2962 	return ("??");
2963 }
2964 
2965 /*
2966  * Dump an in_multi structure to the console.
2967  */
2968 void
2969 inm_print(const struct in_multi *inm)
2970 {
2971 	int t;
2972 
2973 	if ((ktr_mask & KTR_IGMPV3) == 0)
2974 		return;
2975 
2976 	printf("%s: --- begin inm %p ---\n", __func__, inm);
2977 	printf("addr %s ifp %p(%s) ifma %p\n",
2978 	    inet_ntoa(inm->inm_addr),
2979 	    inm->inm_ifp,
2980 	    inm->inm_ifp->if_xname,
2981 	    inm->inm_ifma);
2982 	printf("timer %u state %s refcount %u scq.len %u\n",
2983 	    inm->inm_timer,
2984 	    inm_state_str(inm->inm_state),
2985 	    inm->inm_refcount,
2986 	    inm->inm_scq.ifq_len);
2987 	printf("igi %p nsrc %lu sctimer %u scrv %u\n",
2988 	    inm->inm_igi,
2989 	    inm->inm_nsrc,
2990 	    inm->inm_sctimer,
2991 	    inm->inm_scrv);
2992 	for (t = 0; t < 2; t++) {
2993 		printf("t%d: fmode %s asm %u ex %u in %u rec %u\n", t,
2994 		    inm_mode_str(inm->inm_st[t].iss_fmode),
2995 		    inm->inm_st[t].iss_asm,
2996 		    inm->inm_st[t].iss_ex,
2997 		    inm->inm_st[t].iss_in,
2998 		    inm->inm_st[t].iss_rec);
2999 	}
3000 	printf("%s: --- end inm %p ---\n", __func__, inm);
3001 }
3002 
3003 #else /* !KTR */
3004 
/*
 * Stand-in for inm_print() in kernels built without KTR: does nothing.
 */
void
inm_print(const struct in_multi *inm)
{

	(void)inm;
}
3010 
3011 #endif /* KTR */
3012 
/*
 * Instantiate the red-black tree code for ip_msource_tree: nodes are
 * struct ip_msource, linked through their ims_link field and ordered by
 * ip_msource_cmp().  The RB_FOREACH() walk over inm_srcs in this file
 * uses this tree type.
 */
3013 RB_GENERATE(ip_msource_tree, ip_msource, ims_link, ip_msource_cmp);
3014