xref: /titanic_44/usr/src/uts/common/inet/ip/ipmp.c (revision 379c004d1f26b343f034bba8a350290691d00d38)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #include <inet/arp.h>
26 #include <inet/ip.h>
27 #include <inet/ip6.h>
28 #include <inet/ip_if.h>
29 #include <inet/ip_ire.h>
30 #include <inet/ip_multi.h>
31 #include <inet/ip_rts.h>
32 #include <inet/mi.h>
33 #include <net/if_types.h>
34 #include <sys/dlpi.h>
35 #include <sys/kmem.h>
36 #include <sys/modhash.h>
37 #include <sys/sdt.h>
38 #include <sys/strsun.h>
39 #include <sys/sunddi.h>
40 #include <sys/types.h>
41 
/*
 * Shorthand for reaching the ip_stack_t behind an ipmp_grp_t (through its
 * cached phyint) or behind an ipmp_illgrp_t (through its IPMP
 * meta-interface ill).
 */
#define	IPMP_GRP_TO_IPST(ipmpgrp)	PHYINT_TO_IPST((ipmpgrp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(ipmpillg)	((ipmpillg)->ig_ipmp_ill->ill_ipst)

/*
 * Fixed constants; none of these matter enough to be made tunable.
 * IPMP_GRP_HASH_SIZE is the size handed to mod_hash_create_extended() when
 * the per-stack group hash is created.
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
54 
/*
 * Templates for IPMP ARP messages.  Both templates fill in the first three
 * members of an arie_t positionally: the message identifier
 * (AR_IPMP_ACTIVATE or AR_IPMP_DEACTIVATE), the offset of the interface
 * name (immediately after the fixed-size arie_t), and a placeholder for
 * the name length.
 */
static const arie_t ipmp_aract_template = {
	AR_IPMP_ACTIVATE,
	sizeof (arie_t),		/* Name offset */
	sizeof (arie_t)			/* Name length (set by ill_arp_alloc) */
};

/* Same layout as above, but requesting deactivation. */
static const arie_t ipmp_ardeact_template = {
	AR_IPMP_DEACTIVATE,
	sizeof (arie_t),		/* Name offset */
	sizeof (arie_t)			/* Name length (set by ill_arp_alloc) */
};
69 
/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 *
 * This table holds IPMP_KSTAT_MAX named-kstat templates (name plus data
 * type).  ipmp_grp_create_kstats() copies it wholesale into each group's
 * kstat data, and ipmp_grp_update_kstats() indexes the copy by position
 * (e.g. IPMP_KSTAT_LINK_UP), so the order of the entries matters.
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};
90 
91 static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
92 static int	ipmp_grp_create_kstats(ipmp_grp_t *);
93 static int	ipmp_grp_update_kstats(kstat_t *, int);
94 static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
95 static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
96 static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
97 static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
98 static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
99 static boolean_t ipmp_ill_activate(ill_t *);
100 static void	ipmp_ill_deactivate(ill_t *);
101 static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
102 static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
103 static void	ipmp_ill_refresh_active_timer_start(ill_t *);
104 static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
105 static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
106 static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
107 static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
108 static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
109 
110 /*
111  * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
112  */
113 void
114 ipmp_init(ip_stack_t *ipst)
115 {
116 	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
117 	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
118 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
119 	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
120 }
121 
122 /*
123  * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
124  */
125 void
126 ipmp_destroy(ip_stack_t *ipst)
127 {
128 	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
129 	rw_destroy(&ipst->ips_ipmp_lock);
130 }
131 
132 /*
133  * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
134  * and add it to the hash.  On success, return a pointer to the created group.
135  * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
136  * meta-interface associated with the group also has the same name (but they
137  * may differ later via ipmp_grp_rename()).
138  */
139 ipmp_grp_t *
140 ipmp_grp_create(const char *grname, phyint_t *phyi)
141 {
142 	ipmp_grp_t *grp;
143 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
144 	mod_hash_hndl_t mh;
145 
146 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
147 
148 	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
149 		return (NULL);
150 
151 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
152 	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
153 
154 	/*
155 	 * Cache the group's phyint.  This is safe since a phyint_t will
156 	 * outlive its ipmp_grp_t.
157 	 */
158 	grp->gr_phyint = phyi;
159 
160 	/*
161 	 * Create IPMP group kstats.
162 	 */
163 	if (ipmp_grp_create_kstats(grp) != 0) {
164 		kmem_free(grp, sizeof (ipmp_grp_t));
165 		return (NULL);
166 	}
167 
168 	/*
169 	 * Insert the group into the hash.
170 	 */
171 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
172 		ipmp_grp_destroy_kstats(grp);
173 		kmem_free(grp, sizeof (ipmp_grp_t));
174 		return (NULL);
175 	}
176 	ipmp_grp_insert(grp, mh);
177 
178 	return (grp);
179 }
180 
181 /*
182  * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
183  */
184 static int
185 ipmp_grp_create_kstats(ipmp_grp_t *grp)
186 {
187 	kstat_t *ksp;
188 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
189 
190 	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
191 	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
192 	if (ksp == NULL)
193 		return (ENOMEM);
194 
195 	ksp->ks_update = ipmp_grp_update_kstats;
196 	ksp->ks_private = grp;
197 	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
198 
199 	kstat_install(ksp);
200 	grp->gr_ksp = ksp;
201 	return (0);
202 }
203 
204 /*
205  * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
206  */
207 static int
208 ipmp_grp_update_kstats(kstat_t *ksp, int rw)
209 {
210 	uint_t		i;
211 	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
212 	ipmp_grp_t	*grp = ksp->ks_private;
213 	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
214 	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
215 	phyint_t	*phyi;
216 	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];
217 
218 	if (rw == KSTAT_WRITE)
219 		return (EACCES);
220 
221 	/*
222 	 * Start with the group's baseline values.
223 	 */
224 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
225 		if (kn[i].data_type == KSTAT_DATA_UINT32) {
226 			kn[i].value.ui32 = grp->gr_kstats0[i];
227 		} else {
228 			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
229 			kn[i].value.ui64 = grp->gr_kstats0[i];
230 		}
231 	}
232 
233 	/*
234 	 * Add in the stats of each phyint currently in the group.  Since we
235 	 * don't directly track the phyints in a group, we cheat by walking
236 	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
237 	 * ill_g_lock is held.)
238 	 */
239 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
240 	ipsq = grp_ipsq->ipsq_next;
241 	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
242 		phyi = ipsq->ipsq_phyint;
243 
244 		/*
245 		 * If a phyint in a group is being unplumbed, it's possible
246 		 * that ill_glist_delete() -> phyint_free() already freed the
247 		 * phyint (and set ipsq_phyint to NULL), but the unplumb
248 		 * operation has yet to complete (and thus ipsq_dq() has yet
249 		 * to remove the phyint's IPSQ from the group IPSQ's phyint
250 		 * list).  We skip those phyints here (note that their kstats
251 		 * have already been added to gr_kstats0[]).
252 		 */
253 		if (phyi == NULL)
254 			continue;
255 
256 		ipmp_phyint_get_kstats(phyi, phyi_kstats);
257 
258 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
259 			phyi_kstats[i] -= phyi->phyint_kstats0[i];
260 			if (kn[i].data_type == KSTAT_DATA_UINT32)
261 				kn[i].value.ui32 += phyi_kstats[i];
262 			else
263 				kn[i].value.ui64 += phyi_kstats[i];
264 		}
265 	}
266 
267 	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
268 	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
269 
270 	rw_exit(&ipst->ips_ill_g_lock);
271 	return (0);
272 }
273 
274 /*
275  * Destroy IPMP kstat structures for `grp'.
276  */
277 static void
278 ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
279 {
280 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
281 
282 	kstat_delete_netstack(grp->gr_ksp, id);
283 	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
284 	grp->gr_ksp = NULL;
285 }
286 
287 /*
288  * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
289  * does not exist.
290  */
291 ipmp_grp_t *
292 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
293 {
294 	ipmp_grp_t *grp;
295 
296 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
297 
298 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
299 	    (mod_hash_val_t *)&grp) == 0)
300 		return (grp);
301 
302 	return (NULL);
303 }
304 
305 /*
306  * Place information about group `grp' into `lifgr'.
307  */
308 void
309 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
310 {
311 	ill_t *ill;
312 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
313 
314 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
315 
316 	lifgr->gi_v4 = (grp->gr_v4 != NULL);
317 	lifgr->gi_v6 = (grp->gr_v6 != NULL);
318 	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
319 	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
320 	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
321 	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
322 	lifgr->gi_m4ifname[0] = '\0';
323 	lifgr->gi_m6ifname[0] = '\0';
324 	lifgr->gi_bcifname[0] = '\0';
325 
326 	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
327 		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
328 		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
329 	}
330 
331 	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
332 		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
333 }
334 
335 /*
336  * Insert `grp' into the hash using the reserved hash entry `mh'.
337  * Caller must ensure `grp' is not yet in the hash.
338  */
339 static void
340 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
341 {
342 	int err;
343 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
344 
345 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
346 
347 	/*
348 	 * Since grp->gr_name will exist at least as long as `grp' is in the
349 	 * hash, we use it directly as the key.
350 	 */
351 	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
352 	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
353 	if (err != 0) {
354 		/*
355 		 * This should never happen since `mh' was preallocated.
356 		 */
357 		panic("cannot insert IPMP group \"%s\" (err %d)",
358 		    grp->gr_name, err);
359 	}
360 }
361 
362 /*
363  * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
364  */
365 static void
366 ipmp_grp_remove(ipmp_grp_t *grp)
367 {
368 	int err;
369 	mod_hash_val_t val;
370 	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
371 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
372 
373 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
374 
375 	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
376 	if (err != 0 || val != grp) {
377 		panic("cannot remove IPMP group \"%s\" (err %d)",
378 		    grp->gr_name, err);
379 	}
380 }
381 
382 /*
383  * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
384  * group name already exists or is invalid, or if there isn't enough memory.
385  */
386 int
387 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
388 {
389 	mod_hash_hndl_t mh;
390 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
391 
392 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
393 
394 	if (grname[0] == '\0')
395 		return (EINVAL);
396 
397 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
398 	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
399 		return (EEXIST);
400 
401 	/*
402 	 * Before we remove the group from the hash, ensure we'll be able to
403 	 * re-insert it by reserving space.
404 	 */
405 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
406 		return (ENOMEM);
407 
408 	ipmp_grp_remove(grp);
409 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
410 	ipmp_grp_insert(grp, mh);
411 
412 	return (0);
413 }
414 
415 /*
416  * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
417  * the hash, and that there are no interfaces on it.
418  */
419 void
420 ipmp_grp_destroy(ipmp_grp_t *grp)
421 {
422 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
423 
424 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
425 
426 	/*
427 	 * If there are still interfaces using this group, panic before things
428 	 * go really off the rails.
429 	 */
430 	if (grp->gr_nif != 0)
431 		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
432 
433 	ipmp_grp_remove(grp);
434 	ipmp_grp_destroy_kstats(grp);
435 
436 	ASSERT(grp->gr_v4 == NULL);
437 	ASSERT(grp->gr_v6 == NULL);
438 	ASSERT(grp->gr_nv4 == 0);
439 	ASSERT(grp->gr_nv6 == 0);
440 	ASSERT(grp->gr_nactif == 0);
441 	ASSERT(grp->gr_linkdownmp == NULL);
442 	grp->gr_phyint = NULL;
443 
444 	kmem_free(grp, sizeof (ipmp_grp_t));
445 }
446 
447 /*
448  * Check whether `ill' is suitable for inclusion into `grp', and return an
449  * errno describing the problem (if any).  NOTE: many of these errno values
450  * are interpreted by ifconfig, which will take corrective action and retry
451  * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
452  */
453 static int
454 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
455 {
456 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
457 
458 	ASSERT(IAM_WRITER_ILL(ill));
459 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
460 
461 	/*
462 	 * To sidestep complicated address migration logic in the kernel and
463 	 * to force the kernel's all-hosts multicast memberships to be blown
464 	 * away, all addresses that had been brought up must be brought back
465 	 * down prior to adding an interface to a group.  (This includes
466 	 * addresses currently down due to DAD.)  Once the interface has been
467 	 * added to the group, its addresses can then be brought back up, at
468 	 * which point they will be moved to the IPMP meta-interface.
469 	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
470 	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
471 	 */
472 	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
473 		return (EADDRINUSE);
474 
475 	/*
476 	 * To avoid confusing applications by changing addresses that are
477 	 * under their control, all such control must be removed prior to
478 	 * adding an interface into a group.
479 	 */
480 	if (ill_appaddr_cnt(ill) != 0)
481 		return (EADDRNOTAVAIL);
482 
483 	/*
484 	 * Since PTP addresses do not share the same broadcast domain, they
485 	 * are not allowed to be in an IPMP group.
486 	 */
487 	if (ill_ptpaddr_cnt(ill) != 0)
488 		return (EINVAL);
489 
490 	/*
491 	 * An ill must support multicast to be allowed into a group.
492 	 */
493 	if (!(ill->ill_flags & ILLF_MULTICAST))
494 		return (ENOTSUP);
495 
496 	/*
497 	 * An ill must strictly be using ARP and/or ND for address
498 	 * resolution for it to be allowed into a group.
499 	 */
500 	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV))
501 		return (ENOTSUP);
502 
503 	/*
504 	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
505 	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
506 	 * all its modifications as writer.)
507 	 */
508 	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
509 		return (ENOTSUP);
510 
511 	/*
512 	 * All ills in a group must be the same mactype.
513 	 */
514 	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
515 		return (EINVAL);
516 
517 	return (0);
518 }
519 
520 /*
521  * Check whether `phyi' is suitable for inclusion into `grp', and return an
522  * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
523  * regarding errno values.
524  */
525 int
526 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
527 {
528 	int err = 0;
529 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
530 
531 	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
532 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
533 
534 	/*
535 	 * An interface cannot have address families plumbed that are not
536 	 * configured in the group.
537 	 */
538 	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
539 	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
540 		return (EAFNOSUPPORT);
541 
542 	if (phyi->phyint_illv4 != NULL)
543 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
544 	if (err == 0 && phyi->phyint_illv6 != NULL)
545 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
546 
547 	return (err);
548 }
549 
550 /*
551  * Create a new illgrp on IPMP meta-interface `ill'.
552  */
553 ipmp_illgrp_t *
554 ipmp_illgrp_create(ill_t *ill)
555 {
556 	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
557 	ipmp_illgrp_t *illg;
558 
559 	ASSERT(IAM_WRITER_ILL(ill));
560 	ASSERT(IS_IPMP(ill));
561 	ASSERT(ill->ill_grp == NULL);
562 
563 	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
564 		return (NULL);
565 
566 	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
567 	list_create(&illg->ig_actif, sizeof (ill_t),
568 	    offsetof(ill_t, ill_actnode));
569 	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
570 	    offsetof(ipmp_arpent_t, ia_node));
571 
572 	illg->ig_ipmp_ill = ill;
573 	ill->ill_grp = illg;
574 	ipmp_illgrp_set_mtu(illg, mtu);
575 
576 	return (illg);
577 }
578 
579 /*
580  * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
581  */
582 void
583 ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
584 {
585 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
586 	ASSERT(IS_IPMP(illg->ig_ipmp_ill));
587 
588 	/*
589 	 * Verify `illg' is empty.
590 	 */
591 	ASSERT(illg->ig_next_ill == NULL);
592 	ASSERT(illg->ig_cast_ill == NULL);
593 	ASSERT(list_is_empty(&illg->ig_arpent));
594 	ASSERT(list_is_empty(&illg->ig_if));
595 	ASSERT(list_is_empty(&illg->ig_actif));
596 	ASSERT(illg->ig_nactif == 0);
597 
598 	/*
599 	 * Destroy `illg'.
600 	 */
601 	illg->ig_ipmp_ill->ill_grp = NULL;
602 	illg->ig_ipmp_ill = NULL;
603 	list_destroy(&illg->ig_if);
604 	list_destroy(&illg->ig_actif);
605 	list_destroy(&illg->ig_arpent);
606 	kmem_free(illg, sizeof (ipmp_illgrp_t));
607 }
608 
609 /*
610  * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
611  * bind it to an underlying ill, while keeping an even address distribution.
612  * If the bind is successful, return a pointer to the bound ill.
613  */
614 ill_t *
615 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
616 {
617 	ill_t *minill;
618 	ipmp_arpent_t *entp;
619 
620 	ASSERT(IAM_WRITER_IPIF(ipif));
621 	ASSERT(ipmp_ipif_is_dataaddr(ipif));
622 
623 	/*
624 	 * IPMP data address mappings are internally managed by IP itself, so
625 	 * delete any existing ARP entries associated with the address.
626 	 */
627 	if (!ipif->ipif_isv6) {
628 		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
629 		if (entp != NULL)
630 			ipmp_illgrp_destroy_arpent(illg, entp);
631 	}
632 
633 	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
634 		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
635 
636 	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
637 }
638 
639 /*
640  * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
641  * bound, unbind it from the underlying ill while keeping an even address
642  * distribution.
643  */
644 void
645 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
646 {
647 	ill_t *maxill, *boundill = ipif->ipif_bound_ill;
648 
649 	ASSERT(IAM_WRITER_IPIF(ipif));
650 
651 	if (boundill != NULL) {
652 		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
653 
654 		maxill = ipmp_illgrp_max_ill(illg);
655 		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
656 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
657 			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
658 		}
659 	}
660 }
661 
662 /*
663  * Return the active ill with the greatest number of data addresses in `illg'.
664  */
665 static ill_t *
666 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
667 {
668 	ill_t *ill, *bestill = NULL;
669 
670 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
671 
672 	ill = list_head(&illg->ig_actif);
673 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
674 		if (bestill == NULL ||
675 		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
676 			bestill = ill;
677 		}
678 	}
679 	return (bestill);
680 }
681 
682 /*
683  * Return the active ill with the fewest number of data addresses in `illg'.
684  */
685 static ill_t *
686 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
687 {
688 	ill_t *ill, *bestill = NULL;
689 
690 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
691 
692 	ill = list_head(&illg->ig_actif);
693 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
694 		if (bestill == NULL ||
695 		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
696 			if (ill->ill_bound_cnt == 0)
697 				return (ill);	 /* can't get better */
698 			bestill = ill;
699 		}
700 	}
701 	return (bestill);
702 }
703 
704 /*
705  * Return a pointer to IPMP meta-interface for `illg' (which must exist).
706  * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
707  */
708 ill_t *
709 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
710 {
711 	return (illg->ig_ipmp_ill);
712 }
713 
714 /*
715  * Return a pointer to the next available underlying ill in `illg', or NULL if
716  * one doesn't exist.  Caller must be inside the IPSQ.
717  */
718 ill_t *
719 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
720 {
721 	ill_t *ill;
722 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
723 
724 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
725 
726 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
727 	if ((ill = illg->ig_next_ill) != NULL) {
728 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
729 		if (illg->ig_next_ill == NULL)
730 			illg->ig_next_ill = list_head(&illg->ig_actif);
731 	}
732 	rw_exit(&ipst->ips_ipmp_lock);
733 
734 	return (ill);
735 }
736 
737 /*
738  * Return a held pointer to the next available underlying ill in `illg', or
739  * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
740  */
741 ill_t *
742 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
743 {
744 	ill_t *ill;
745 	uint_t i;
746 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
747 
748 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
749 	for (i = 0; i < illg->ig_nactif; i++) {
750 		ill = illg->ig_next_ill;
751 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
752 		if (illg->ig_next_ill == NULL)
753 			illg->ig_next_ill = list_head(&illg->ig_actif);
754 
755 		if (ILL_CAN_LOOKUP(ill)) {
756 			ill_refhold(ill);
757 			rw_exit(&ipst->ips_ipmp_lock);
758 			return (ill);
759 		}
760 	}
761 	rw_exit(&ipst->ips_ipmp_lock);
762 
763 	return (NULL);
764 }
765 
766 /*
767  * Return a pointer to the nominated multicast ill in `illg', or NULL if one
768  * doesn't exist.  Caller must be inside the IPSQ.
769  */
770 ill_t *
771 ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg)
772 {
773 	/*
774 	 * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but
775 	 * this function can get called after that point, handle NULL.
776 	 */
777 	if (illg == NULL)
778 		return (NULL);
779 
780 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
781 	return (illg->ig_cast_ill);
782 }
783 
784 /*
785  * Return a held pointer to the nominated multicast ill in `illg', or NULL if
786  * one doesn't exist.  Caller need not be inside the IPSQ.
787  */
788 ill_t *
789 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
790 {
791 	ill_t *castill;
792 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
793 
794 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
795 	castill = illg->ig_cast_ill;
796 	if (castill != NULL && ILL_CAN_LOOKUP(castill)) {
797 		ill_refhold(castill);
798 		rw_exit(&ipst->ips_ipmp_lock);
799 		return (castill);
800 	}
801 	rw_exit(&ipst->ips_ipmp_lock);
802 	return (NULL);
803 }
804 
805 /*
806  * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
807  * any existing nomination is removed.  Caller must be inside the IPSQ.
808  */
809 static void
810 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
811 {
812 	ill_t *ocastill = illg->ig_cast_ill;
813 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
814 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
815 
816 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
817 
818 	/*
819 	 * Disable old nominated ill (if any).
820 	 */
821 	if (ocastill != NULL) {
822 		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
823 		    illg, ill_t *, ocastill);
824 		ASSERT(ocastill->ill_nom_cast);
825 		ocastill->ill_nom_cast = B_FALSE;
826 		/*
827 		 * If the IPMP meta-interface is down, we never did the join,
828 		 * so we must not try to leave.
829 		 */
830 		if (ipmp_ill->ill_dl_up)
831 			ill_leave_multicast(ipmp_ill);
832 	}
833 
834 	/*
835 	 * Set new nomination.
836 	 */
837 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
838 	illg->ig_cast_ill = castill;
839 	rw_exit(&ipst->ips_ipmp_lock);
840 
841 	if (ocastill != NULL) {
842 		/*
843 		 * Delete any IREs tied to the old nomination.  We must do
844 		 * this after the new castill is set and has reached global
845 		 * visibility since the datapath has not been quiesced.
846 		 */
847 		ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
848 		    ill_stq_cache_delete, ocastill, ocastill);
849 	}
850 
851 	/*
852 	 * Enable new nominated ill (if any).
853 	 */
854 	if (castill != NULL) {
855 		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
856 		    illg, ill_t *, castill);
857 		ASSERT(!castill->ill_nom_cast);
858 		castill->ill_nom_cast = B_TRUE;
859 		/*
860 		 * If the IPMP meta-interface is down, the attempt to recover
861 		 * will silently fail but ill_need_recover_multicast will be
862 		 * erroneously cleared -- so check first.
863 		 */
864 		if (ipmp_ill->ill_dl_up)
865 			ill_recover_multicast(ipmp_ill);
866 	}
867 
868 	/*
869 	 * For IPv4, refresh our broadcast IREs.  This needs to be done even
870 	 * if there's no new nomination since ill_refresh_bcast() still must
871 	 * update the IPMP meta-interface's broadcast IREs to point back at
872 	 * the IPMP meta-interface itself.
873 	 */
874 	if (!ipmp_ill->ill_isv6)
875 		ill_refresh_bcast(ipmp_ill);
876 }
877 
878 /*
879  * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
880  * entry for the same IP address already exists, destroy it first.  Return the
881  * created IPMP ARP entry, or NULL on failure.
882  */
883 ipmp_arpent_t *
884 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp)
885 {
886 	uchar_t *addrp;
887 	area_t *area = (area_t *)mp->b_rptr;
888 	ipmp_arpent_t *entp, *oentp;
889 
890 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
891 	ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t));
892 
893 	if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL)
894 		return (NULL);
895 
896 	if ((mp = copyb(mp)) == NULL) {
897 		kmem_free(entp, sizeof (ipmp_arpent_t));
898 		return (NULL);
899 	}
900 
901 	DB_TYPE(mp) = M_PROTO;
902 	entp->ia_area_mp = mp;
903 	entp->ia_proxyarp = proxyarp;
904 	addrp = mi_offset_paramc(mp, area->area_proto_addr_offset,
905 	    sizeof (ipaddr_t));
906 	bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t));
907 
908 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
909 		ipmp_illgrp_destroy_arpent(illg, oentp);
910 
911 	list_insert_head(&illg->ig_arpent, entp);
912 	return (entp);
913 }
914 
915 /*
916  * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
917  */
918 void
919 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
920 {
921 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
922 
923 	list_remove(&illg->ig_arpent, entp);
924 	freeb(entp->ia_area_mp);
925 	kmem_free(entp, sizeof (ipmp_arpent_t));
926 }
927 
/*
 * Record on `entp' that ARP has been notified about its IP address, by
 * setting ia_notified.  `illg' is not used by the code itself; it is taken
 * as a debugging aid for DTrace FBT probes.
 */
/* ARGSUSED */
void
ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
	entp->ia_notified = B_TRUE;
}
938 
939 /*
940  * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
941  * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
942  */
943 ipmp_arpent_t *
944 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
945 {
946 	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
947 
948 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
949 
950 	if (addrp == NULL)
951 		return (entp);
952 
953 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
954 		if (entp->ia_ipaddr == *addrp)
955 			break;
956 	return (entp);
957 }
958 
959 /*
960  * Refresh ARP entries on `illg' to be distributed across its active
961  * interfaces.  Entries that cannot be refreshed (e.g., because there are no
962  * active interfaces) are marked so that subsequent calls can try again.
963  */
964 void
965 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
966 {
967 	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
968 	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
969 	area_t *area;
970 	mblk_t *area_mp;
971 	uchar_t *physaddr;
972 	ipmp_arpent_t *entp;
973 
974 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
975 	ASSERT(!ipmp_ill->ill_isv6);
976 
977 	ill = list_head(&illg->ig_actif);
978 	entp = list_head(&illg->ig_arpent);
979 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
980 		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
981 			entp->ia_notified = B_FALSE;
982 			continue;
983 		}
984 
985 		area = (area_t *)entp->ia_area_mp->b_rptr;
986 		ASSERT(paddrlen == ill->ill_phys_addr_length);
987 		ASSERT(paddrlen == area->area_hw_addr_length);
988 		physaddr = mi_offset_paramc(entp->ia_area_mp,
989 		    area->area_hw_addr_offset, paddrlen);
990 
991 		/*
992 		 * If this is a proxy ARP entry, we can skip notifying ARP if
993 		 * the entry is already up-to-date.  If it has changed, we
994 		 * update the entry's hardware address before notifying ARP.
995 		 */
996 		if (entp->ia_proxyarp) {
997 			if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 &&
998 			    entp->ia_notified)
999 				continue;
1000 			bcopy(ill->ill_phys_addr, physaddr, paddrlen);
1001 		}
1002 
1003 		if ((area_mp = copyb(entp->ia_area_mp)) == NULL) {
1004 			entp->ia_notified = B_FALSE;
1005 			continue;
1006 		}
1007 
1008 		putnext(ipmp_ill->ill_rq, area_mp);
1009 		ipmp_illgrp_mark_arpent(illg, entp);
1010 
1011 		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
1012 			ill = list_head(&illg->ig_actif);
1013 	}
1014 }
1015 
1016 /*
1017  * Return an interface in `illg' with the specified `physaddr', or NULL if one
1018  * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
1019  */
1020 ill_t *
1021 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
1022 {
1023 	ill_t *ill;
1024 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
1025 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1026 
1027 	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
1028 
1029 	ill = list_head(&illg->ig_if);
1030 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1031 		if (ill->ill_phys_addr_length == paddrlen &&
1032 		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
1033 			return (ill);
1034 	}
1035 	return (NULL);
1036 }
1037 
1038 /*
1039  * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
1040  * Caller must be inside the IPSQ unless this is initialization.
1041  */
1042 static void
1043 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
1044 {
1045 	ill_t *ill = illg->ig_ipmp_ill;
1046 	mblk_t *mp;
1047 
1048 	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
1049 
1050 	/*
1051 	 * If allocation fails, we have bigger problems than MTU.
1052 	 */
1053 	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
1054 		illg->ig_mtu = mtu;
1055 		put(ill->ill_rq, mp);
1056 	}
1057 }
1058 
1059 /*
1060  * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
1061  * ill MTU if necessary.
1062  */
1063 void
1064 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
1065 {
1066 	ill_t *ill;
1067 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
1068 	uint_t mtu = 0;
1069 
1070 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
1071 
1072 	/*
1073 	 * Since ill_max_mtu can only change under ill_lock, we hold ill_lock
1074 	 * for each ill as we iterate through the list.  Any changes to the
1075 	 * ill_max_mtu will also trigger an update, so even if we missed it
1076 	 * this time around, the update will catch it.
1077 	 */
1078 	ill = list_head(&illg->ig_if);
1079 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1080 		mutex_enter(&ill->ill_lock);
1081 		if (mtu == 0 || ill->ill_max_mtu < mtu)
1082 			mtu = ill->ill_max_mtu;
1083 		mutex_exit(&ill->ill_lock);
1084 	}
1085 
1086 	/*
1087 	 * MTU must be at least the minimum MTU.
1088 	 */
1089 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1090 
1091 	if (illg->ig_mtu != mtu)
1092 		ipmp_illgrp_set_mtu(illg, mtu);
1093 }
1094 
1095 /*
1096  * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
1097  * allow the same link to be established more than once.
1098  */
1099 void
1100 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1101 {
1102 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1103 
1104 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1105 
1106 	if (illg->ig_ipmp_ill->ill_isv6) {
1107 		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1108 		grp->gr_v6 = illg;
1109 	} else {
1110 		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1111 		grp->gr_v4 = illg;
1112 	}
1113 }
1114 
1115 /*
1116  * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
1117  * cannot be unlinked (e.g., because there are still interfaces using it).
1118  */
1119 int
1120 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1121 {
1122 	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1123 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1124 
1125 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1126 
1127 	if (illg->ig_ipmp_ill->ill_isv6) {
1128 		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1129 			return (EBUSY);
1130 		grp->gr_v6 = NULL;
1131 	} else {
1132 		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1133 			return (EBUSY);
1134 		grp->gr_v4 = NULL;
1135 	}
1136 	return (0);
1137 }
1138 
1139 /*
1140  * Place `ill' into `illg', and rebalance the data addresses on `illg'
1141  * to be spread evenly across the ills now in it.  Also, adjust the IPMP
1142  * ill as necessary to account for `ill' (e.g., MTU).
1143  */
1144 void
1145 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
1146 {
1147 	ill_t *ipmp_ill;
1148 	ipif_t *ipif;
1149 	ip_stack_t *ipst = ill->ill_ipst;
1150 
1151 	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
1152 	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
1153 	ASSERT(IAM_WRITER_ILL(ill));
1154 	ASSERT(ill->ill_grp == NULL);
1155 
1156 	ipmp_ill = illg->ig_ipmp_ill;
1157 
1158 	/*
1159 	 * Account for `ill' joining the illgrp.
1160 	 */
1161 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1162 	if (ill->ill_isv6)
1163 		ill->ill_phyint->phyint_grp->gr_nv6++;
1164 	else
1165 		ill->ill_phyint->phyint_grp->gr_nv4++;
1166 	rw_exit(&ipst->ips_ipmp_lock);
1167 
1168 	/*
1169 	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
1170 	 */
1171 	mutex_enter(&ill->ill_lock);
1172 	if (ipmp_ill->ill_flags & ILLF_ROUTER)
1173 		ill->ill_flags |= ILLF_ROUTER;
1174 	else
1175 		ill->ill_flags &= ~ILLF_ROUTER;
1176 	mutex_exit(&ill->ill_lock);
1177 
1178 	/*
1179 	 * Blow away all multicast memberships that currently exist on `ill'.
1180 	 * This may seem odd, but it's consistent with the application view
1181 	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
1182 	 */
1183 	if (ill->ill_isv6) {
1184 		reset_conn_ill(ill);
1185 		reset_mrt_ill(ill);
1186 	} else {
1187 		ipif = ill->ill_ipif;
1188 		for (; ipif != NULL; ipif = ipif->ipif_next) {
1189 			reset_conn_ipif(ipif);
1190 			reset_mrt_vif_ipif(ipif);
1191 		}
1192 	}
1193 	ip_purge_allmulti(ill);
1194 
1195 	/*
1196 	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
1197 	 * physical address length.  All other ills must have the same value,
1198 	 * since they are required to all be the same mactype.  Also update
1199 	 * the IPMP ill's MTU and CoS marking, if necessary.
1200 	 */
1201 	if (list_is_empty(&illg->ig_if)) {
1202 		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
1203 		/*
1204 		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
1205 		 * doesn't have a physical address.  This means that code must
1206 		 * not assume that ill_phys_addr is non-NULL just because
1207 		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
1208 		 */
1209 		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
1210 		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
1211 		ipmp_ill->ill_type = ill->ill_type;
1212 
1213 		if (ill->ill_flags & ILLF_COS_ENABLED) {
1214 			mutex_enter(&ipmp_ill->ill_lock);
1215 			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1216 			mutex_exit(&ipmp_ill->ill_lock);
1217 		}
1218 		ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
1219 	} else {
1220 		ASSERT(ipmp_ill->ill_phys_addr_length ==
1221 		    ill->ill_phys_addr_length);
1222 		ASSERT(ipmp_ill->ill_type == ill->ill_type);
1223 
1224 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1225 			mutex_enter(&ipmp_ill->ill_lock);
1226 			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1227 			mutex_exit(&ipmp_ill->ill_lock);
1228 		}
1229 		if (illg->ig_mtu > ill->ill_max_mtu)
1230 			ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu);
1231 	}
1232 
1233 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1234 	list_insert_tail(&illg->ig_if, ill);
1235 	ill->ill_grp = illg;
1236 	rw_exit(&ipst->ips_ill_g_lock);
1237 
1238 	/*
1239 	 * Hide the IREs on `ill' so that we don't accidentally find them when
1240 	 * sending data traffic.
1241 	 */
1242 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
1243 
1244 	/*
1245 	 * Merge any broadcast IREs, if need be.
1246 	 */
1247 	if (!ill->ill_isv6)
1248 		ill_refresh_bcast(ill);
1249 
1250 	ipmp_ill_refresh_active(ill);
1251 }
1252 
1253 /*
1254  * Remove `ill' from its illgrp, and rebalance the data addresses in that
1255  * illgrp to be spread evenly across the remaining ills.  Also, adjust the
1256  * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1257  */
1258 void
1259 ipmp_ill_leave_illgrp(ill_t *ill)
1260 {
1261 	ill_t *ipmp_ill;
1262 	ipif_t *ipif;
1263 	ipmp_arpent_t *entp;
1264 	ipmp_illgrp_t *illg = ill->ill_grp;
1265 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1266 
1267 	ASSERT(IS_UNDER_IPMP(ill));
1268 	ASSERT(IAM_WRITER_ILL(ill));
1269 	ASSERT(illg != NULL);
1270 
1271 	ipmp_ill = illg->ig_ipmp_ill;
1272 
1273 	/*
1274 	 * Cancel IPMP-specific ill timeouts.
1275 	 */
1276 	(void) untimeout(ill->ill_refresh_tid);
1277 
1278 	/*
1279 	 * Expose any previously-hidden IREs on `ill'.
1280 	 */
1281 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
1282 
1283 	/*
1284 	 * Ensure the multicast state for each ipif on `ill' is down so that
1285 	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
1286 	 * all eligible groups.
1287 	 */
1288 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1289 		if (ipif->ipif_flags & IPIF_UP)
1290 			ipif_multicast_down(ipif);
1291 
1292 	/*
1293 	 * Account for `ill' leaving the illgrp.
1294 	 */
1295 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1296 	if (ill->ill_isv6)
1297 		ill->ill_phyint->phyint_grp->gr_nv6--;
1298 	else
1299 		ill->ill_phyint->phyint_grp->gr_nv4--;
1300 	rw_exit(&ipst->ips_ipmp_lock);
1301 
1302 	/*
1303 	 * Pull `ill' out of the interface lists.
1304 	 */
1305 	if (list_link_active(&ill->ill_actnode))
1306 		ipmp_ill_deactivate(ill);
1307 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1308 	list_remove(&illg->ig_if, ill);
1309 	ill->ill_grp = NULL;
1310 	rw_exit(&ipst->ips_ill_g_lock);
1311 
1312 	/*
1313 	 * Recreate any broadcast IREs that had been shared, if need be.
1314 	 */
1315 	if (!ill->ill_isv6)
1316 		ill_refresh_bcast(ill);
1317 
1318 	/*
1319 	 * Re-establish multicast memberships that were previously being
1320 	 * handled by the IPMP meta-interface.
1321 	 */
1322 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1323 		if (ipif->ipif_flags & IPIF_UP)
1324 			ipif_multicast_up(ipif);
1325 
1326 	/*
1327 	 * Refresh the group MTU based on the new interface list.
1328 	 */
1329 	ipmp_illgrp_refresh_mtu(illg);
1330 
1331 	if (list_is_empty(&illg->ig_if)) {
1332 		/*
1333 		 * No ills left in the illgrp; we no longer have a physical
1334 		 * address length, nor can we support ARP, CoS, or anything
1335 		 * else that depends on knowing the link layer type.
1336 		 */
1337 		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
1338 			ipmp_illgrp_destroy_arpent(illg, entp);
1339 
1340 		ipmp_ill->ill_phys_addr_length = 0;
1341 		ipmp_ill->ill_nd_lla_len = 0;
1342 		ipmp_ill->ill_type = IFT_OTHER;
1343 		mutex_enter(&ipmp_ill->ill_lock);
1344 		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1345 		mutex_exit(&ipmp_ill->ill_lock);
1346 	} else {
1347 		/*
1348 		 * If `ill' didn't support CoS, see if it can now be enabled.
1349 		 */
1350 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1351 			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
1352 
1353 			ill = list_head(&illg->ig_if);
1354 			do {
1355 				if (!(ill->ill_flags & ILLF_COS_ENABLED))
1356 					break;
1357 			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);
1358 
1359 			if (ill == NULL) {
1360 				mutex_enter(&ipmp_ill->ill_lock);
1361 				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1362 				mutex_exit(&ipmp_ill->ill_lock);
1363 			}
1364 		}
1365 	}
1366 }
1367 
1368 /*
1369  * Check if `ill' should be active, and activate or deactivate if need be.
1370  * Return B_FALSE if a refresh was necessary but could not be performed.
1371  */
1372 static boolean_t
1373 ipmp_ill_try_refresh_active(ill_t *ill)
1374 {
1375 	boolean_t refreshed = B_TRUE;
1376 
1377 	ASSERT(IAM_WRITER_ILL(ill));
1378 	ASSERT(IS_UNDER_IPMP(ill));
1379 
1380 	if (ipmp_ill_is_active(ill)) {
1381 		if (!list_link_active(&ill->ill_actnode))
1382 			refreshed = ipmp_ill_activate(ill);
1383 	} else {
1384 		if (list_link_active(&ill->ill_actnode))
1385 			ipmp_ill_deactivate(ill);
1386 	}
1387 
1388 	return (refreshed);
1389 }
1390 
1391 /*
1392  * Check if `ill' should be active, and activate or deactivate if need be.
1393  * If the refresh fails, schedule a timer to try again later.
1394  */
1395 void
1396 ipmp_ill_refresh_active(ill_t *ill)
1397 {
1398 	if (!ipmp_ill_try_refresh_active(ill))
1399 		ipmp_ill_refresh_active_timer_start(ill);
1400 }
1401 
1402 /*
1403  * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1404  */
1405 static void
1406 ipmp_ill_refresh_active_timer(void *ill_arg)
1407 {
1408 	ill_t *ill = ill_arg;
1409 	boolean_t refreshed = B_FALSE;
1410 
1411 	/*
1412 	 * Clear ill_refresh_tid to indicate that no timeout is pending
1413 	 * (another thread could schedule a new timeout while we're still
1414 	 * running, but that's harmless).  If the ill is going away, bail.
1415 	 */
1416 	mutex_enter(&ill->ill_lock);
1417 	ill->ill_refresh_tid = 0;
1418 	if (ill->ill_state_flags & ILL_CONDEMNED) {
1419 		mutex_exit(&ill->ill_lock);
1420 		return;
1421 	}
1422 	mutex_exit(&ill->ill_lock);
1423 
1424 	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
1425 		refreshed = ipmp_ill_try_refresh_active(ill);
1426 		ipsq_exit(ill->ill_phyint->phyint_ipsq);
1427 	}
1428 
1429 	/*
1430 	 * If the refresh failed, schedule another attempt.
1431 	 */
1432 	if (!refreshed)
1433 		ipmp_ill_refresh_active_timer_start(ill);
1434 }
1435 
1436 /*
1437  * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'.
1438  */
1439 static void
1440 ipmp_ill_refresh_active_timer_start(ill_t *ill)
1441 {
1442 	mutex_enter(&ill->ill_lock);
1443 
1444 	/*
1445 	 * If the ill is going away or a refresh is already scheduled, bail.
1446 	 */
1447 	if (ill->ill_refresh_tid != 0 ||
1448 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
1449 		mutex_exit(&ill->ill_lock);
1450 		return;
1451 	}
1452 
1453 	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
1454 	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
1455 
1456 	mutex_exit(&ill->ill_lock);
1457 }
1458 
1459 /*
1460  * Activate `ill' so it will be used to send and receive data traffic.  Return
1461  * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
1462  * needed to deactivate `ill' here as well so that deactivation cannot fail.
1463  */
1464 static boolean_t
1465 ipmp_ill_activate(ill_t *ill)
1466 {
1467 	ipif_t		*ipif;
1468 	mblk_t		*actmp = NULL, *deactmp = NULL;
1469 	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
1470 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1471 	const char	*grifname = grp->gr_ifname;
1472 	ipmp_illgrp_t	*illg = ill->ill_grp;
1473 	ill_t		*maxill;
1474 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1475 
1476 	ASSERT(IAM_WRITER_ILL(ill));
1477 	ASSERT(IS_UNDER_IPMP(ill));
1478 
1479 	/*
1480 	 * If this will be the first active interface in the group, allocate
1481 	 * the link-up and link-down messages.
1482 	 */
1483 	if (grp->gr_nactif == 0) {
1484 		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
1485 		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
1486 		if (linkupmp == NULL || linkdownmp == NULL)
1487 			goto fail;
1488 	}
1489 
1490 	/*
1491 	 * For IPv4, allocate the activate/deactivate messages, and tell ARP.
1492 	 */
1493 	if (!ill->ill_isv6) {
1494 		actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template);
1495 		deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template);
1496 		if (actmp == NULL || deactmp == NULL)
1497 			goto fail;
1498 
1499 		ASSERT(ill->ill_ardeact_mp == NULL);
1500 		ill->ill_ardeact_mp = deactmp;
1501 		putnext(illg->ig_ipmp_ill->ill_rq, actmp);
1502 	}
1503 
1504 	if (list_is_empty(&illg->ig_actif)) {
1505 		/*
1506 		 * Now that we have an active ill, nominate it for multicast
1507 		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
1508 		 * since that may need to send multicast packets (e.g., IPv6
1509 		 * neighbor discovery probes).
1510 		 */
1511 		ipmp_illgrp_set_cast(illg, ill);
1512 
1513 		/*
1514 		 * This is the first active ill in the illgrp -- add 'em all.
1515 		 * We can access/walk ig_ipmp_ill's ipif list since we're
1516 		 * writer on its IPSQ as well.
1517 		 */
1518 		ipif = illg->ig_ipmp_ill->ill_ipif;
1519 		for (; ipif != NULL; ipif = ipif->ipif_next)
1520 			if (ipmp_ipif_is_up_dataaddr(ipif))
1521 				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
1522 	} else {
1523 		/*
1524 		 * Redistribute the addresses by moving them from the ill with
1525 		 * the most addresses until the ill being activated is at the
1526 		 * same level as the rest of the ills.
1527 		 */
1528 		for (;;) {
1529 			maxill = ipmp_illgrp_max_ill(illg);
1530 			ASSERT(maxill != NULL);
1531 			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
1532 				break;
1533 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
1534 			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
1535 		}
1536 
1537 		/*
1538 		 * TODO: explore whether it's advantageous to flush IRE_CACHE
1539 		 * bindings to force existing connections to be redistributed
1540 		 * to the new ill.
1541 		 */
1542 	}
1543 
1544 	/*
1545 	 * Put the interface in the active list.
1546 	 */
1547 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1548 	list_insert_tail(&illg->ig_actif, ill);
1549 	illg->ig_nactif++;
1550 	illg->ig_next_ill = ill;
1551 	rw_exit(&ipst->ips_ipmp_lock);
1552 
1553 	/*
1554 	 * Refresh ARP entries to use `ill', if need be.
1555 	 */
1556 	if (!ill->ill_isv6)
1557 		ipmp_illgrp_refresh_arpent(illg);
1558 
1559 	/*
1560 	 * Finally, mark the group link up, if necessary.
1561 	 */
1562 	if (grp->gr_nactif++ == 0) {
1563 		ASSERT(grp->gr_linkdownmp == NULL);
1564 		grp->gr_linkdownmp = linkdownmp;
1565 		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
1566 	}
1567 	return (B_TRUE);
1568 fail:
1569 	freemsg(actmp);
1570 	freemsg(deactmp);
1571 	freemsg(linkupmp);
1572 	freemsg(linkdownmp);
1573 	return (B_FALSE);
1574 }
1575 
1576 /*
1577  * Deactivate `ill' so it will not be used to send or receive data traffic.
1578  */
1579 static void
1580 ipmp_ill_deactivate(ill_t *ill)
1581 {
1582 	ill_t		*minill;
1583 	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
1584 	mblk_t		*mp;
1585 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1586 	ipmp_illgrp_t	*illg = ill->ill_grp;
1587 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1588 
1589 	ASSERT(IAM_WRITER_ILL(ill));
1590 	ASSERT(IS_UNDER_IPMP(ill));
1591 
1592 	/*
1593 	 * Delete IRE_CACHE entries tied to this ill before they become stale.
1594 	 */
1595 	ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE,
1596 	    ill_stq_cache_delete, ill, ill);
1597 
1598 	/*
1599 	 * Pull the interface out of the active list.
1600 	 */
1601 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1602 	list_remove(&illg->ig_actif, ill);
1603 	illg->ig_nactif--;
1604 	illg->ig_next_ill = list_head(&illg->ig_actif);
1605 	rw_exit(&ipst->ips_ipmp_lock);
1606 
1607 	/*
1608 	 * If the ill that's being deactivated had been nominated for
1609 	 * multicast/broadcast, nominate a new one.
1610 	 */
1611 	if (ill == illg->ig_cast_ill)
1612 		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
1613 
1614 	/*
1615 	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
1616 	 * we'll rebind them after we tell the resolver the ill is no longer
1617 	 * active.  We must do things in this order or the resolver could
1618 	 * accidentally rebind to the ill we're trying to remove if multiple
1619 	 * ills in the group have the same hardware address (which is
1620 	 * unsupported, but shouldn't lead to a wedged machine).
1621 	 */
1622 	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
1623 		ipif->ipif_bound_next = ubheadipif;
1624 		ubheadipif = ipif;
1625 	}
1626 
1627 	if (!ill->ill_isv6) {
1628 		/*
1629 		 * Tell ARP `ill' is no longer active in the group.
1630 		 */
1631 		mp = ill->ill_ardeact_mp;
1632 		ill->ill_ardeact_mp = NULL;
1633 		ASSERT(mp != NULL);
1634 		putnext(illg->ig_ipmp_ill->ill_rq, mp);
1635 
1636 		/*
1637 		 * Refresh any ARP entries that had been using `ill'.
1638 		 */
1639 		ipmp_illgrp_refresh_arpent(illg);
1640 	}
1641 
1642 	/*
1643 	 * Rebind each ipif from the deactivated ill to the active ill with
1644 	 * the fewest ipifs.  If there are no active ills, the ipifs will
1645 	 * remain unbound.
1646 	 */
1647 	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
1648 		ubnextipif = ipif->ipif_bound_next;
1649 		ipif->ipif_bound_next = NULL;
1650 
1651 		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
1652 			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
1653 	}
1654 
1655 	/*
1656 	 * Finally, mark the group link down, if necessary.
1657 	 */
1658 	if (--grp->gr_nactif == 0) {
1659 		mp = grp->gr_linkdownmp;
1660 		grp->gr_linkdownmp = NULL;
1661 		ASSERT(mp != NULL);
1662 		put(illg->ig_ipmp_ill->ill_rq, mp);
1663 	}
1664 }
1665 
1666 /*
1667  * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1668  * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1669  */
1670 static void
1671 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1672 {
1673 	ipif_t *ipif;
1674 
1675 	ASSERT(IAM_WRITER_ILL(ill));
1676 	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1677 
1678 	/*
1679 	 * If `ill' is truly down, there are no messages to generate since:
1680 	 *
1681 	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1682 	 *    and its addresses by bringing them down.  But that's already
1683 	 *    true, so there's nothing to hide.
1684 	 *
1685 	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1686 	 *    indicating that any previously-hidden up addresses are again
1687 	 *    back up (along with the interface).  But they aren't, so
1688 	 *    there's nothing to expose.
1689 	 */
1690 	if (ill->ill_ipif_up_count == 0)
1691 		return;
1692 
1693 	if (cmd == RTM_ADD)
1694 		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1695 
1696 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1697 		if (ipif->ipif_flags & IPIF_UP)
1698 			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1699 
1700 	if (cmd == RTM_DELETE)
1701 		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1702 }
1703 
1704 /*
1705  * Bind the address named by `ipif' to the underlying ill named by `ill'.
1706  * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
1707  * will indicate to the resolver whether this is an initial bringup of
1708  * `ipif', or just a rebind to another ill.
1709  */
1710 static void
1711 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
1712 {
1713 	int err = 0;
1714 	ip_stack_t *ipst = ill->ill_ipst;
1715 
1716 	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
1717 	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
1718 	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
1719 	ASSERT(ipif->ipif_bound_ill == NULL);
1720 	ASSERT(ipif->ipif_bound_next == NULL);
1721 
1722 	ipif->ipif_bound_next = ill->ill_bound_ipif;
1723 	ill->ill_bound_ipif = ipif;
1724 	ill->ill_bound_cnt++;
1725 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1726 	ipif->ipif_bound_ill = ill;
1727 	rw_exit(&ipst->ips_ipmp_lock);
1728 
1729 	/*
1730 	 * If necessary, tell ARP/NDP about the new mapping.  Note that
1731 	 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills.
1732 	 */
1733 	if (act != Res_act_none) {
1734 		if (ill->ill_isv6) {
1735 			VERIFY(ipif_resolver_up(ipif, act) == 0);
1736 			err = ipif_ndp_up(ipif, act == Res_act_initial);
1737 		} else {
1738 			err = ipif_resolver_up(ipif, act);
1739 		}
1740 
1741 		/*
1742 		 * Since ipif_ndp_up() never returns EINPROGRESS and
1743 		 * ipif_resolver_up() only returns EINPROGRESS when the
1744 		 * associated ill is not up, we should never be here with
1745 		 * EINPROGRESS.  We rely on this to simplify the design.
1746 		 */
1747 		ASSERT(err != EINPROGRESS);
1748 	}
1749 	/* TODO: retry binding on failure? when? */
1750 	ipif->ipif_bound = (err == 0);
1751 }
1752 
1753 /*
1754  * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1755  * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1756  * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
1757  * B_TRUE, notify the resolver about the change.
1758  */
1759 static ipif_t *
1760 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
1761 {
1762 	ill_t *ipmp_ill;
1763 	ipif_t *previpif;
1764 	ip_stack_t *ipst = ill->ill_ipst;
1765 
1766 	ASSERT(IAM_WRITER_ILL(ill));
1767 	ASSERT(IS_UNDER_IPMP(ill));
1768 
1769 	ipmp_ill = ill->ill_grp->ig_ipmp_ill;
1770 
1771 	/*
1772 	 * If necessary, find an ipif to unbind.
1773 	 */
1774 	if (ipif == NULL) {
1775 		if ((ipif = ill->ill_bound_ipif) == NULL) {
1776 			ASSERT(ill->ill_bound_cnt == 0);
1777 			return (NULL);
1778 		}
1779 	}
1780 
1781 	ASSERT(IAM_WRITER_IPIF(ipif));
1782 	ASSERT(IS_IPMP(ipif->ipif_ill));
1783 	ASSERT(ipif->ipif_bound_ill == ill);
1784 	ASSERT(ill->ill_bound_cnt > 0);
1785 
1786 	/*
1787 	 * Unbind it.
1788 	 */
1789 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1790 	ipif->ipif_bound_ill = NULL;
1791 	rw_exit(&ipst->ips_ipmp_lock);
1792 	ill->ill_bound_cnt--;
1793 
1794 	if (ill->ill_bound_ipif == ipif) {
1795 		ill->ill_bound_ipif = ipif->ipif_bound_next;
1796 	} else {
1797 		previpif = ill->ill_bound_ipif;
1798 		while (previpif->ipif_bound_next != ipif)
1799 			previpif = previpif->ipif_bound_next;
1800 
1801 		previpif->ipif_bound_next = ipif->ipif_bound_next;
1802 	}
1803 	ipif->ipif_bound_next = NULL;
1804 
1805 	/*
1806 	 * If requested, notify the resolvers (provided we're bound).
1807 	 */
1808 	if (notifyres && ipif->ipif_bound) {
1809 		if (ill->ill_isv6) {
1810 			ipif_ndp_down(ipif);
1811 		} else {
1812 			ASSERT(ipif->ipif_arp_del_mp != NULL);
1813 			putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp);
1814 			ipif->ipif_arp_del_mp = NULL;
1815 		}
1816 	}
1817 	ipif->ipif_bound = B_FALSE;
1818 
1819 	return (ipif);
1820 }
1821 
1822 /*
1823  * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
1824  * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
1825  * to determine whether an ill should be considered active, other consumers
1826  * may race and learn about an ill that should be deactivated/activated before
1827  * IPMP has performed the activation/deactivation.  This should be safe though
1828  * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1829  * would've been cleaned up by ipmp_ill_deactivate().
1830  */
1831 boolean_t
1832 ipmp_ill_is_active(ill_t *ill)
1833 {
1834 	phyint_t *phyi = ill->ill_phyint;
1835 
1836 	ASSERT(IS_UNDER_IPMP(ill));
1837 	ASSERT(IAM_WRITER_ILL(ill) ||
1838 	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1839 
1840 	/*
1841 	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1842 	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
1843 	 * link flapping logic to be just in in.mpathd and allows us to ignore
1844 	 * changes to PHYI_RUNNING.
1845 	 */
1846 	return (!(ill->ill_ipif_up_count == 0 ||
1847 	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1848 }
1849 
1850 /*
1851  * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet
1852  * IREs with a source address on `ill_arg'.
1853  */
1854 static void
1855 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1856 {
1857 	ill_t *ill = (ill_t *)ill_arg;
1858 
1859 	ASSERT(IAM_WRITER_ILL(ill));
1860 	ASSERT(!IS_IPMP(ill));
1861 
1862 	if (ire->ire_ipif->ipif_ill != ill)
1863 		return;
1864 
1865 	switch (ire->ire_type) {
1866 	case IRE_HOST:
1867 	case IRE_PREFIX:
1868 	case IRE_DEFAULT:
1869 	case IRE_CACHE:
1870 	case IRE_IF_RESOLVER:
1871 	case IRE_IF_NORESOLVER:
1872 		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1873 		ire->ire_marks |= IRE_MARK_TESTHIDDEN;
1874 		break;
1875 	default:
1876 		break;
1877 	}
1878 }
1879 
1880 /*
1881  * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source
1882  * address on `ill_arg'.
1883  */
1884 static void
1885 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1886 {
1887 	ill_t *ill = (ill_t *)ill_arg;
1888 
1889 	ASSERT(IAM_WRITER_ILL(ill));
1890 	ASSERT(!IS_IPMP(ill));
1891 
1892 	if (ire->ire_ipif->ipif_ill == ill) {
1893 		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1894 		ire->ire_marks &= ~IRE_MARK_TESTHIDDEN;
1895 	}
1896 }
1897 
1898 /*
1899  * Return a held pointer to the IPMP ill for underlying interface `ill', or
1900  * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
1901  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1902  * ill_grp pointer may become stale when not under an IPSQ and not holding
1903  * ipmp_lock.)  Caller need not be inside the IPSQ.
1904  */
1905 ill_t *
1906 ipmp_ill_hold_ipmp_ill(ill_t *ill)
1907 {
1908 	ip_stack_t *ipst = ill->ill_ipst;
1909 	ipmp_illgrp_t *illg;
1910 
1911 	ASSERT(!IS_IPMP(ill));
1912 
1913 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1914 	illg = ill->ill_grp;
1915 	if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) {
1916 		ill_refhold(illg->ig_ipmp_ill);
1917 		rw_exit(&ipst->ips_ipmp_lock);
1918 		return (illg->ig_ipmp_ill);
1919 	}
1920 	/*
1921 	 * Assume `ill' was removed from the illgrp in the meantime.
1922 	 */
1923 	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1924 	return (NULL);
1925 }
1926 
1927 /*
1928  * Return the interface index for the IPMP ill tied to underlying interface
1929  * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
1930  */
1931 uint_t
1932 ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1933 {
1934 	uint_t ifindex = 0;
1935 	ip_stack_t *ipst = ill->ill_ipst;
1936 	ipmp_grp_t *grp;
1937 
1938 	ASSERT(!IS_IPMP(ill));
1939 
1940 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1941 	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1942 		ifindex = grp->gr_phyint->phyint_ifindex;
1943 	rw_exit(&ipst->ips_ipmp_lock);
1944 	return (ifindex);
1945 }
1946 
1947 /*
1948  * Place phyint `phyi' into IPMP group `grp'.
1949  */
1950 void
1951 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
1952 {
1953 	ill_t *ill;
1954 	ipsq_t *ipsq = phyi->phyint_ipsq;
1955 	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
1956 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1957 
1958 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1959 	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
1960 
1961 	/*
1962 	 * Send routing socket messages indicating that the phyint's ills
1963 	 * and ipifs vanished.
1964 	 */
1965 	if (phyi->phyint_illv4 != NULL) {
1966 		ill = phyi->phyint_illv4;
1967 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1968 	}
1969 
1970 	if (phyi->phyint_illv6 != NULL) {
1971 		ill = phyi->phyint_illv6;
1972 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1973 	}
1974 
1975 	/*
1976 	 * Snapshot the phyint's initial kstats as a baseline.
1977 	 */
1978 	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
1979 
1980 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1981 
1982 	phyi->phyint_grp = grp;
1983 	if (++grp->gr_nif == 1)
1984 		grp->gr_mactype = ill->ill_mactype;
1985 	else
1986 		ASSERT(grp->gr_mactype == ill->ill_mactype);
1987 
1988 	/*
1989 	 * Now that we're in the group, request a switch to the group's xop
1990 	 * when we ipsq_exit().  All future operations will be exclusive on
1991 	 * the group xop until ipmp_phyint_leave_grp() is called.
1992 	 */
1993 	ASSERT(ipsq->ipsq_swxop == NULL);
1994 	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
1995 	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
1996 
1997 	rw_exit(&ipst->ips_ipmp_lock);
1998 }
1999 
2000 /*
2001  * Remove phyint `phyi' from its current IPMP group.
2002  */
2003 void
2004 ipmp_phyint_leave_grp(phyint_t *phyi)
2005 {
2006 	uint_t i;
2007 	ipsq_t *ipsq = phyi->phyint_ipsq;
2008 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
2009 	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
2010 
2011 	ASSERT(IAM_WRITER_IPSQ(ipsq));
2012 
2013 	/*
2014 	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
2015 	 */
2016 	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
2017 		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
2018 	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
2019 		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
2020 
2021 	/*
2022 	 * Send routing socket messages indicating that the phyint's ills
2023 	 * and ipifs have reappeared.
2024 	 */
2025 	if (phyi->phyint_illv4 != NULL)
2026 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
2027 	if (phyi->phyint_illv6 != NULL)
2028 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
2029 
2030 	/*
2031 	 * Calculate the phyint's cumulative kstats while it was in the group,
2032 	 * and add that to the group's baseline.
2033 	 */
2034 	ipmp_phyint_get_kstats(phyi, phyi_kstats);
2035 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2036 		phyi_kstats[i] -= phyi->phyint_kstats0[i];
2037 		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
2038 	}
2039 
2040 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
2041 
2042 	phyi->phyint_grp->gr_nif--;
2043 	phyi->phyint_grp = NULL;
2044 
2045 	/*
2046 	 * As our final act in leaving the group, request a switch back to our
2047 	 * IPSQ's own xop when we ipsq_exit().
2048 	 */
2049 	ASSERT(ipsq->ipsq_swxop == NULL);
2050 	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
2051 
2052 	rw_exit(&ipst->ips_ipmp_lock);
2053 }
2054 
2055 /*
2056  * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
2057  * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
2058  */
2059 static void
2060 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
2061 {
2062 	uint_t		i, j;
2063 	const char	*name;
2064 	kstat_t		*ksp;
2065 	kstat_named_t	*kn;
2066 
2067 	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
2068 
2069 	/*
2070 	 * NOTE: ALL_ZONES here assumes that there's at most one link
2071 	 * with a given name on a given system (safe for now).
2072 	 */
2073 	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES);
2074 	if (ksp == NULL)
2075 		return;
2076 
2077 	KSTAT_ENTER(ksp);
2078 
2079 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
2080 		/*
2081 		 * Bring kstats up-to-date before recording.
2082 		 */
2083 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
2084 
2085 		kn = KSTAT_NAMED_PTR(ksp);
2086 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2087 			name = ipmp_kstats[i].name;
2088 			kstats[i] = 0;
2089 			for (j = 0; j < ksp->ks_ndata; j++) {
2090 				if (strcmp(kn[j].name, name) != 0)
2091 					continue;
2092 
2093 				switch (kn[j].data_type) {
2094 				case KSTAT_DATA_INT32:
2095 				case KSTAT_DATA_UINT32:
2096 					kstats[i] = kn[j].value.ui32;
2097 					break;
2098 #ifdef	_LP64
2099 				case KSTAT_DATA_LONG:
2100 				case KSTAT_DATA_ULONG:
2101 					kstats[i] = kn[j].value.ul;
2102 					break;
2103 #endif
2104 				case KSTAT_DATA_INT64:
2105 				case KSTAT_DATA_UINT64:
2106 					kstats[i] = kn[j].value.ui64;
2107 					break;
2108 				}
2109 				break;
2110 			}
2111 		}
2112 	}
2113 
2114 	KSTAT_EXIT(ksp);
2115 	kstat_rele(ksp);
2116 }
2117 
2118 /*
2119  * Refresh the active state of all ills on `phyi'.
2120  */
2121 void
2122 ipmp_phyint_refresh_active(phyint_t *phyi)
2123 {
2124 	if (phyi->phyint_illv4 != NULL)
2125 		ipmp_ill_refresh_active(phyi->phyint_illv4);
2126 	if (phyi->phyint_illv6 != NULL)
2127 		ipmp_ill_refresh_active(phyi->phyint_illv6);
2128 }
2129 
2130 /*
2131  * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2132  * doesn't exist.  Caller need not be inside the IPSQ.
2133  */
2134 ill_t *
2135 ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2136 {
2137 	ill_t *boundill;
2138 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2139 
2140 	ASSERT(IS_IPMP(ipif->ipif_ill));
2141 
2142 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2143 	boundill = ipif->ipif_bound_ill;
2144 	if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) {
2145 		ill_refhold(boundill);
2146 		rw_exit(&ipst->ips_ipmp_lock);
2147 		return (boundill);
2148 	}
2149 	rw_exit(&ipst->ips_ipmp_lock);
2150 	return (NULL);
2151 }
2152 
2153 /*
2154  * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2155  * doesn't exist.  Caller must be inside the IPSQ.
2156  */
2157 ill_t *
2158 ipmp_ipif_bound_ill(const ipif_t *ipif)
2159 {
2160 	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2161 	ASSERT(IS_IPMP(ipif->ipif_ill));
2162 
2163 	return (ipif->ipif_bound_ill);
2164 }
2165 
2166 /*
2167  * Check if `ipif' is a "stub" (placeholder address not being used).
2168  */
2169 boolean_t
2170 ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2171 {
2172 	if (ipif->ipif_flags & IPIF_UP)
2173 		return (B_FALSE);
2174 	if (ipif->ipif_ill->ill_isv6)
2175 		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2176 	else
2177 		return (ipif->ipif_lcl_addr == INADDR_ANY);
2178 }
2179 
2180 /*
2181  * Check if `ipif' is an IPMP data address.
2182  */
2183 boolean_t
2184 ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2185 {
2186 	if (ipif->ipif_flags & IPIF_NOFAILOVER)
2187 		return (B_FALSE);
2188 	if (ipif->ipif_ill->ill_isv6)
2189 		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2190 	else
2191 		return (ipif->ipif_lcl_addr != INADDR_ANY);
2192 }
2193 
2194 /*
2195  * Check if `ipif' is an IPIF_UP IPMP data address.
2196  */
2197 static boolean_t
2198 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2199 {
2200 	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2201 }
2202