xref: /illumos-gate/usr/src/uts/common/inet/ip/ipmp.c (revision a38ee58261c5aa81028a4329e73da4016006aa99)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
22  */
23 
24 #include <inet/ip.h>
25 #include <inet/ip6.h>
26 #include <inet/ip_if.h>
27 #include <inet/ip_ire.h>
28 #include <inet/ip_multi.h>
29 #include <inet/ip_ndp.h>
30 #include <inet/ip_rts.h>
31 #include <inet/mi.h>
32 #include <net/if_types.h>
33 #include <sys/dlpi.h>
34 #include <sys/kmem.h>
35 #include <sys/modhash.h>
36 #include <sys/sdt.h>
37 #include <sys/strsun.h>
38 #include <sys/sunddi.h>
39 #include <sys/types.h>
40 
/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.  IPMP_GRP_TO_IPST() goes through the group's
 * cached phyint (gr_phyint), and IPMP_ILLGRP_TO_IPST() goes through the
 * illgrp's IPMP meta-interface ill (ig_ipmp_ill); both dereference those
 * pointers, so they must be non-NULL when the macros are used.
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 * IPMP_GRP_HASH_SIZE is the size handed to mod_hash_create_extended() for
 * the per-stack group-name hash.
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
53 
/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 *
 * This template is copied into each group's kstat data area when the group's
 * kstats are created, and the update routine walks it by index alongside
 * gr_kstats0[] and phyint_kstats0[] (and addresses the link state entry as
 * IPMP_KSTAT_LINK_UP).  The entry order therefore presumably has to match
 * the IPMP_KSTAT_* index constants -- confirm against the header that
 * defines them before reordering or adding entries.
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};
74 
/*
 * Forward declarations for the file-local (static) helpers that are
 * referenced before their definitions.
 */
static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
static void	ipmp_ncec_delete_nonlocal(ncec_t *, uchar_t *);
94 
95 /*
96  * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
97  */
98 void
99 ipmp_init(ip_stack_t *ipst)
100 {
101 	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
102 	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
103 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
104 	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
105 }
106 
107 /*
108  * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
109  */
110 void
111 ipmp_destroy(ip_stack_t *ipst)
112 {
113 	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
114 	rw_destroy(&ipst->ips_ipmp_lock);
115 }
116 
117 /*
118  * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
119  * and add it to the hash.  On success, return a pointer to the created group.
120  * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
121  * meta-interface associated with the group also has the same name (but they
122  * may differ later via ipmp_grp_rename()).
123  */
124 ipmp_grp_t *
125 ipmp_grp_create(const char *grname, phyint_t *phyi)
126 {
127 	ipmp_grp_t *grp;
128 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
129 	mod_hash_hndl_t mh;
130 
131 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
132 
133 	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
134 		return (NULL);
135 
136 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
137 	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
138 
139 	/*
140 	 * Cache the group's phyint.  This is safe since a phyint_t will
141 	 * outlive its ipmp_grp_t.
142 	 */
143 	grp->gr_phyint = phyi;
144 
145 	/*
146 	 * Create IPMP group kstats.
147 	 */
148 	if (ipmp_grp_create_kstats(grp) != 0) {
149 		kmem_free(grp, sizeof (ipmp_grp_t));
150 		return (NULL);
151 	}
152 
153 	/*
154 	 * Insert the group into the hash.
155 	 */
156 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
157 		ipmp_grp_destroy_kstats(grp);
158 		kmem_free(grp, sizeof (ipmp_grp_t));
159 		return (NULL);
160 	}
161 	ipmp_grp_insert(grp, mh);
162 
163 	return (grp);
164 }
165 
166 /*
167  * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
168  */
169 static int
170 ipmp_grp_create_kstats(ipmp_grp_t *grp)
171 {
172 	kstat_t *ksp;
173 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
174 
175 	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
176 	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
177 	if (ksp == NULL)
178 		return (ENOMEM);
179 
180 	ksp->ks_update = ipmp_grp_update_kstats;
181 	ksp->ks_private = grp;
182 	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
183 
184 	kstat_install(ksp);
185 	grp->gr_ksp = ksp;
186 	return (0);
187 }
188 
189 /*
190  * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
191  */
192 static int
193 ipmp_grp_update_kstats(kstat_t *ksp, int rw)
194 {
195 	uint_t		i;
196 	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
197 	ipmp_grp_t	*grp = ksp->ks_private;
198 	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
199 	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
200 	phyint_t	*phyi;
201 	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];
202 
203 	if (rw == KSTAT_WRITE)
204 		return (EACCES);
205 
206 	/*
207 	 * Start with the group's baseline values.
208 	 */
209 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
210 		if (kn[i].data_type == KSTAT_DATA_UINT32) {
211 			kn[i].value.ui32 = grp->gr_kstats0[i];
212 		} else {
213 			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
214 			kn[i].value.ui64 = grp->gr_kstats0[i];
215 		}
216 	}
217 
218 	/*
219 	 * Add in the stats of each phyint currently in the group.  Since we
220 	 * don't directly track the phyints in a group, we cheat by walking
221 	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
222 	 * ill_g_lock is held.)
223 	 */
224 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
225 	ipsq = grp_ipsq->ipsq_next;
226 	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
227 		phyi = ipsq->ipsq_phyint;
228 
229 		/*
230 		 * If a phyint in a group is being unplumbed, it's possible
231 		 * that ill_glist_delete() -> phyint_free() already freed the
232 		 * phyint (and set ipsq_phyint to NULL), but the unplumb
233 		 * operation has yet to complete (and thus ipsq_dq() has yet
234 		 * to remove the phyint's IPSQ from the group IPSQ's phyint
235 		 * list).  We skip those phyints here (note that their kstats
236 		 * have already been added to gr_kstats0[]).
237 		 */
238 		if (phyi == NULL)
239 			continue;
240 
241 		ipmp_phyint_get_kstats(phyi, phyi_kstats);
242 
243 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
244 			phyi_kstats[i] -= phyi->phyint_kstats0[i];
245 			if (kn[i].data_type == KSTAT_DATA_UINT32)
246 				kn[i].value.ui32 += phyi_kstats[i];
247 			else
248 				kn[i].value.ui64 += phyi_kstats[i];
249 		}
250 	}
251 
252 	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
253 	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
254 
255 	rw_exit(&ipst->ips_ill_g_lock);
256 	return (0);
257 }
258 
259 /*
260  * Destroy IPMP kstat structures for `grp'.
261  */
262 static void
263 ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
264 {
265 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
266 
267 	kstat_delete_netstack(grp->gr_ksp, id);
268 	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
269 	grp->gr_ksp = NULL;
270 }
271 
272 /*
273  * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
274  * does not exist.
275  */
276 ipmp_grp_t *
277 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
278 {
279 	ipmp_grp_t *grp;
280 
281 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
282 
283 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
284 	    (mod_hash_val_t *)&grp) == 0)
285 		return (grp);
286 
287 	return (NULL);
288 }
289 
290 /*
291  * Place information about group `grp' into `lifgr'.
292  */
293 void
294 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
295 {
296 	ill_t *ill;
297 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
298 
299 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
300 
301 	lifgr->gi_v4 = (grp->gr_v4 != NULL);
302 	lifgr->gi_v6 = (grp->gr_v6 != NULL);
303 	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
304 	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
305 	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
306 	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
307 	lifgr->gi_m4ifname[0] = '\0';
308 	lifgr->gi_m6ifname[0] = '\0';
309 	lifgr->gi_bcifname[0] = '\0';
310 
311 	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
312 		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
313 		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
314 	}
315 
316 	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
317 		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
318 }
319 
320 /*
321  * Insert `grp' into the hash using the reserved hash entry `mh'.
322  * Caller must ensure `grp' is not yet in the hash.
323  */
324 static void
325 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
326 {
327 	int err;
328 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
329 
330 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
331 
332 	/*
333 	 * Since grp->gr_name will exist at least as long as `grp' is in the
334 	 * hash, we use it directly as the key.
335 	 */
336 	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
337 	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
338 	if (err != 0) {
339 		/*
340 		 * This should never happen since `mh' was preallocated.
341 		 */
342 		panic("cannot insert IPMP group \"%s\" (err %d)",
343 		    grp->gr_name, err);
344 	}
345 }
346 
347 /*
348  * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
349  */
350 static void
351 ipmp_grp_remove(ipmp_grp_t *grp)
352 {
353 	int err;
354 	mod_hash_val_t val;
355 	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
356 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
357 
358 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
359 
360 	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
361 	if (err != 0 || val != grp) {
362 		panic("cannot remove IPMP group \"%s\" (err %d)",
363 		    grp->gr_name, err);
364 	}
365 }
366 
367 /*
368  * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
369  * group name already exists or is invalid, or if there isn't enough memory.
370  */
371 int
372 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
373 {
374 	mod_hash_hndl_t mh;
375 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
376 
377 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
378 
379 	if (grname[0] == '\0')
380 		return (EINVAL);
381 
382 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
383 	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
384 		return (EEXIST);
385 
386 	/*
387 	 * Before we remove the group from the hash, ensure we'll be able to
388 	 * re-insert it by reserving space.
389 	 */
390 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
391 		return (ENOMEM);
392 
393 	ipmp_grp_remove(grp);
394 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
395 	ipmp_grp_insert(grp, mh);
396 
397 	return (0);
398 }
399 
400 /*
401  * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
402  * the hash, and that there are no interfaces on it.
403  */
404 void
405 ipmp_grp_destroy(ipmp_grp_t *grp)
406 {
407 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
408 
409 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
410 
411 	/*
412 	 * If there are still interfaces using this group, panic before things
413 	 * go really off the rails.
414 	 */
415 	if (grp->gr_nif != 0)
416 		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
417 
418 	ipmp_grp_remove(grp);
419 	ipmp_grp_destroy_kstats(grp);
420 
421 	ASSERT(grp->gr_v4 == NULL);
422 	ASSERT(grp->gr_v6 == NULL);
423 	ASSERT(grp->gr_nv4 == 0);
424 	ASSERT(grp->gr_nv6 == 0);
425 	ASSERT(grp->gr_nactif == 0);
426 	ASSERT(grp->gr_linkdownmp == NULL);
427 	grp->gr_phyint = NULL;
428 
429 	kmem_free(grp, sizeof (ipmp_grp_t));
430 }
431 
432 /*
433  * Check whether `ill' is suitable for inclusion into `grp', and return an
434  * errno describing the problem (if any).  NOTE: many of these errno values
435  * are interpreted by ifconfig, which will take corrective action and retry
436  * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
437  */
438 static int
439 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
440 {
441 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
442 
443 	ASSERT(IAM_WRITER_ILL(ill));
444 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
445 
446 	/*
447 	 * To sidestep complicated address migration logic in the kernel and
448 	 * to force the kernel's all-hosts multicast memberships to be blown
449 	 * away, all addresses that had been brought up must be brought back
450 	 * down prior to adding an interface to a group.  (This includes
451 	 * addresses currently down due to DAD.)  Once the interface has been
452 	 * added to the group, its addresses can then be brought back up, at
453 	 * which point they will be moved to the IPMP meta-interface.
454 	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
455 	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
456 	 */
457 	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
458 		return (EADDRINUSE);
459 
460 	/*
461 	 * To avoid confusing applications by changing addresses that are
462 	 * under their control, all such control must be removed prior to
463 	 * adding an interface into a group.
464 	 */
465 	if (ill_appaddr_cnt(ill) != 0)
466 		return (EADDRNOTAVAIL);
467 
468 	/*
469 	 * Since PTP addresses do not share the same broadcast domain, they
470 	 * are not allowed to be in an IPMP group.
471 	 */
472 	if (ill_ptpaddr_cnt(ill) != 0)
473 		return (EINVAL);
474 
475 	/*
476 	 * An ill must support multicast to be allowed into a group.
477 	 */
478 	if (!(ill->ill_flags & ILLF_MULTICAST))
479 		return (ENOTSUP);
480 
481 	/*
482 	 * An ill must strictly be using ARP and/or ND for address
483 	 * resolution for it to be allowed into a group.
484 	 */
485 	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
486 		return (ENOTSUP);
487 
488 	/*
489 	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
490 	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
491 	 * all its modifications as writer.)
492 	 */
493 	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
494 		return (ENOTSUP);
495 
496 	/*
497 	 * All ills in a group must be the same mactype.
498 	 */
499 	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
500 		return (EINVAL);
501 
502 	return (0);
503 }
504 
505 /*
506  * Check whether `phyi' is suitable for inclusion into `grp', and return an
507  * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
508  * regarding errno values.
509  */
510 int
511 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
512 {
513 	int err = 0;
514 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
515 
516 	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
517 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
518 
519 	/*
520 	 * An interface cannot have address families plumbed that are not
521 	 * configured in the group.
522 	 */
523 	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
524 	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
525 		return (EAFNOSUPPORT);
526 
527 	if (phyi->phyint_illv4 != NULL)
528 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
529 	if (err == 0 && phyi->phyint_illv6 != NULL)
530 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
531 
532 	return (err);
533 }
534 
535 /*
536  * Create a new illgrp on IPMP meta-interface `ill'.
537  */
538 ipmp_illgrp_t *
539 ipmp_illgrp_create(ill_t *ill)
540 {
541 	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
542 	ipmp_illgrp_t *illg;
543 
544 	ASSERT(IAM_WRITER_ILL(ill));
545 	ASSERT(IS_IPMP(ill));
546 	ASSERT(ill->ill_grp == NULL);
547 
548 	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
549 		return (NULL);
550 
551 	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
552 	list_create(&illg->ig_actif, sizeof (ill_t),
553 	    offsetof(ill_t, ill_actnode));
554 	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
555 	    offsetof(ipmp_arpent_t, ia_node));
556 
557 	illg->ig_ipmp_ill = ill;
558 	ill->ill_grp = illg;
559 	ipmp_illgrp_set_mtu(illg, mtu, mtu);
560 
561 	return (illg);
562 }
563 
564 /*
565  * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
566  */
567 void
568 ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
569 {
570 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
571 	ASSERT(IS_IPMP(illg->ig_ipmp_ill));
572 
573 	/*
574 	 * Verify `illg' is empty.
575 	 */
576 	ASSERT(illg->ig_next_ill == NULL);
577 	ASSERT(illg->ig_cast_ill == NULL);
578 	ASSERT(list_is_empty(&illg->ig_arpent));
579 	ASSERT(list_is_empty(&illg->ig_if));
580 	ASSERT(list_is_empty(&illg->ig_actif));
581 	ASSERT(illg->ig_nactif == 0);
582 
583 	/*
584 	 * Destroy `illg'.
585 	 */
586 	illg->ig_ipmp_ill->ill_grp = NULL;
587 	illg->ig_ipmp_ill = NULL;
588 	list_destroy(&illg->ig_if);
589 	list_destroy(&illg->ig_actif);
590 	list_destroy(&illg->ig_arpent);
591 	kmem_free(illg, sizeof (ipmp_illgrp_t));
592 }
593 
594 /*
595  * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
596  * bind it to an underlying ill, while keeping an even address distribution.
597  * If the bind is successful, return a pointer to the bound ill.
598  */
599 ill_t *
600 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
601 {
602 	ill_t *minill;
603 	ipmp_arpent_t *entp;
604 
605 	ASSERT(IAM_WRITER_IPIF(ipif));
606 	ASSERT(ipmp_ipif_is_dataaddr(ipif));
607 
608 	/*
609 	 * IPMP data address mappings are internally managed by IP itself, so
610 	 * delete any existing ARP entries associated with the address.
611 	 */
612 	if (!ipif->ipif_isv6) {
613 		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
614 		if (entp != NULL)
615 			ipmp_illgrp_destroy_arpent(illg, entp);
616 	}
617 
618 	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
619 		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
620 
621 	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
622 }
623 
624 /*
625  * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
626  * bound, unbind it from the underlying ill while keeping an even address
627  * distribution.
628  */
629 void
630 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
631 {
632 	ill_t *maxill, *boundill = ipif->ipif_bound_ill;
633 
634 	ASSERT(IAM_WRITER_IPIF(ipif));
635 
636 	if (boundill != NULL) {
637 		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
638 
639 		maxill = ipmp_illgrp_max_ill(illg);
640 		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
641 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
642 			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
643 		}
644 	}
645 }
646 
647 /*
648  * Return the active ill with the greatest number of data addresses in `illg'.
649  */
650 static ill_t *
651 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
652 {
653 	ill_t *ill, *bestill = NULL;
654 
655 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
656 
657 	ill = list_head(&illg->ig_actif);
658 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
659 		if (bestill == NULL ||
660 		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
661 			bestill = ill;
662 		}
663 	}
664 	return (bestill);
665 }
666 
667 /*
668  * Return the active ill with the fewest number of data addresses in `illg'.
669  */
670 static ill_t *
671 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
672 {
673 	ill_t *ill, *bestill = NULL;
674 
675 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
676 
677 	ill = list_head(&illg->ig_actif);
678 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
679 		if (bestill == NULL ||
680 		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
681 			if (ill->ill_bound_cnt == 0)
682 				return (ill);	 /* can't get better */
683 			bestill = ill;
684 		}
685 	}
686 	return (bestill);
687 }
688 
689 /*
690  * Return a pointer to IPMP meta-interface for `illg' (which must exist).
691  * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
692  */
693 ill_t *
694 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
695 {
696 	return (illg->ig_ipmp_ill);
697 }
698 
699 /*
700  * Return a pointer to the next available underlying ill in `illg', or NULL if
701  * one doesn't exist.  Caller must be inside the IPSQ.
702  */
703 ill_t *
704 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
705 {
706 	ill_t *ill;
707 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
708 
709 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
710 
711 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
712 	if ((ill = illg->ig_next_ill) != NULL) {
713 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
714 		if (illg->ig_next_ill == NULL)
715 			illg->ig_next_ill = list_head(&illg->ig_actif);
716 	}
717 	rw_exit(&ipst->ips_ipmp_lock);
718 
719 	return (ill);
720 }
721 
722 /*
723  * Return a held pointer to the next available underlying ill in `illg', or
724  * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
725  */
726 ill_t *
727 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
728 {
729 	ill_t *ill;
730 	uint_t i;
731 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
732 
733 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
734 	for (i = 0; i < illg->ig_nactif; i++) {
735 		ill = illg->ig_next_ill;
736 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
737 		if (illg->ig_next_ill == NULL)
738 			illg->ig_next_ill = list_head(&illg->ig_actif);
739 
740 		if (ill_check_and_refhold(ill)) {
741 			rw_exit(&ipst->ips_ipmp_lock);
742 			return (ill);
743 		}
744 	}
745 	rw_exit(&ipst->ips_ipmp_lock);
746 
747 	return (NULL);
748 }
749 
750 /*
751  * Return a held pointer to the nominated multicast ill in `illg', or NULL if
752  * one doesn't exist.  Caller need not be inside the IPSQ.
753  */
754 ill_t *
755 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
756 {
757 	ill_t *castill;
758 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
759 
760 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
761 	castill = illg->ig_cast_ill;
762 	if (castill != NULL && ill_check_and_refhold(castill)) {
763 		rw_exit(&ipst->ips_ipmp_lock);
764 		return (castill);
765 	}
766 	rw_exit(&ipst->ips_ipmp_lock);
767 	return (NULL);
768 }
769 
770 /*
771  * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
772  * any existing nomination is removed.  Caller must be inside the IPSQ.
773  */
774 static void
775 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
776 {
777 	ill_t *ocastill = illg->ig_cast_ill;
778 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
779 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
780 
781 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
782 
783 	/*
784 	 * Disable old nominated ill (if any).
785 	 */
786 	if (ocastill != NULL) {
787 		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
788 		    illg, ill_t *, ocastill);
789 		ASSERT(ocastill->ill_nom_cast);
790 		ocastill->ill_nom_cast = B_FALSE;
791 		/*
792 		 * If the IPMP meta-interface is down, we never did the join,
793 		 * so we must not try to leave.
794 		 */
795 		if (ipmp_ill->ill_dl_up)
796 			ill_leave_multicast(ipmp_ill);
797 
798 		/*
799 		 * Delete any NCEs tied to the old nomination.  We must do this
800 		 * last since ill_leave_multicast() may trigger IREs to be
801 		 * built using ig_cast_ill.
802 		 */
803 		ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
804 		    ocastill->ill_ipst);
805 	}
806 
807 	/*
808 	 * Set new nomination.
809 	 */
810 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
811 	illg->ig_cast_ill = castill;
812 	rw_exit(&ipst->ips_ipmp_lock);
813 
814 	/*
815 	 * Enable new nominated ill (if any).
816 	 */
817 	if (castill != NULL) {
818 		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
819 		    illg, ill_t *, castill);
820 		ASSERT(!castill->ill_nom_cast);
821 		castill->ill_nom_cast = B_TRUE;
822 		/*
823 		 * If the IPMP meta-interface is down, the attempt to recover
824 		 * will silently fail but ill_need_recover_multicast will be
825 		 * erroneously cleared -- so check first.
826 		 */
827 		if (ipmp_ill->ill_dl_up)
828 			ill_recover_multicast(ipmp_ill);
829 	}
830 }
831 
832 /*
833  * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
834  * entry for the same IP address already exists, destroy it first.  Return the
835  * created IPMP ARP entry, or NULL on failure.
836  */
837 ipmp_arpent_t *
838 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
839     ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
840 {
841 	ipmp_arpent_t *entp, *oentp;
842 
843 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
844 
845 	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
846 	    KM_NOSLEEP)) == NULL)
847 		return (NULL);
848 
849 	/*
850 	 * Delete any existing ARP entry for this address.
851 	 */
852 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
853 		ipmp_illgrp_destroy_arpent(illg, oentp);
854 
855 	/*
856 	 * Prepend the new entry.
857 	 */
858 	entp->ia_ipaddr = ipaddr;
859 	entp->ia_flags = flags;
860 	entp->ia_lladdr_len = lladdr_len;
861 	entp->ia_lladdr = (uchar_t *)&entp[1];
862 	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
863 	entp->ia_proxyarp = proxyarp;
864 	entp->ia_notified = B_TRUE;
865 	list_insert_head(&illg->ig_arpent, entp);
866 	return (entp);
867 }
868 
869 /*
870  * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
871  */
872 void
873 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
874 {
875 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
876 
877 	list_remove(&illg->ig_arpent, entp);
878 	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
879 }
880 
881 /*
882  * Mark that ARP has been notified about the IP address on `entp'; `illg' is
883  * taken as a debugging aid for DTrace FBT probes.
884  */
885 /* ARGSUSED */
886 void
887 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
888 {
889 	entp->ia_notified = B_TRUE;
890 }
891 
892 /*
893  * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
894  * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
895  */
896 ipmp_arpent_t *
897 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
898 {
899 	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
900 
901 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
902 
903 	if (addrp == NULL)
904 		return (entp);
905 
906 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
907 		if (entp->ia_ipaddr == *addrp)
908 			break;
909 	return (entp);
910 }
911 
912 /*
913  * Refresh ARP entries on `illg' to be distributed across its active
914  * interfaces.  Entries that cannot be refreshed (e.g., because there are no
915  * active interfaces) are marked so that subsequent calls can try again.
916  */
917 void
918 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
919 {
920 	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
921 	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
922 	ipmp_arpent_t *entp;
923 	ncec_t *ncec;
924 	nce_t  *nce;
925 
926 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
927 	ASSERT(!ipmp_ill->ill_isv6);
928 
929 	ill = list_head(&illg->ig_actif);
930 	entp = list_head(&illg->ig_arpent);
931 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
932 		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
933 			entp->ia_notified = B_FALSE;
934 			continue;
935 		}
936 
937 		ASSERT(paddrlen == ill->ill_phys_addr_length);
938 
939 		/*
940 		 * If this is a proxy ARP entry, we can skip notifying ARP if
941 		 * the entry is already up-to-date.  If it has changed, we
942 		 * update the entry's hardware address before notifying ARP.
943 		 */
944 		if (entp->ia_proxyarp) {
945 			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
946 			    paddrlen) == 0 && entp->ia_notified)
947 				continue;
948 			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
949 		}
950 
951 		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
952 		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
953 		    &nce);
954 		if (nce == NULL || !entp->ia_proxyarp) {
955 			if (nce != NULL)
956 				nce_refrele(nce);
957 			continue;
958 		}
959 		ncec = nce->nce_common;
960 		mutex_enter(&ncec->ncec_lock);
961 		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
962 		mutex_exit(&ncec->ncec_lock);
963 		nce_refrele(nce);
964 		ipmp_illgrp_mark_arpent(illg, entp);
965 
966 		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
967 			ill = list_head(&illg->ig_actif);
968 	}
969 }
970 
971 /*
972  * Return an interface in `illg' with the specified `physaddr', or NULL if one
973  * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
974  */
975 ill_t *
976 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
977 {
978 	ill_t *ill;
979 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
980 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
981 
982 	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
983 
984 	ill = list_head(&illg->ig_if);
985 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
986 		if (ill->ill_phys_addr_length == paddrlen &&
987 		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
988 			return (ill);
989 	}
990 	return (NULL);
991 }
992 
993 /*
994  * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
995  * Caller must be inside the IPSQ unless this is initialization.
996  */
997 static void
998 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu)
999 {
1000 	ill_t *ill = illg->ig_ipmp_ill;
1001 	mblk_t *mp;
1002 
1003 	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
1004 
1005 	/*
1006 	 * If allocation fails, we have bigger problems than MTU.
1007 	 */
1008 	if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) {
1009 		illg->ig_mtu = mtu;
1010 		illg->ig_mc_mtu = mc_mtu;
1011 		put(ill->ill_rq, mp);
1012 	}
1013 }
1014 
1015 /*
1016  * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
1017  * ill MTU if necessary.
1018  */
1019 void
1020 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
1021 {
1022 	ill_t *ill;
1023 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
1024 	uint_t mtu = 0;
1025 	uint_t mc_mtu = 0;
1026 
1027 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
1028 
1029 	/*
1030 	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
1031 	 * for each ill as we iterate through the list.  Any changes to the
1032 	 * ill_mtu will also trigger an update, so even if we missed it
1033 	 * this time around, the update will catch it.
1034 	 */
1035 	ill = list_head(&illg->ig_if);
1036 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1037 		mutex_enter(&ill->ill_lock);
1038 		if (mtu == 0 || ill->ill_mtu < mtu)
1039 			mtu = ill->ill_mtu;
1040 		if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu)
1041 			mc_mtu = ill->ill_mc_mtu;
1042 		mutex_exit(&ill->ill_lock);
1043 	}
1044 
1045 	/*
1046 	 * MTU must be at least the minimum MTU.
1047 	 */
1048 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1049 	mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1050 	if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu)
1051 		ipmp_illgrp_set_mtu(illg, mtu, mc_mtu);
1052 }
1053 
1054 /*
1055  * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
1056  * allow the same link to be established more than once.
1057  */
1058 void
1059 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1060 {
1061 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1062 
1063 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1064 
1065 	if (illg->ig_ipmp_ill->ill_isv6) {
1066 		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1067 		grp->gr_v6 = illg;
1068 	} else {
1069 		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1070 		grp->gr_v4 = illg;
1071 	}
1072 }
1073 
1074 /*
1075  * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
1076  * cannot be unlinked (e.g., because there are still interfaces using it).
1077  */
1078 int
1079 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1080 {
1081 	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1082 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1083 
1084 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1085 
1086 	if (illg->ig_ipmp_ill->ill_isv6) {
1087 		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1088 			return (EBUSY);
1089 		grp->gr_v6 = NULL;
1090 	} else {
1091 		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1092 			return (EBUSY);
1093 		grp->gr_v4 = NULL;
1094 	}
1095 	return (0);
1096 }
1097 
1098 /*
1099  * Place `ill' into `illg', and rebalance the data addresses on `illg'
1100  * to be spread evenly across the ills now in it.  Also, adjust the IPMP
1101  * ill as necessary to account for `ill' (e.g., MTU).
1102  */
1103 void
1104 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
1105 {
1106 	ill_t *ipmp_ill;
1107 	ipif_t *ipif;
1108 	ip_stack_t *ipst = ill->ill_ipst;
1109 
1110 	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
1111 	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
1112 	ASSERT(IAM_WRITER_ILL(ill));
1113 	ASSERT(ill->ill_grp == NULL);
1114 
1115 	ipmp_ill = illg->ig_ipmp_ill;
1116 
1117 	/*
1118 	 * Account for `ill' joining the illgrp.
1119 	 */
1120 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1121 	if (ill->ill_isv6)
1122 		ill->ill_phyint->phyint_grp->gr_nv6++;
1123 	else
1124 		ill->ill_phyint->phyint_grp->gr_nv4++;
1125 	rw_exit(&ipst->ips_ipmp_lock);
1126 
1127 	/*
1128 	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
1129 	 */
1130 	mutex_enter(&ill->ill_lock);
1131 	if (ipmp_ill->ill_flags & ILLF_ROUTER)
1132 		ill->ill_flags |= ILLF_ROUTER;
1133 	else
1134 		ill->ill_flags &= ~ILLF_ROUTER;
1135 	mutex_exit(&ill->ill_lock);
1136 
1137 	/*
1138 	 * Blow away all multicast memberships that currently exist on `ill'.
1139 	 * This may seem odd, but it's consistent with the application view
1140 	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
1141 	 * The ill_grp_pending bit prevents multicast group joins after
1142 	 * update_conn_ill() and before ill_grp assignment.
1143 	 */
1144 	mutex_enter(&ill->ill_mcast_serializer);
1145 	ill->ill_grp_pending = 1;
1146 	mutex_exit(&ill->ill_mcast_serializer);
1147 	update_conn_ill(ill, ill->ill_ipst);
1148 	if (ill->ill_isv6) {
1149 		reset_mrt_ill(ill);
1150 	} else {
1151 		ipif = ill->ill_ipif;
1152 		for (; ipif != NULL; ipif = ipif->ipif_next) {
1153 			reset_mrt_vif_ipif(ipif);
1154 		}
1155 	}
1156 	ip_purge_allmulti(ill);
1157 
1158 	/*
1159 	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
1160 	 * physical address length.  All other ills must have the same value,
1161 	 * since they are required to all be the same mactype.  Also update
1162 	 * the IPMP ill's MTU and CoS marking, if necessary.
1163 	 */
1164 	if (list_is_empty(&illg->ig_if)) {
1165 		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
1166 		/*
1167 		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
1168 		 * doesn't have a physical address.  This means that code must
1169 		 * not assume that ill_phys_addr is non-NULL just because
1170 		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
1171 		 */
1172 		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
1173 		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
1174 		ipmp_ill->ill_type = ill->ill_type;
1175 
1176 		if (ill->ill_flags & ILLF_COS_ENABLED) {
1177 			mutex_enter(&ipmp_ill->ill_lock);
1178 			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1179 			mutex_exit(&ipmp_ill->ill_lock);
1180 		}
1181 		ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu);
1182 	} else {
1183 		ASSERT(ipmp_ill->ill_phys_addr_length ==
1184 		    ill->ill_phys_addr_length);
1185 		ASSERT(ipmp_ill->ill_type == ill->ill_type);
1186 
1187 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1188 			mutex_enter(&ipmp_ill->ill_lock);
1189 			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1190 			mutex_exit(&ipmp_ill->ill_lock);
1191 		}
1192 		if (illg->ig_mtu > ill->ill_mtu ||
1193 		    illg->ig_mc_mtu > ill->ill_mc_mtu) {
1194 			ipmp_illgrp_set_mtu(illg, ill->ill_mtu,
1195 			    ill->ill_mc_mtu);
1196 		}
1197 	}
1198 
1199 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1200 	list_insert_tail(&illg->ig_if, ill);
1201 	ill->ill_grp = illg;
1202 	rw_exit(&ipst->ips_ill_g_lock);
1203 
1204 	mutex_enter(&ill->ill_mcast_serializer);
1205 	ill->ill_grp_pending = 0;
1206 	mutex_exit(&ill->ill_mcast_serializer);
1207 
1208 	/*
1209 	 * Hide the IREs on `ill' so that we don't accidentally find them when
1210 	 * sending data traffic.
1211 	 */
1212 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
1213 
1214 	ipmp_ill_refresh_active(ill);
1215 }
1216 
1217 /*
1218  * Remove `ill' from its illgrp, and rebalance the data addresses in that
1219  * illgrp to be spread evenly across the remaining ills.  Also, adjust the
1220  * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1221  */
1222 void
1223 ipmp_ill_leave_illgrp(ill_t *ill)
1224 {
1225 	ill_t *ipmp_ill;
1226 	ipif_t *ipif;
1227 	ipmp_arpent_t *entp;
1228 	ipmp_illgrp_t *illg = ill->ill_grp;
1229 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1230 
1231 	ASSERT(IS_UNDER_IPMP(ill));
1232 	ASSERT(IAM_WRITER_ILL(ill));
1233 	ASSERT(illg != NULL);
1234 
1235 	ipmp_ill = illg->ig_ipmp_ill;
1236 
1237 	/*
1238 	 * Cancel IPMP-specific ill timeouts.
1239 	 */
1240 	(void) untimeout(ill->ill_refresh_tid);
1241 
1242 	/*
1243 	 * Expose any previously-hidden IREs on `ill'.
1244 	 */
1245 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
1246 
1247 	/*
1248 	 * Ensure the multicast state for each ipif on `ill' is down so that
1249 	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
1250 	 * all eligible groups.
1251 	 */
1252 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1253 		if (ipif->ipif_flags & IPIF_UP)
1254 			ipif_multicast_down(ipif);
1255 
1256 	/*
1257 	 * Account for `ill' leaving the illgrp.
1258 	 */
1259 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1260 	if (ill->ill_isv6)
1261 		ill->ill_phyint->phyint_grp->gr_nv6--;
1262 	else
1263 		ill->ill_phyint->phyint_grp->gr_nv4--;
1264 	rw_exit(&ipst->ips_ipmp_lock);
1265 
1266 	/*
1267 	 * Pull `ill' out of the interface lists.
1268 	 */
1269 	if (list_link_active(&ill->ill_actnode))
1270 		ipmp_ill_deactivate(ill);
1271 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1272 	list_remove(&illg->ig_if, ill);
1273 	ill->ill_grp = NULL;
1274 	rw_exit(&ipst->ips_ill_g_lock);
1275 
1276 	/*
1277 	 * Re-establish multicast memberships that were previously being
1278 	 * handled by the IPMP meta-interface.
1279 	 */
1280 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1281 		if (ipif->ipif_flags & IPIF_UP)
1282 			ipif_multicast_up(ipif);
1283 
1284 	/*
1285 	 * Refresh the group MTU based on the new interface list.
1286 	 */
1287 	ipmp_illgrp_refresh_mtu(illg);
1288 
1289 	if (list_is_empty(&illg->ig_if)) {
1290 		/*
1291 		 * No ills left in the illgrp; we no longer have a physical
1292 		 * address length, nor can we support ARP, CoS, or anything
1293 		 * else that depends on knowing the link layer type.
1294 		 */
1295 		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
1296 			ipmp_illgrp_destroy_arpent(illg, entp);
1297 
1298 		ipmp_ill->ill_phys_addr_length = 0;
1299 		ipmp_ill->ill_nd_lla_len = 0;
1300 		ipmp_ill->ill_type = IFT_OTHER;
1301 		mutex_enter(&ipmp_ill->ill_lock);
1302 		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1303 		mutex_exit(&ipmp_ill->ill_lock);
1304 	} else {
1305 		/*
1306 		 * If `ill' didn't support CoS, see if it can now be enabled.
1307 		 */
1308 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1309 			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
1310 
1311 			ill = list_head(&illg->ig_if);
1312 			do {
1313 				if (!(ill->ill_flags & ILLF_COS_ENABLED))
1314 					break;
1315 			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);
1316 
1317 			if (ill == NULL) {
1318 				mutex_enter(&ipmp_ill->ill_lock);
1319 				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1320 				mutex_exit(&ipmp_ill->ill_lock);
1321 			}
1322 		}
1323 	}
1324 }
1325 
1326 /*
1327  * Check if `ill' should be active, and activate or deactivate if need be.
1328  * Return B_FALSE if a refresh was necessary but could not be performed.
1329  */
1330 static boolean_t
1331 ipmp_ill_try_refresh_active(ill_t *ill)
1332 {
1333 	boolean_t refreshed = B_TRUE;
1334 
1335 	ASSERT(IAM_WRITER_ILL(ill));
1336 	ASSERT(IS_UNDER_IPMP(ill));
1337 
1338 	if (ipmp_ill_is_active(ill)) {
1339 		if (!list_link_active(&ill->ill_actnode))
1340 			refreshed = ipmp_ill_activate(ill);
1341 	} else {
1342 		if (list_link_active(&ill->ill_actnode))
1343 			ipmp_ill_deactivate(ill);
1344 	}
1345 
1346 	return (refreshed);
1347 }
1348 
1349 /*
1350  * Check if `ill' should be active, and activate or deactivate if need be.
1351  * If the refresh fails, schedule a timer to try again later.
1352  */
1353 void
1354 ipmp_ill_refresh_active(ill_t *ill)
1355 {
1356 	if (!ipmp_ill_try_refresh_active(ill))
1357 		ipmp_ill_refresh_active_timer_start(ill);
1358 }
1359 
1360 /*
1361  * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1362  */
1363 static void
1364 ipmp_ill_refresh_active_timer(void *ill_arg)
1365 {
1366 	ill_t *ill = ill_arg;
1367 	boolean_t refreshed = B_FALSE;
1368 
1369 	/*
1370 	 * Clear ill_refresh_tid to indicate that no timeout is pending
1371 	 * (another thread could schedule a new timeout while we're still
1372 	 * running, but that's harmless).  If the ill is going away, bail.
1373 	 */
1374 	mutex_enter(&ill->ill_lock);
1375 	ill->ill_refresh_tid = 0;
1376 	if (ill->ill_state_flags & ILL_CONDEMNED) {
1377 		mutex_exit(&ill->ill_lock);
1378 		return;
1379 	}
1380 	mutex_exit(&ill->ill_lock);
1381 
1382 	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
1383 		refreshed = ipmp_ill_try_refresh_active(ill);
1384 		ipsq_exit(ill->ill_phyint->phyint_ipsq);
1385 	}
1386 
1387 	/*
1388 	 * If the refresh failed, schedule another attempt.
1389 	 */
1390 	if (!refreshed)
1391 		ipmp_ill_refresh_active_timer_start(ill);
1392 }
1393 
1394 /*
1395  * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'.
1396  */
1397 static void
1398 ipmp_ill_refresh_active_timer_start(ill_t *ill)
1399 {
1400 	mutex_enter(&ill->ill_lock);
1401 
1402 	/*
1403 	 * If the ill is going away or a refresh is already scheduled, bail.
1404 	 */
1405 	if (ill->ill_refresh_tid != 0 ||
1406 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
1407 		mutex_exit(&ill->ill_lock);
1408 		return;
1409 	}
1410 
1411 	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
1412 	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
1413 
1414 	mutex_exit(&ill->ill_lock);
1415 }
1416 
1417 /*
1418  * Activate `ill' so it will be used to send and receive data traffic.  Return
1419  * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
1420  * needed to deactivate `ill' here as well so that deactivation cannot fail.
1421  */
1422 static boolean_t
1423 ipmp_ill_activate(ill_t *ill)
1424 {
1425 	ipif_t		*ipif;
1426 	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
1427 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1428 	ipmp_illgrp_t	*illg = ill->ill_grp;
1429 	ill_t		*maxill;
1430 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1431 
1432 	ASSERT(IAM_WRITER_ILL(ill));
1433 	ASSERT(IS_UNDER_IPMP(ill));
1434 
1435 	/*
1436 	 * If this will be the first active interface in the group, allocate
1437 	 * the link-up and link-down messages.
1438 	 */
1439 	if (grp->gr_nactif == 0) {
1440 		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
1441 		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
1442 		if (linkupmp == NULL || linkdownmp == NULL)
1443 			goto fail;
1444 	}
1445 
1446 	if (list_is_empty(&illg->ig_actif)) {
1447 		/*
1448 		 * Now that we have an active ill, nominate it for multicast
1449 		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
1450 		 * since that may need to send multicast packets (e.g., IPv6
1451 		 * neighbor discovery probes).
1452 		 */
1453 		ipmp_illgrp_set_cast(illg, ill);
1454 
1455 		/*
1456 		 * This is the first active ill in the illgrp -- add 'em all.
1457 		 * We can access/walk ig_ipmp_ill's ipif list since we're
1458 		 * writer on its IPSQ as well.
1459 		 */
1460 		ipif = illg->ig_ipmp_ill->ill_ipif;
1461 		for (; ipif != NULL; ipif = ipif->ipif_next)
1462 			if (ipmp_ipif_is_up_dataaddr(ipif))
1463 				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
1464 	} else {
1465 		/*
1466 		 * Redistribute the addresses by moving them from the ill with
1467 		 * the most addresses until the ill being activated is at the
1468 		 * same level as the rest of the ills.
1469 		 */
1470 		for (;;) {
1471 			maxill = ipmp_illgrp_max_ill(illg);
1472 			ASSERT(maxill != NULL);
1473 			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
1474 				break;
1475 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
1476 			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
1477 		}
1478 	}
1479 
1480 	/*
1481 	 * Put the interface in the active list.
1482 	 */
1483 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1484 	list_insert_tail(&illg->ig_actif, ill);
1485 	illg->ig_nactif++;
1486 	illg->ig_next_ill = ill;
1487 	rw_exit(&ipst->ips_ipmp_lock);
1488 
1489 	/*
1490 	 * Refresh static/proxy ARP entries to use `ill', if need be.
1491 	 */
1492 	if (!ill->ill_isv6)
1493 		ipmp_illgrp_refresh_arpent(illg);
1494 
1495 	/*
1496 	 * Finally, mark the group link up, if necessary.
1497 	 */
1498 	if (grp->gr_nactif++ == 0) {
1499 		ASSERT(grp->gr_linkdownmp == NULL);
1500 		grp->gr_linkdownmp = linkdownmp;
1501 		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
1502 	}
1503 	return (B_TRUE);
1504 fail:
1505 	freemsg(linkupmp);
1506 	freemsg(linkdownmp);
1507 	return (B_FALSE);
1508 }
1509 
1510 /*
1511  * Deactivate `ill' so it will not be used to send or receive data traffic.
1512  */
1513 static void
1514 ipmp_ill_deactivate(ill_t *ill)
1515 {
1516 	ill_t		*minill, *ipmp_ill;
1517 	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
1518 	mblk_t		*mp;
1519 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1520 	ipmp_illgrp_t	*illg = ill->ill_grp;
1521 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1522 
1523 	ASSERT(IAM_WRITER_ILL(ill));
1524 	ASSERT(IS_UNDER_IPMP(ill));
1525 
1526 	ipmp_ill = illg->ig_ipmp_ill;
1527 
1528 	/*
1529 	 * Pull the interface out of the active list.
1530 	 */
1531 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1532 	list_remove(&illg->ig_actif, ill);
1533 	illg->ig_nactif--;
1534 	illg->ig_next_ill = list_head(&illg->ig_actif);
1535 	rw_exit(&ipst->ips_ipmp_lock);
1536 
1537 	/*
1538 	 * If the ill that's being deactivated had been nominated for
1539 	 * multicast/broadcast, nominate a new one.
1540 	 */
1541 	if (ill == illg->ig_cast_ill)
1542 		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
1543 
1544 	/*
1545 	 * Delete all nce_t entries using this ill, so that the next attempt
1546 	 * to send data traffic will revalidate cached nce's.
1547 	 */
1548 	nce_flush(ill, B_TRUE);
1549 
1550 	/*
1551 	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
1552 	 * we'll rebind them after we tell the resolver the ill is no longer
1553 	 * active.  We must do things in this order or the resolver could
1554 	 * accidentally rebind to the ill we're trying to remove if multiple
1555 	 * ills in the group have the same hardware address (which is
1556 	 * unsupported, but shouldn't lead to a wedged machine).
1557 	 */
1558 	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
1559 		ipif->ipif_bound_next = ubheadipif;
1560 		ubheadipif = ipif;
1561 	}
1562 
1563 	if (!ill->ill_isv6) {
1564 		/*
1565 		 * Refresh static/proxy ARP entries that had been using `ill'.
1566 		 */
1567 		ipmp_illgrp_refresh_arpent(illg);
1568 	}
1569 
1570 	/*
1571 	 * Rebind each ipif from the deactivated ill to the active ill with
1572 	 * the fewest ipifs.  If there are no active ills, the ipifs will
1573 	 * remain unbound.
1574 	 */
1575 	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
1576 		ubnextipif = ipif->ipif_bound_next;
1577 		ipif->ipif_bound_next = NULL;
1578 
1579 		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
1580 			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
1581 	}
1582 
1583 	/*
1584 	 * Remove any IRE_IF_CLONEs for this ill since they might have an
1585 	 * ire_nce_cache/nce_common which refers to another ill in the group.
1586 	 */
1587 	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, ill,
1588 	    ill);
1589 
1590 	/*
1591 	 * Finally, if there are no longer any active interfaces, then delete
1592 	 * any NCECs associated with the group and mark the group link down.
1593 	 */
1594 	if (--grp->gr_nactif == 0) {
1595 		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill, ipmp_ill, ipst);
1596 		mp = grp->gr_linkdownmp;
1597 		grp->gr_linkdownmp = NULL;
1598 		ASSERT(mp != NULL);
1599 		put(ipmp_ill->ill_rq, mp);
1600 	}
1601 }
1602 
1603 /*
1604  * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1605  * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1606  */
1607 static void
1608 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1609 {
1610 	ipif_t *ipif;
1611 
1612 	ASSERT(IAM_WRITER_ILL(ill));
1613 	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1614 
1615 	/*
1616 	 * If `ill' is truly down, there are no messages to generate since:
1617 	 *
1618 	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1619 	 *    and its addresses by bringing them down.  But that's already
1620 	 *    true, so there's nothing to hide.
1621 	 *
1622 	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1623 	 *    indicating that any previously-hidden up addresses are again
1624 	 *    back up (along with the interface).  But they aren't, so
1625 	 *    there's nothing to expose.
1626 	 */
1627 	if (ill->ill_ipif_up_count == 0)
1628 		return;
1629 
1630 	if (cmd == RTM_ADD)
1631 		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1632 
1633 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1634 		if (ipif->ipif_flags & IPIF_UP)
1635 			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1636 
1637 	if (cmd == RTM_DELETE)
1638 		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1639 }
1640 
1641 /*
1642  * Bind the address named by `ipif' to the underlying ill named by `ill'.
1643  * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
1644  * will indicate to the resolver whether this is an initial bringup of
1645  * `ipif', or just a rebind to another ill.
1646  */
1647 static void
1648 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
1649 {
1650 	int err = 0;
1651 	ip_stack_t *ipst = ill->ill_ipst;
1652 
1653 	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
1654 	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
1655 	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
1656 	ASSERT(ipif->ipif_bound_ill == NULL);
1657 	ASSERT(ipif->ipif_bound_next == NULL);
1658 
1659 	ipif->ipif_bound_next = ill->ill_bound_ipif;
1660 	ill->ill_bound_ipif = ipif;
1661 	ill->ill_bound_cnt++;
1662 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1663 	ipif->ipif_bound_ill = ill;
1664 	rw_exit(&ipst->ips_ipmp_lock);
1665 
1666 	/*
1667 	 * If necessary, tell ARP/NDP about the new mapping.  Note that
1668 	 * ipif_resolver_up() cannot fail for IPv6 ills.
1669 	 */
1670 	if (act != Res_act_none) {
1671 		if (ill->ill_isv6) {
1672 			VERIFY(ipif_resolver_up(ipif, act) == 0);
1673 			err = ipif_ndp_up(ipif, act == Res_act_initial);
1674 		} else {
1675 			err = ipif_resolver_up(ipif, act);
1676 		}
1677 
1678 		/*
1679 		 * Since ipif_ndp_up() never returns EINPROGRESS and
1680 		 * ipif_resolver_up() only returns EINPROGRESS when the
1681 		 * associated ill is not up, we should never be here with
1682 		 * EINPROGRESS.  We rely on this to simplify the design.
1683 		 */
1684 		ASSERT(err != EINPROGRESS);
1685 	}
1686 	/* TODO: retry binding on failure? when? */
1687 	ipif->ipif_bound = (err == 0);
1688 }
1689 
1690 /*
1691  * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1692  * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1693  * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
1694  * B_TRUE, notify the resolver about the change.
1695  */
1696 static ipif_t *
1697 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
1698 {
1699 	ipif_t *previpif;
1700 	ip_stack_t *ipst = ill->ill_ipst;
1701 
1702 	ASSERT(IAM_WRITER_ILL(ill));
1703 	ASSERT(IS_UNDER_IPMP(ill));
1704 
1705 	/*
1706 	 * If necessary, find an ipif to unbind.
1707 	 */
1708 	if (ipif == NULL) {
1709 		if ((ipif = ill->ill_bound_ipif) == NULL) {
1710 			ASSERT(ill->ill_bound_cnt == 0);
1711 			return (NULL);
1712 		}
1713 	}
1714 
1715 	ASSERT(IAM_WRITER_IPIF(ipif));
1716 	ASSERT(IS_IPMP(ipif->ipif_ill));
1717 	ASSERT(ipif->ipif_bound_ill == ill);
1718 	ASSERT(ill->ill_bound_cnt > 0);
1719 
1720 	/*
1721 	 * Unbind it.
1722 	 */
1723 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1724 	ipif->ipif_bound_ill = NULL;
1725 	rw_exit(&ipst->ips_ipmp_lock);
1726 	ill->ill_bound_cnt--;
1727 
1728 	if (ill->ill_bound_ipif == ipif) {
1729 		ill->ill_bound_ipif = ipif->ipif_bound_next;
1730 	} else {
1731 		previpif = ill->ill_bound_ipif;
1732 		while (previpif->ipif_bound_next != ipif)
1733 			previpif = previpif->ipif_bound_next;
1734 
1735 		previpif->ipif_bound_next = ipif->ipif_bound_next;
1736 	}
1737 	ipif->ipif_bound_next = NULL;
1738 
1739 	/*
1740 	 * If requested, notify the resolvers (provided we're bound).
1741 	 */
1742 	if (notifyres && ipif->ipif_bound) {
1743 		if (ill->ill_isv6)
1744 			ipif_ndp_down(ipif);
1745 		else
1746 			(void) ipif_arp_down(ipif);
1747 	}
1748 	ipif->ipif_bound = B_FALSE;
1749 
1750 	return (ipif);
1751 }
1752 
1753 /*
1754  * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
1755  * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
1756  * to determine whether an ill should be considered active, other consumers
1757  * may race and learn about an ill that should be deactivated/activated before
1758  * IPMP has performed the activation/deactivation.  This should be safe though
1759  * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1760  * would've been cleaned up by ipmp_ill_deactivate().
1761  */
1762 boolean_t
1763 ipmp_ill_is_active(ill_t *ill)
1764 {
1765 	phyint_t *phyi = ill->ill_phyint;
1766 
1767 	ASSERT(IS_UNDER_IPMP(ill));
1768 	ASSERT(IAM_WRITER_ILL(ill) ||
1769 	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1770 
1771 	/*
1772 	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1773 	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
1774 	 * link flapping logic to be just in in.mpathd and allows us to ignore
1775 	 * changes to PHYI_RUNNING.
1776 	 */
1777 	return (!(ill->ill_ipif_up_count == 0 ||
1778 	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1779 }
1780 
1781 /*
1782  * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
1783  * with `ill_arg'.
1784  */
1785 static void
1786 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1787 {
1788 	ill_t *ill = (ill_t *)ill_arg;
1789 
1790 	ASSERT(IAM_WRITER_ILL(ill));
1791 	ASSERT(!IS_IPMP(ill));
1792 
1793 	if (ire->ire_ill != ill)
1794 		return;
1795 
1796 	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
1797 		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1798 		ire->ire_testhidden = B_TRUE;
1799 	}
1800 }
1801 
1802 /*
1803  * IRE walker callback: clear ire_testhidden if the IRE has a source address
1804  * on `ill_arg'.
1805  */
1806 static void
1807 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1808 {
1809 	ill_t *ill = (ill_t *)ill_arg;
1810 
1811 	ASSERT(IAM_WRITER_ILL(ill));
1812 	ASSERT(!IS_IPMP(ill));
1813 
1814 	if (ire->ire_ill == ill) {
1815 		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1816 		ire->ire_testhidden = B_FALSE;
1817 	}
1818 }
1819 
1820 /*
1821  * Return a held pointer to the IPMP ill for underlying interface `ill', or
1822  * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
1823  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1824  * ill_grp pointer may become stale when not inside an IPSQ and not holding
1825  * ipmp_lock.)  Caller need not be inside the IPSQ.
1826  */
1827 ill_t *
1828 ipmp_ill_hold_ipmp_ill(ill_t *ill)
1829 {
1830 	ip_stack_t *ipst = ill->ill_ipst;
1831 	ipmp_illgrp_t *illg;
1832 
1833 	ASSERT(!IS_IPMP(ill));
1834 
1835 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1836 	illg = ill->ill_grp;
1837 	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
1838 		rw_exit(&ipst->ips_ipmp_lock);
1839 		return (illg->ig_ipmp_ill);
1840 	}
1841 	/*
1842 	 * Assume `ill' was removed from the illgrp in the meantime.
1843 	 */
1844 	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1845 	return (NULL);
1846 }
1847 
1848 /*
1849  * Return a held pointer to the appropriate underlying ill for sending the
1850  * specified type of packet.  (Unfortunately, this function needs to take an
1851  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1852  * ill_grp pointer may become stale when not inside an IPSQ and not holding
1853  * ipmp_lock.)  Caller need not be inside the IPSQ.
1854  */
1855 ill_t *
1856 ipmp_ill_hold_xmit_ill(ill_t *ill, boolean_t is_unicast)
1857 {
1858 	ill_t *xmit_ill;
1859 	ip_stack_t *ipst = ill->ill_ipst;
1860 
1861 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1862 	if (ill->ill_grp == NULL) {
1863 		/*
1864 		 * The ill was taken out of the group, so just send on it.
1865 		 */
1866 		rw_exit(&ipst->ips_ill_g_lock);
1867 		ill_refhold(ill);
1868 		return (ill);
1869 	}
1870 	if (is_unicast)
1871 		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
1872 	else
1873 		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
1874 	rw_exit(&ipst->ips_ill_g_lock);
1875 
1876 	return (xmit_ill);
1877 }
1878 
1879 /*
1880  * Return the interface index for the IPMP ill tied to underlying interface
1881  * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
1882  */
1883 uint_t
1884 ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1885 {
1886 	uint_t ifindex = 0;
1887 	ip_stack_t *ipst = ill->ill_ipst;
1888 	ipmp_grp_t *grp;
1889 
1890 	ASSERT(!IS_IPMP(ill));
1891 
1892 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1893 	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1894 		ifindex = grp->gr_phyint->phyint_ifindex;
1895 	rw_exit(&ipst->ips_ipmp_lock);
1896 	return (ifindex);
1897 }
1898 
1899 /*
1900  * Place phyint `phyi' into IPMP group `grp'.
1901  */
1902 void
1903 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
1904 {
1905 	ill_t *ill;
1906 	ipsq_t *ipsq = phyi->phyint_ipsq;
1907 	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
1908 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1909 
1910 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1911 	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
1912 
1913 	/*
1914 	 * Send routing socket messages indicating that the phyint's ills
1915 	 * and ipifs vanished.
1916 	 */
1917 	if (phyi->phyint_illv4 != NULL) {
1918 		ill = phyi->phyint_illv4;
1919 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1920 	}
1921 
1922 	if (phyi->phyint_illv6 != NULL) {
1923 		ill = phyi->phyint_illv6;
1924 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1925 	}
1926 
1927 	/*
1928 	 * Snapshot the phyint's initial kstats as a baseline.
1929 	 */
1930 	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
1931 
1932 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1933 
1934 	phyi->phyint_grp = grp;
1935 	if (++grp->gr_nif == 1)
1936 		grp->gr_mactype = ill->ill_mactype;
1937 	else
1938 		ASSERT(grp->gr_mactype == ill->ill_mactype);
1939 
1940 	/*
1941 	 * Now that we're in the group, request a switch to the group's xop
1942 	 * when we ipsq_exit().  All future operations will be exclusive on
1943 	 * the group xop until ipmp_phyint_leave_grp() is called.
1944 	 */
1945 	ASSERT(ipsq->ipsq_swxop == NULL);
1946 	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
1947 	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
1948 
1949 	rw_exit(&ipst->ips_ipmp_lock);
1950 }
1951 
1952 /*
1953  * Remove phyint `phyi' from its current IPMP group.
1954  */
1955 void
1956 ipmp_phyint_leave_grp(phyint_t *phyi)
1957 {
1958 	uint_t i;
1959 	ipsq_t *ipsq = phyi->phyint_ipsq;
1960 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1961 	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
1962 
1963 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1964 
1965 	/*
1966 	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
1967 	 */
1968 	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
1969 		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
1970 	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
1971 		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
1972 
1973 	/*
1974 	 * Send routing socket messages indicating that the phyint's ills
1975 	 * and ipifs have reappeared.
1976 	 */
1977 	if (phyi->phyint_illv4 != NULL)
1978 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
1979 	if (phyi->phyint_illv6 != NULL)
1980 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
1981 
1982 	/*
1983 	 * Calculate the phyint's cumulative kstats while it was in the group,
1984 	 * and add that to the group's baseline.
1985 	 */
1986 	ipmp_phyint_get_kstats(phyi, phyi_kstats);
1987 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
1988 		phyi_kstats[i] -= phyi->phyint_kstats0[i];
1989 		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
1990 	}
1991 
1992 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1993 
1994 	phyi->phyint_grp->gr_nif--;
1995 	phyi->phyint_grp = NULL;
1996 
1997 	/*
1998 	 * As our final act in leaving the group, request a switch back to our
1999 	 * IPSQ's own xop when we ipsq_exit().
2000 	 */
2001 	ASSERT(ipsq->ipsq_swxop == NULL);
2002 	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
2003 
2004 	rw_exit(&ipst->ips_ipmp_lock);
2005 }
2006 
2007 /*
2008  * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
2009  * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
2010  */
2011 static void
2012 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
2013 {
2014 	uint_t		i, j;
2015 	const char	*name;
2016 	kstat_t		*ksp;
2017 	kstat_named_t	*kn;
2018 	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
2019 	zoneid_t	zoneid;
2020 
2021 	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
2022 	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
2023 	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
2024 	if (ksp == NULL)
2025 		return;
2026 
2027 	KSTAT_ENTER(ksp);
2028 
2029 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
2030 		/*
2031 		 * Bring kstats up-to-date before recording.
2032 		 */
2033 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
2034 
2035 		kn = KSTAT_NAMED_PTR(ksp);
2036 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2037 			name = ipmp_kstats[i].name;
2038 			kstats[i] = 0;
2039 			for (j = 0; j < ksp->ks_ndata; j++) {
2040 				if (strcmp(kn[j].name, name) != 0)
2041 					continue;
2042 
2043 				switch (kn[j].data_type) {
2044 				case KSTAT_DATA_INT32:
2045 				case KSTAT_DATA_UINT32:
2046 					kstats[i] = kn[j].value.ui32;
2047 					break;
2048 #ifdef	_LP64
2049 				case KSTAT_DATA_LONG:
2050 				case KSTAT_DATA_ULONG:
2051 					kstats[i] = kn[j].value.ul;
2052 					break;
2053 #endif
2054 				case KSTAT_DATA_INT64:
2055 				case KSTAT_DATA_UINT64:
2056 					kstats[i] = kn[j].value.ui64;
2057 					break;
2058 				}
2059 				break;
2060 			}
2061 		}
2062 	}
2063 
2064 	KSTAT_EXIT(ksp);
2065 	kstat_rele(ksp);
2066 }
2067 
2068 /*
2069  * Refresh the active state of all ills on `phyi'.
2070  */
2071 void
2072 ipmp_phyint_refresh_active(phyint_t *phyi)
2073 {
2074 	if (phyi->phyint_illv4 != NULL)
2075 		ipmp_ill_refresh_active(phyi->phyint_illv4);
2076 	if (phyi->phyint_illv6 != NULL)
2077 		ipmp_ill_refresh_active(phyi->phyint_illv6);
2078 }
2079 
2080 /*
2081  * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2082  * doesn't exist.  Caller need not be inside the IPSQ.
2083  */
2084 ill_t *
2085 ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2086 {
2087 	ill_t *boundill;
2088 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2089 
2090 	ASSERT(IS_IPMP(ipif->ipif_ill));
2091 
2092 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2093 	boundill = ipif->ipif_bound_ill;
2094 	if (boundill != NULL && ill_check_and_refhold(boundill)) {
2095 		rw_exit(&ipst->ips_ipmp_lock);
2096 		return (boundill);
2097 	}
2098 	rw_exit(&ipst->ips_ipmp_lock);
2099 	return (NULL);
2100 }
2101 
2102 /*
2103  * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2104  * doesn't exist.  Caller must be inside the IPSQ.
2105  */
2106 ill_t *
2107 ipmp_ipif_bound_ill(const ipif_t *ipif)
2108 {
2109 	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2110 	ASSERT(IS_IPMP(ipif->ipif_ill));
2111 
2112 	return (ipif->ipif_bound_ill);
2113 }
2114 
2115 /*
2116  * Check if `ipif' is a "stub" (placeholder address not being used).
2117  */
2118 boolean_t
2119 ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2120 {
2121 	if (ipif->ipif_flags & IPIF_UP)
2122 		return (B_FALSE);
2123 	if (ipif->ipif_ill->ill_isv6)
2124 		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2125 	else
2126 		return (ipif->ipif_lcl_addr == INADDR_ANY);
2127 }
2128 
2129 /*
2130  * Check if `ipif' is an IPMP data address.
2131  */
2132 boolean_t
2133 ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2134 {
2135 	if (ipif->ipif_flags & IPIF_NOFAILOVER)
2136 		return (B_FALSE);
2137 	if (ipif->ipif_ill->ill_isv6)
2138 		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2139 	else
2140 		return (ipif->ipif_lcl_addr != INADDR_ANY);
2141 }
2142 
2143 /*
2144  * Check if `ipif' is an IPIF_UP IPMP data address.
2145  */
2146 static boolean_t
2147 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2148 {
2149 	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2150 }
2151 
2152 /*
2153  * Check if `mp' contains a probe packet by checking if the IP source address
2154  * is a test address on underlying interface `ill'.  Caller need not be inside
2155  * the IPSQ.
2156  */
2157 boolean_t
2158 ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
2159 {
2160 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2161 	ipha_t *ipha = (ipha_t *)mp->b_rptr;
2162 
2163 	ASSERT(DB_TYPE(mp) != M_CTL);
2164 
2165 	if (!IS_UNDER_IPMP(ill))
2166 		return (B_FALSE);
2167 
2168 	if (ill->ill_isv6) {
2169 		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
2170 		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
2171 			return (B_TRUE);
2172 	} else {
2173 		if (ipha->ipha_src != INADDR_ANY &&
2174 		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
2175 			return (B_TRUE);
2176 	}
2177 	return (B_FALSE);
2178 }
2179 
2180 /*
2181  * NCEC walker callback: delete `ncec' if it is associated with `ill_arg' and
2182  * is not one of our local addresses.  Caller must be inside the IPSQ.
2183  */
2184 static void
2185 ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *ill_arg)
2186 {
2187 	if (!NCE_MYADDR(ncec) && ncec->ncec_ill == (ill_t *)ill_arg)
2188 		ncec_delete(ncec);
2189 }
2190 
2191 /*
2192  * Delete any NCEs tied to the illgrp associated with `ncec'.  Caller need not
2193  * be inside the IPSQ.
2194  */
2195 void
2196 ipmp_ncec_delete_nce(ncec_t *ncec)
2197 {
2198 	ipmp_illgrp_t	*illg = ncec->ncec_ill->ill_grp;
2199 	ip_stack_t	*ipst = ncec->ncec_ipst;
2200 	ill_t		*ill;
2201 	nce_t		*nce;
2202 	list_t		dead;
2203 
2204 	ASSERT(IS_IPMP(ncec->ncec_ill));
2205 
2206 	/*
2207 	 * For each underlying interface, delete `ncec' from its ill_nce list
2208 	 * via nce_fastpath_list_delete().  Defer the actual nce_refrele()
2209 	 * until we've dropped ill_g_lock.
2210 	 */
2211 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
2212 
2213 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2214 	ill = list_head(&illg->ig_if);
2215 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2216 		nce_fastpath_list_delete(ill, ncec, &dead);
2217 	rw_exit(&ipst->ips_ill_g_lock);
2218 
2219 	while ((nce = list_remove_head(&dead)) != NULL)
2220 		nce_refrele(nce);
2221 
2222 	list_destroy(&dead);
2223 }
2224 
2225 /*
2226  * Refresh any NCE entries tied to the illgrp associated with `ncec' to
2227  * use the information in `ncec'.  Caller need not be inside the IPSQ.
2228  */
2229 void
2230 ipmp_ncec_refresh_nce(ncec_t *ncec)
2231 {
2232 	ipmp_illgrp_t	*illg = ncec->ncec_ill->ill_grp;
2233 	ip_stack_t	*ipst = ncec->ncec_ipst;
2234 	ill_t		*ill;
2235 	nce_t		*nce, *nce_next;
2236 	list_t		replace;
2237 
2238 	ASSERT(IS_IPMP(ncec->ncec_ill));
2239 
2240 	/*
2241 	 * If `ncec' is not reachable, there is no use in refreshing NCEs.
2242 	 */
2243 	if (!NCE_ISREACHABLE(ncec))
2244 		return;
2245 
2246 	/*
2247 	 * Find all the NCEs matching ncec->ncec_addr.  We cannot update them
2248 	 * in-situ because we're holding ipmp_lock to prevent changes to IPMP
2249 	 * group membership and updating indirectly calls nce_fastpath_probe()
2250 	 * -> putnext() which cannot hold locks.  Thus, move the NCEs to a
2251 	 * separate list and process that list after dropping ipmp_lock.
2252 	 */
2253 	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
2254 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2255 	ill = list_head(&illg->ig_actif);
2256 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
2257 		mutex_enter(&ill->ill_lock);
2258 		nce = list_head(&ill->ill_nce);
2259 		for (; nce != NULL; nce = nce_next) {
2260 			nce_next = list_next(&ill->ill_nce, nce);
2261 			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
2262 			    &ncec->ncec_addr)) {
2263 				nce_refhold(nce);
2264 				nce_delete(nce);
2265 				list_insert_tail(&replace, nce);
2266 			}
2267 		}
2268 		mutex_exit(&ill->ill_lock);
2269 	}
2270 	rw_exit(&ipst->ips_ipmp_lock);
2271 
2272 	/*
2273 	 * Process the list; nce_lookup_then_add_v* ensures that nce->nce_ill
2274 	 * is still in the group for ncec->ncec_ill.
2275 	 */
2276 	while ((nce = list_remove_head(&replace)) != NULL) {
2277 		if (ncec->ncec_ill->ill_isv6) {
2278 			(void) nce_lookup_then_add_v6(nce->nce_ill,
2279 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2280 			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
2281 			    NULL);
2282 		} else {
2283 			ipaddr_t ipaddr;
2284 
2285 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
2286 			(void) nce_lookup_then_add_v4(nce->nce_ill,
2287 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2288 			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
2289 		}
2290 		nce_refrele(nce);
2291 	}
2292 
2293 	list_destroy(&replace);
2294 }
2295