xref: /titanic_52/usr/src/uts/common/inet/ip/ipmp.c (revision 9eb19f4d61679ca0382def038665019234458edd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  *
21  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
22  * Use is subject to license terms.
23  */
24 
25 #include <inet/ip.h>
26 #include <inet/ip6.h>
27 #include <inet/ip_if.h>
28 #include <inet/ip_ire.h>
29 #include <inet/ip_multi.h>
30 #include <inet/ip_ndp.h>
31 #include <inet/ip_rts.h>
32 #include <inet/mi.h>
33 #include <net/if_types.h>
34 #include <sys/dlpi.h>
35 #include <sys/kmem.h>
36 #include <sys/modhash.h>
37 #include <sys/sdt.h>
38 #include <sys/strsun.h>
39 #include <sys/sunddi.h>
40 #include <sys/types.h>
41 
/*
 * Shorthand for reaching the ip_stack_t behind an IPMP object: an
 * ipmp_illgrp_t goes through its IPMP meta-interface ill, while an
 * ipmp_grp_t goes through its cached phyint.
 */
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
48 
/*
 * Fixed parameters; none of these is significant enough to be a tunable.
 */
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */
#define	IPMP_GRP_HASH_SIZE		64	/* size given to the group hash */
54 
55 
56 /*
57  * IPMP meta-interface kstats (based on those in PSARC/1997/198).
58  */
59 static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
60 	{ "obytes",	KSTAT_DATA_UINT32 },
61 	{ "obytes64",	KSTAT_DATA_UINT64 },
62 	{ "rbytes",	KSTAT_DATA_UINT32 },
63 	{ "rbytes64",	KSTAT_DATA_UINT64 },
64 	{ "opackets",	KSTAT_DATA_UINT32 },
65 	{ "opackets64",	KSTAT_DATA_UINT64 },
66 	{ "oerrors",	KSTAT_DATA_UINT32 },
67 	{ "ipackets",	KSTAT_DATA_UINT32 },
68 	{ "ipackets64",	KSTAT_DATA_UINT64 },
69 	{ "ierrors",	KSTAT_DATA_UINT32 },
70 	{ "multircv",	KSTAT_DATA_UINT32 },
71 	{ "multixmt",	KSTAT_DATA_UINT32 },
72 	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
73 	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
74 	{ "link_up",	KSTAT_DATA_UINT32 }
75 };
76 
77 static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
78 static int	ipmp_grp_create_kstats(ipmp_grp_t *);
79 static int	ipmp_grp_update_kstats(kstat_t *, int);
80 static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
81 static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
82 static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
83 static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
84 static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
85 static boolean_t ipmp_ill_activate(ill_t *);
86 static void	ipmp_ill_deactivate(ill_t *);
87 static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
88 static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
89 static void	ipmp_ill_refresh_active_timer_start(ill_t *);
90 static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
91 static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
92 static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
93 static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
94 static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
95 
96 /*
97  * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
98  */
99 void
100 ipmp_init(ip_stack_t *ipst)
101 {
102 	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
103 	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
104 	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
105 	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
106 }
107 
108 /*
109  * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
110  */
111 void
112 ipmp_destroy(ip_stack_t *ipst)
113 {
114 	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
115 	rw_destroy(&ipst->ips_ipmp_lock);
116 }
117 
118 /*
119  * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
120  * and add it to the hash.  On success, return a pointer to the created group.
121  * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
122  * meta-interface associated with the group also has the same name (but they
123  * may differ later via ipmp_grp_rename()).
124  */
125 ipmp_grp_t *
126 ipmp_grp_create(const char *grname, phyint_t *phyi)
127 {
128 	ipmp_grp_t *grp;
129 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
130 	mod_hash_hndl_t mh;
131 
132 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
133 
134 	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
135 		return (NULL);
136 
137 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
138 	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
139 
140 	/*
141 	 * Cache the group's phyint.  This is safe since a phyint_t will
142 	 * outlive its ipmp_grp_t.
143 	 */
144 	grp->gr_phyint = phyi;
145 
146 	/*
147 	 * Create IPMP group kstats.
148 	 */
149 	if (ipmp_grp_create_kstats(grp) != 0) {
150 		kmem_free(grp, sizeof (ipmp_grp_t));
151 		return (NULL);
152 	}
153 
154 	/*
155 	 * Insert the group into the hash.
156 	 */
157 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
158 		ipmp_grp_destroy_kstats(grp);
159 		kmem_free(grp, sizeof (ipmp_grp_t));
160 		return (NULL);
161 	}
162 	ipmp_grp_insert(grp, mh);
163 
164 	return (grp);
165 }
166 
167 /*
168  * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
169  */
170 static int
171 ipmp_grp_create_kstats(ipmp_grp_t *grp)
172 {
173 	kstat_t *ksp;
174 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
175 
176 	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
177 	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
178 	if (ksp == NULL)
179 		return (ENOMEM);
180 
181 	ksp->ks_update = ipmp_grp_update_kstats;
182 	ksp->ks_private = grp;
183 	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
184 
185 	kstat_install(ksp);
186 	grp->gr_ksp = ksp;
187 	return (0);
188 }
189 
190 /*
191  * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
192  */
193 static int
194 ipmp_grp_update_kstats(kstat_t *ksp, int rw)
195 {
196 	uint_t		i;
197 	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
198 	ipmp_grp_t	*grp = ksp->ks_private;
199 	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
200 	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
201 	phyint_t	*phyi;
202 	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];
203 
204 	if (rw == KSTAT_WRITE)
205 		return (EACCES);
206 
207 	/*
208 	 * Start with the group's baseline values.
209 	 */
210 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
211 		if (kn[i].data_type == KSTAT_DATA_UINT32) {
212 			kn[i].value.ui32 = grp->gr_kstats0[i];
213 		} else {
214 			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
215 			kn[i].value.ui64 = grp->gr_kstats0[i];
216 		}
217 	}
218 
219 	/*
220 	 * Add in the stats of each phyint currently in the group.  Since we
221 	 * don't directly track the phyints in a group, we cheat by walking
222 	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
223 	 * ill_g_lock is held.)
224 	 */
225 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
226 	ipsq = grp_ipsq->ipsq_next;
227 	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
228 		phyi = ipsq->ipsq_phyint;
229 
230 		/*
231 		 * If a phyint in a group is being unplumbed, it's possible
232 		 * that ill_glist_delete() -> phyint_free() already freed the
233 		 * phyint (and set ipsq_phyint to NULL), but the unplumb
234 		 * operation has yet to complete (and thus ipsq_dq() has yet
235 		 * to remove the phyint's IPSQ from the group IPSQ's phyint
236 		 * list).  We skip those phyints here (note that their kstats
237 		 * have already been added to gr_kstats0[]).
238 		 */
239 		if (phyi == NULL)
240 			continue;
241 
242 		ipmp_phyint_get_kstats(phyi, phyi_kstats);
243 
244 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
245 			phyi_kstats[i] -= phyi->phyint_kstats0[i];
246 			if (kn[i].data_type == KSTAT_DATA_UINT32)
247 				kn[i].value.ui32 += phyi_kstats[i];
248 			else
249 				kn[i].value.ui64 += phyi_kstats[i];
250 		}
251 	}
252 
253 	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
254 	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
255 
256 	rw_exit(&ipst->ips_ill_g_lock);
257 	return (0);
258 }
259 
260 /*
261  * Destroy IPMP kstat structures for `grp'.
262  */
263 static void
264 ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
265 {
266 	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
267 
268 	kstat_delete_netstack(grp->gr_ksp, id);
269 	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
270 	grp->gr_ksp = NULL;
271 }
272 
273 /*
274  * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
275  * does not exist.
276  */
277 ipmp_grp_t *
278 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
279 {
280 	ipmp_grp_t *grp;
281 
282 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
283 
284 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
285 	    (mod_hash_val_t *)&grp) == 0)
286 		return (grp);
287 
288 	return (NULL);
289 }
290 
291 /*
292  * Place information about group `grp' into `lifgr'.
293  */
294 void
295 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
296 {
297 	ill_t *ill;
298 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
299 
300 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
301 
302 	lifgr->gi_v4 = (grp->gr_v4 != NULL);
303 	lifgr->gi_v6 = (grp->gr_v6 != NULL);
304 	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
305 	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
306 	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
307 	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
308 	lifgr->gi_m4ifname[0] = '\0';
309 	lifgr->gi_m6ifname[0] = '\0';
310 	lifgr->gi_bcifname[0] = '\0';
311 
312 	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
313 		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
314 		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
315 	}
316 
317 	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
318 		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
319 }
320 
321 /*
322  * Insert `grp' into the hash using the reserved hash entry `mh'.
323  * Caller must ensure `grp' is not yet in the hash.
324  */
325 static void
326 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
327 {
328 	int err;
329 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
330 
331 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
332 
333 	/*
334 	 * Since grp->gr_name will exist at least as long as `grp' is in the
335 	 * hash, we use it directly as the key.
336 	 */
337 	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
338 	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
339 	if (err != 0) {
340 		/*
341 		 * This should never happen since `mh' was preallocated.
342 		 */
343 		panic("cannot insert IPMP group \"%s\" (err %d)",
344 		    grp->gr_name, err);
345 	}
346 }
347 
348 /*
349  * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
350  */
351 static void
352 ipmp_grp_remove(ipmp_grp_t *grp)
353 {
354 	int err;
355 	mod_hash_val_t val;
356 	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
357 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
358 
359 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
360 
361 	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
362 	if (err != 0 || val != grp) {
363 		panic("cannot remove IPMP group \"%s\" (err %d)",
364 		    grp->gr_name, err);
365 	}
366 }
367 
368 /*
369  * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
370  * group name already exists or is invalid, or if there isn't enough memory.
371  */
372 int
373 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
374 {
375 	mod_hash_hndl_t mh;
376 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
377 
378 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
379 
380 	if (grname[0] == '\0')
381 		return (EINVAL);
382 
383 	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
384 	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
385 		return (EEXIST);
386 
387 	/*
388 	 * Before we remove the group from the hash, ensure we'll be able to
389 	 * re-insert it by reserving space.
390 	 */
391 	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
392 		return (ENOMEM);
393 
394 	ipmp_grp_remove(grp);
395 	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
396 	ipmp_grp_insert(grp, mh);
397 
398 	return (0);
399 }
400 
401 /*
402  * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
403  * the hash, and that there are no interfaces on it.
404  */
405 void
406 ipmp_grp_destroy(ipmp_grp_t *grp)
407 {
408 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
409 
410 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
411 
412 	/*
413 	 * If there are still interfaces using this group, panic before things
414 	 * go really off the rails.
415 	 */
416 	if (grp->gr_nif != 0)
417 		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
418 
419 	ipmp_grp_remove(grp);
420 	ipmp_grp_destroy_kstats(grp);
421 
422 	ASSERT(grp->gr_v4 == NULL);
423 	ASSERT(grp->gr_v6 == NULL);
424 	ASSERT(grp->gr_nv4 == 0);
425 	ASSERT(grp->gr_nv6 == 0);
426 	ASSERT(grp->gr_nactif == 0);
427 	ASSERT(grp->gr_linkdownmp == NULL);
428 	grp->gr_phyint = NULL;
429 
430 	kmem_free(grp, sizeof (ipmp_grp_t));
431 }
432 
433 /*
434  * Check whether `ill' is suitable for inclusion into `grp', and return an
435  * errno describing the problem (if any).  NOTE: many of these errno values
436  * are interpreted by ifconfig, which will take corrective action and retry
437  * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
438  */
439 static int
440 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
441 {
442 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
443 
444 	ASSERT(IAM_WRITER_ILL(ill));
445 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
446 
447 	/*
448 	 * To sidestep complicated address migration logic in the kernel and
449 	 * to force the kernel's all-hosts multicast memberships to be blown
450 	 * away, all addresses that had been brought up must be brought back
451 	 * down prior to adding an interface to a group.  (This includes
452 	 * addresses currently down due to DAD.)  Once the interface has been
453 	 * added to the group, its addresses can then be brought back up, at
454 	 * which point they will be moved to the IPMP meta-interface.
455 	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
456 	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
457 	 */
458 	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
459 		return (EADDRINUSE);
460 
461 	/*
462 	 * To avoid confusing applications by changing addresses that are
463 	 * under their control, all such control must be removed prior to
464 	 * adding an interface into a group.
465 	 */
466 	if (ill_appaddr_cnt(ill) != 0)
467 		return (EADDRNOTAVAIL);
468 
469 	/*
470 	 * Since PTP addresses do not share the same broadcast domain, they
471 	 * are not allowed to be in an IPMP group.
472 	 */
473 	if (ill_ptpaddr_cnt(ill) != 0)
474 		return (EINVAL);
475 
476 	/*
477 	 * An ill must support multicast to be allowed into a group.
478 	 */
479 	if (!(ill->ill_flags & ILLF_MULTICAST))
480 		return (ENOTSUP);
481 
482 	/*
483 	 * An ill must strictly be using ARP and/or ND for address
484 	 * resolution for it to be allowed into a group.
485 	 */
486 	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
487 		return (ENOTSUP);
488 
489 	/*
490 	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
491 	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
492 	 * all its modifications as writer.)
493 	 */
494 	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
495 		return (ENOTSUP);
496 
497 	/*
498 	 * All ills in a group must be the same mactype.
499 	 */
500 	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
501 		return (EINVAL);
502 
503 	return (0);
504 }
505 
506 /*
507  * Check whether `phyi' is suitable for inclusion into `grp', and return an
508  * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
509  * regarding errno values.
510  */
511 int
512 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
513 {
514 	int err = 0;
515 	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
516 
517 	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
518 	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
519 
520 	/*
521 	 * An interface cannot have address families plumbed that are not
522 	 * configured in the group.
523 	 */
524 	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
525 	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
526 		return (EAFNOSUPPORT);
527 
528 	if (phyi->phyint_illv4 != NULL)
529 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
530 	if (err == 0 && phyi->phyint_illv6 != NULL)
531 		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
532 
533 	return (err);
534 }
535 
536 /*
537  * Create a new illgrp on IPMP meta-interface `ill'.
538  */
539 ipmp_illgrp_t *
540 ipmp_illgrp_create(ill_t *ill)
541 {
542 	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
543 	ipmp_illgrp_t *illg;
544 
545 	ASSERT(IAM_WRITER_ILL(ill));
546 	ASSERT(IS_IPMP(ill));
547 	ASSERT(ill->ill_grp == NULL);
548 
549 	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
550 		return (NULL);
551 
552 	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
553 	list_create(&illg->ig_actif, sizeof (ill_t),
554 	    offsetof(ill_t, ill_actnode));
555 	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
556 	    offsetof(ipmp_arpent_t, ia_node));
557 
558 	illg->ig_ipmp_ill = ill;
559 	ill->ill_grp = illg;
560 	ipmp_illgrp_set_mtu(illg, mtu);
561 
562 	return (illg);
563 }
564 
565 /*
566  * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
567  */
568 void
569 ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
570 {
571 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
572 	ASSERT(IS_IPMP(illg->ig_ipmp_ill));
573 
574 	/*
575 	 * Verify `illg' is empty.
576 	 */
577 	ASSERT(illg->ig_next_ill == NULL);
578 	ASSERT(illg->ig_cast_ill == NULL);
579 	ASSERT(list_is_empty(&illg->ig_arpent));
580 	ASSERT(list_is_empty(&illg->ig_if));
581 	ASSERT(list_is_empty(&illg->ig_actif));
582 	ASSERT(illg->ig_nactif == 0);
583 
584 	/*
585 	 * Destroy `illg'.
586 	 */
587 	illg->ig_ipmp_ill->ill_grp = NULL;
588 	illg->ig_ipmp_ill = NULL;
589 	list_destroy(&illg->ig_if);
590 	list_destroy(&illg->ig_actif);
591 	list_destroy(&illg->ig_arpent);
592 	kmem_free(illg, sizeof (ipmp_illgrp_t));
593 }
594 
595 /*
596  * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
597  * bind it to an underlying ill, while keeping an even address distribution.
598  * If the bind is successful, return a pointer to the bound ill.
599  */
600 ill_t *
601 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
602 {
603 	ill_t *minill;
604 	ipmp_arpent_t *entp;
605 
606 	ASSERT(IAM_WRITER_IPIF(ipif));
607 	ASSERT(ipmp_ipif_is_dataaddr(ipif));
608 
609 	/*
610 	 * IPMP data address mappings are internally managed by IP itself, so
611 	 * delete any existing ARP entries associated with the address.
612 	 */
613 	if (!ipif->ipif_isv6) {
614 		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
615 		if (entp != NULL)
616 			ipmp_illgrp_destroy_arpent(illg, entp);
617 	}
618 
619 	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
620 		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
621 
622 	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
623 }
624 
625 /*
626  * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
627  * bound, unbind it from the underlying ill while keeping an even address
628  * distribution.
629  */
630 void
631 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
632 {
633 	ill_t *maxill, *boundill = ipif->ipif_bound_ill;
634 
635 	ASSERT(IAM_WRITER_IPIF(ipif));
636 
637 	if (boundill != NULL) {
638 		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
639 
640 		maxill = ipmp_illgrp_max_ill(illg);
641 		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
642 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
643 			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
644 		}
645 	}
646 }
647 
648 /*
649  * Return the active ill with the greatest number of data addresses in `illg'.
650  */
651 static ill_t *
652 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
653 {
654 	ill_t *ill, *bestill = NULL;
655 
656 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
657 
658 	ill = list_head(&illg->ig_actif);
659 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
660 		if (bestill == NULL ||
661 		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
662 			bestill = ill;
663 		}
664 	}
665 	return (bestill);
666 }
667 
668 /*
669  * Return the active ill with the fewest number of data addresses in `illg'.
670  */
671 static ill_t *
672 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
673 {
674 	ill_t *ill, *bestill = NULL;
675 
676 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
677 
678 	ill = list_head(&illg->ig_actif);
679 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
680 		if (bestill == NULL ||
681 		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
682 			if (ill->ill_bound_cnt == 0)
683 				return (ill);	 /* can't get better */
684 			bestill = ill;
685 		}
686 	}
687 	return (bestill);
688 }
689 
690 /*
691  * Return a pointer to IPMP meta-interface for `illg' (which must exist).
692  * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
693  */
694 ill_t *
695 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
696 {
697 	return (illg->ig_ipmp_ill);
698 }
699 
700 /*
701  * Return a pointer to the next available underlying ill in `illg', or NULL if
702  * one doesn't exist.  Caller must be inside the IPSQ.
703  */
704 ill_t *
705 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
706 {
707 	ill_t *ill;
708 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
709 
710 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
711 
712 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
713 	if ((ill = illg->ig_next_ill) != NULL) {
714 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
715 		if (illg->ig_next_ill == NULL)
716 			illg->ig_next_ill = list_head(&illg->ig_actif);
717 	}
718 	rw_exit(&ipst->ips_ipmp_lock);
719 
720 	return (ill);
721 }
722 
723 /*
724  * Return a held pointer to the next available underlying ill in `illg', or
725  * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
726  */
727 ill_t *
728 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
729 {
730 	ill_t *ill;
731 	uint_t i;
732 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
733 
734 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
735 	for (i = 0; i < illg->ig_nactif; i++) {
736 		ill = illg->ig_next_ill;
737 		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
738 		if (illg->ig_next_ill == NULL)
739 			illg->ig_next_ill = list_head(&illg->ig_actif);
740 
741 		if (ill_check_and_refhold(ill)) {
742 			rw_exit(&ipst->ips_ipmp_lock);
743 			return (ill);
744 		}
745 	}
746 	rw_exit(&ipst->ips_ipmp_lock);
747 
748 	return (NULL);
749 }
750 
751 /*
752  * Return a held pointer to the nominated multicast ill in `illg', or NULL if
753  * one doesn't exist.  Caller need not be inside the IPSQ.
754  */
755 ill_t *
756 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
757 {
758 	ill_t *castill;
759 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
760 
761 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
762 	castill = illg->ig_cast_ill;
763 	if (castill != NULL && ill_check_and_refhold(castill)) {
764 		rw_exit(&ipst->ips_ipmp_lock);
765 		return (castill);
766 	}
767 	rw_exit(&ipst->ips_ipmp_lock);
768 	return (NULL);
769 }
770 
771 /*
772  * Callback routine for ncec_walk() that deletes `nce' if it is associated with
773  * the `(ill_t *)arg' and it is not one of the local addresses.  Caller must be
774  * inside the IPSQ.
775  */
776 static void
777 ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg)
778 {
779 	if ((ncec != NULL) && !NCE_MYADDR(ncec) &&
780 	    ncec->ncec_ill == (ill_t *)arg) {
781 		ncec_delete(ncec);
782 	}
783 }
784 
785 /*
786  * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
787  * any existing nomination is removed.  Caller must be inside the IPSQ.
788  */
789 static void
790 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
791 {
792 	ill_t *ocastill = illg->ig_cast_ill;
793 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
794 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
795 
796 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
797 
798 	/*
799 	 * Disable old nominated ill (if any).
800 	 */
801 	if (ocastill != NULL) {
802 		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
803 		    illg, ill_t *, ocastill);
804 		ASSERT(ocastill->ill_nom_cast);
805 		ocastill->ill_nom_cast = B_FALSE;
806 		/*
807 		 * If the IPMP meta-interface is down, we never did the join,
808 		 * so we must not try to leave.
809 		 */
810 		if (ipmp_ill->ill_dl_up)
811 			ill_leave_multicast(ipmp_ill);
812 
813 		/*
814 		 * Delete any NCEs tied to the old nomination.  We must do this
815 		 * last since ill_leave_multicast() may trigger IREs to be
816 		 * built using ig_cast_ill.
817 		 */
818 		ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
819 		    ocastill->ill_ipst);
820 	}
821 
822 	/*
823 	 * Set new nomination.
824 	 */
825 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
826 	illg->ig_cast_ill = castill;
827 	rw_exit(&ipst->ips_ipmp_lock);
828 
829 	/*
830 	 * Enable new nominated ill (if any).
831 	 */
832 	if (castill != NULL) {
833 		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
834 		    illg, ill_t *, castill);
835 		ASSERT(!castill->ill_nom_cast);
836 		castill->ill_nom_cast = B_TRUE;
837 		/*
838 		 * If the IPMP meta-interface is down, the attempt to recover
839 		 * will silently fail but ill_need_recover_multicast will be
840 		 * erroneously cleared -- so check first.
841 		 */
842 		if (ipmp_ill->ill_dl_up)
843 			ill_recover_multicast(ipmp_ill);
844 	}
845 }
846 
847 /*
848  * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
849  * entry for the same IP address already exists, destroy it first.  Return the
850  * created IPMP ARP entry, or NULL on failure.
851  */
852 ipmp_arpent_t *
853 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
854     ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
855 {
856 	ipmp_arpent_t *entp, *oentp;
857 
858 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
859 
860 	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
861 	    KM_NOSLEEP)) == NULL)
862 		return (NULL);
863 
864 	/*
865 	 * Delete any existing ARP entry for this address.
866 	 */
867 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL)
868 		ipmp_illgrp_destroy_arpent(illg, oentp);
869 
870 	/*
871 	 * Prepend the new entry.
872 	 */
873 	entp->ia_ipaddr = ipaddr;
874 	entp->ia_flags = flags;
875 	entp->ia_lladdr_len = lladdr_len;
876 	entp->ia_lladdr = (uchar_t *)&entp[1];
877 	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
878 	entp->ia_proxyarp = proxyarp;
879 	entp->ia_notified = B_TRUE;
880 	list_insert_head(&illg->ig_arpent, entp);
881 	return (entp);
882 }
883 
884 /*
885  * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
886  */
887 void
888 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
889 {
890 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
891 
892 	list_remove(&illg->ig_arpent, entp);
893 	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
894 }
895 
896 /*
897  * Mark that ARP has been notified about the IP address on `entp'; `illg' is
898  * taken as a debugging aid for DTrace FBT probes.
899  */
900 /* ARGSUSED */
901 void
902 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
903 {
904 	entp->ia_notified = B_TRUE;
905 }
906 
907 /*
908  * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
909  * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
910  */
911 ipmp_arpent_t *
912 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
913 {
914 	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
915 
916 	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
917 
918 	if (addrp == NULL)
919 		return (entp);
920 
921 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
922 		if (entp->ia_ipaddr == *addrp)
923 			break;
924 	return (entp);
925 }
926 
927 /*
928  * Refresh ARP entries on `illg' to be distributed across its active
929  * interfaces.  Entries that cannot be refreshed (e.g., because there are no
930  * active interfaces) are marked so that subsequent calls can try again.
931  */
932 void
933 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
934 {
935 	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
936 	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
937 	ipmp_arpent_t *entp;
938 	ncec_t *ncec;
939 	nce_t  *nce;
940 
941 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
942 	ASSERT(!ipmp_ill->ill_isv6);
943 
944 	ill = list_head(&illg->ig_actif);
945 	entp = list_head(&illg->ig_arpent);
946 	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
947 		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
948 			entp->ia_notified = B_FALSE;
949 			continue;
950 		}
951 
952 		ASSERT(paddrlen == ill->ill_phys_addr_length);
953 
954 		/*
955 		 * If this is a proxy ARP entry, we can skip notifying ARP if
956 		 * the entry is already up-to-date.  If it has changed, we
957 		 * update the entry's hardware address before notifying ARP.
958 		 */
959 		if (entp->ia_proxyarp) {
960 			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
961 			    paddrlen) == 0 && entp->ia_notified)
962 				continue;
963 			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
964 		}
965 
966 		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
967 		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
968 		    &nce);
969 		if (nce == NULL || !entp->ia_proxyarp) {
970 			if (nce != NULL)
971 				nce_refrele(nce);
972 			continue;
973 		}
974 		ncec = nce->nce_common;
975 		mutex_enter(&ncec->ncec_lock);
976 		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
977 		mutex_exit(&ncec->ncec_lock);
978 		nce_refrele(nce);
979 		ipmp_illgrp_mark_arpent(illg, entp);
980 
981 		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
982 			ill = list_head(&illg->ig_actif);
983 	}
984 }
985 
986 /*
987  * Return an interface in `illg' with the specified `physaddr', or NULL if one
988  * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
989  */
990 ill_t *
991 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
992 {
993 	ill_t *ill;
994 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
995 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
996 
997 	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
998 
999 	ill = list_head(&illg->ig_if);
1000 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1001 		if (ill->ill_phys_addr_length == paddrlen &&
1002 		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
1003 			return (ill);
1004 	}
1005 	return (NULL);
1006 }
1007 
1008 /*
1009  * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
1010  * Caller must be inside the IPSQ unless this is initialization.
1011  */
1012 static void
1013 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
1014 {
1015 	ill_t *ill = illg->ig_ipmp_ill;
1016 	mblk_t *mp;
1017 
1018 	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
1019 
1020 	/*
1021 	 * If allocation fails, we have bigger problems than MTU.
1022 	 */
1023 	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
1024 		illg->ig_mtu = mtu;
1025 		put(ill->ill_rq, mp);
1026 	}
1027 }
1028 
1029 /*
1030  * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
1031  * ill MTU if necessary.
1032  */
1033 void
1034 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
1035 {
1036 	ill_t *ill;
1037 	ill_t *ipmp_ill = illg->ig_ipmp_ill;
1038 	uint_t mtu = 0;
1039 
1040 	ASSERT(IAM_WRITER_ILL(ipmp_ill));
1041 
1042 	/*
1043 	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
1044 	 * for each ill as we iterate through the list.  Any changes to the
1045 	 * ill_mtu will also trigger an update, so even if we missed it
1046 	 * this time around, the update will catch it.
1047 	 */
1048 	ill = list_head(&illg->ig_if);
1049 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1050 		mutex_enter(&ill->ill_lock);
1051 		if (mtu == 0 || ill->ill_mtu < mtu)
1052 			mtu = ill->ill_mtu;
1053 		mutex_exit(&ill->ill_lock);
1054 	}
1055 
1056 	/*
1057 	 * MTU must be at least the minimum MTU.
1058 	 */
1059 	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1060 
1061 	if (illg->ig_mtu != mtu)
1062 		ipmp_illgrp_set_mtu(illg, mtu);
1063 }
1064 
1065 /*
1066  * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
1067  * allow the same link to be established more than once.
1068  */
1069 void
1070 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1071 {
1072 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1073 
1074 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1075 
1076 	if (illg->ig_ipmp_ill->ill_isv6) {
1077 		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1078 		grp->gr_v6 = illg;
1079 	} else {
1080 		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1081 		grp->gr_v4 = illg;
1082 	}
1083 }
1084 
1085 /*
1086  * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
1087  * cannot be unlinked (e.g., because there are still interfaces using it).
1088  */
1089 int
1090 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1091 {
1092 	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1093 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1094 
1095 	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1096 
1097 	if (illg->ig_ipmp_ill->ill_isv6) {
1098 		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1099 			return (EBUSY);
1100 		grp->gr_v6 = NULL;
1101 	} else {
1102 		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1103 			return (EBUSY);
1104 		grp->gr_v4 = NULL;
1105 	}
1106 	return (0);
1107 }
1108 
1109 /*
1110  * Place `ill' into `illg', and rebalance the data addresses on `illg'
1111  * to be spread evenly across the ills now in it.  Also, adjust the IPMP
1112  * ill as necessary to account for `ill' (e.g., MTU).
1113  */
1114 void
1115 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
1116 {
1117 	ill_t *ipmp_ill;
1118 	ipif_t *ipif;
1119 	ip_stack_t *ipst = ill->ill_ipst;
1120 
1121 	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
1122 	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
1123 	ASSERT(IAM_WRITER_ILL(ill));
1124 	ASSERT(ill->ill_grp == NULL);
1125 
1126 	ipmp_ill = illg->ig_ipmp_ill;
1127 
1128 	/*
1129 	 * Account for `ill' joining the illgrp.
1130 	 */
1131 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1132 	if (ill->ill_isv6)
1133 		ill->ill_phyint->phyint_grp->gr_nv6++;
1134 	else
1135 		ill->ill_phyint->phyint_grp->gr_nv4++;
1136 	rw_exit(&ipst->ips_ipmp_lock);
1137 
1138 	/*
1139 	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
1140 	 */
1141 	mutex_enter(&ill->ill_lock);
1142 	if (ipmp_ill->ill_flags & ILLF_ROUTER)
1143 		ill->ill_flags |= ILLF_ROUTER;
1144 	else
1145 		ill->ill_flags &= ~ILLF_ROUTER;
1146 	mutex_exit(&ill->ill_lock);
1147 
1148 	/*
1149 	 * Blow away all multicast memberships that currently exist on `ill'.
1150 	 * This may seem odd, but it's consistent with the application view
1151 	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
1152 	 */
1153 	update_conn_ill(ill, ill->ill_ipst);
1154 	if (ill->ill_isv6) {
1155 		reset_mrt_ill(ill);
1156 	} else {
1157 		ipif = ill->ill_ipif;
1158 		for (; ipif != NULL; ipif = ipif->ipif_next) {
1159 			reset_mrt_vif_ipif(ipif);
1160 		}
1161 	}
1162 	ip_purge_allmulti(ill);
1163 
1164 	/*
1165 	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
1166 	 * physical address length.  All other ills must have the same value,
1167 	 * since they are required to all be the same mactype.  Also update
1168 	 * the IPMP ill's MTU and CoS marking, if necessary.
1169 	 */
1170 	if (list_is_empty(&illg->ig_if)) {
1171 		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
1172 		/*
1173 		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
1174 		 * doesn't have a physical address.  This means that code must
1175 		 * not assume that ill_phys_addr is non-NULL just because
1176 		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
1177 		 */
1178 		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
1179 		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
1180 		ipmp_ill->ill_type = ill->ill_type;
1181 
1182 		if (ill->ill_flags & ILLF_COS_ENABLED) {
1183 			mutex_enter(&ipmp_ill->ill_lock);
1184 			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1185 			mutex_exit(&ipmp_ill->ill_lock);
1186 		}
1187 		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
1188 	} else {
1189 		ASSERT(ipmp_ill->ill_phys_addr_length ==
1190 		    ill->ill_phys_addr_length);
1191 		ASSERT(ipmp_ill->ill_type == ill->ill_type);
1192 
1193 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1194 			mutex_enter(&ipmp_ill->ill_lock);
1195 			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1196 			mutex_exit(&ipmp_ill->ill_lock);
1197 		}
1198 		if (illg->ig_mtu > ill->ill_mtu)
1199 			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
1200 	}
1201 
1202 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1203 	list_insert_tail(&illg->ig_if, ill);
1204 	ill->ill_grp = illg;
1205 	rw_exit(&ipst->ips_ill_g_lock);
1206 
1207 	/*
1208 	 * Hide the IREs on `ill' so that we don't accidentally find them when
1209 	 * sending data traffic.
1210 	 */
1211 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
1212 
1213 	ipmp_ill_refresh_active(ill);
1214 }
1215 
1216 /*
1217  * Remove `ill' from its illgrp, and rebalance the data addresses in that
1218  * illgrp to be spread evenly across the remaining ills.  Also, adjust the
1219  * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1220  */
1221 void
1222 ipmp_ill_leave_illgrp(ill_t *ill)
1223 {
1224 	ill_t *ipmp_ill;
1225 	ipif_t *ipif;
1226 	ipmp_arpent_t *entp;
1227 	ipmp_illgrp_t *illg = ill->ill_grp;
1228 	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1229 
1230 	ASSERT(IS_UNDER_IPMP(ill));
1231 	ASSERT(IAM_WRITER_ILL(ill));
1232 	ASSERT(illg != NULL);
1233 
1234 	ipmp_ill = illg->ig_ipmp_ill;
1235 
1236 	/*
1237 	 * Cancel IPMP-specific ill timeouts.
1238 	 */
1239 	(void) untimeout(ill->ill_refresh_tid);
1240 
1241 	/*
1242 	 * Expose any previously-hidden IREs on `ill'.
1243 	 */
1244 	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
1245 
1246 	/*
1247 	 * Ensure the multicast state for each ipif on `ill' is down so that
1248 	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
1249 	 * all eligible groups.
1250 	 */
1251 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1252 		if (ipif->ipif_flags & IPIF_UP)
1253 			ipif_multicast_down(ipif);
1254 
1255 	/*
1256 	 * Account for `ill' leaving the illgrp.
1257 	 */
1258 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1259 	if (ill->ill_isv6)
1260 		ill->ill_phyint->phyint_grp->gr_nv6--;
1261 	else
1262 		ill->ill_phyint->phyint_grp->gr_nv4--;
1263 	rw_exit(&ipst->ips_ipmp_lock);
1264 
1265 	/*
1266 	 * Pull `ill' out of the interface lists.
1267 	 */
1268 	if (list_link_active(&ill->ill_actnode))
1269 		ipmp_ill_deactivate(ill);
1270 	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1271 	list_remove(&illg->ig_if, ill);
1272 	ill->ill_grp = NULL;
1273 	rw_exit(&ipst->ips_ill_g_lock);
1274 
1275 	/*
1276 	 * Re-establish multicast memberships that were previously being
1277 	 * handled by the IPMP meta-interface.
1278 	 */
1279 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1280 		if (ipif->ipif_flags & IPIF_UP)
1281 			ipif_multicast_up(ipif);
1282 
1283 	/*
1284 	 * Refresh the group MTU based on the new interface list.
1285 	 */
1286 	ipmp_illgrp_refresh_mtu(illg);
1287 
1288 	if (list_is_empty(&illg->ig_if)) {
1289 		/*
1290 		 * No ills left in the illgrp; we no longer have a physical
1291 		 * address length, nor can we support ARP, CoS, or anything
1292 		 * else that depends on knowing the link layer type.
1293 		 */
1294 		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
1295 			ipmp_illgrp_destroy_arpent(illg, entp);
1296 
1297 		ipmp_ill->ill_phys_addr_length = 0;
1298 		ipmp_ill->ill_nd_lla_len = 0;
1299 		ipmp_ill->ill_type = IFT_OTHER;
1300 		mutex_enter(&ipmp_ill->ill_lock);
1301 		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1302 		mutex_exit(&ipmp_ill->ill_lock);
1303 	} else {
1304 		/*
1305 		 * If `ill' didn't support CoS, see if it can now be enabled.
1306 		 */
1307 		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1308 			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
1309 
1310 			ill = list_head(&illg->ig_if);
1311 			do {
1312 				if (!(ill->ill_flags & ILLF_COS_ENABLED))
1313 					break;
1314 			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);
1315 
1316 			if (ill == NULL) {
1317 				mutex_enter(&ipmp_ill->ill_lock);
1318 				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1319 				mutex_exit(&ipmp_ill->ill_lock);
1320 			}
1321 		}
1322 	}
1323 }
1324 
1325 /*
1326  * Check if `ill' should be active, and activate or deactivate if need be.
1327  * Return B_FALSE if a refresh was necessary but could not be performed.
1328  */
1329 static boolean_t
1330 ipmp_ill_try_refresh_active(ill_t *ill)
1331 {
1332 	boolean_t refreshed = B_TRUE;
1333 
1334 	ASSERT(IAM_WRITER_ILL(ill));
1335 	ASSERT(IS_UNDER_IPMP(ill));
1336 
1337 	if (ipmp_ill_is_active(ill)) {
1338 		if (!list_link_active(&ill->ill_actnode))
1339 			refreshed = ipmp_ill_activate(ill);
1340 	} else {
1341 		if (list_link_active(&ill->ill_actnode))
1342 			ipmp_ill_deactivate(ill);
1343 	}
1344 
1345 	return (refreshed);
1346 }
1347 
1348 /*
1349  * Check if `ill' should be active, and activate or deactivate if need be.
1350  * If the refresh fails, schedule a timer to try again later.
1351  */
1352 void
1353 ipmp_ill_refresh_active(ill_t *ill)
1354 {
1355 	if (!ipmp_ill_try_refresh_active(ill))
1356 		ipmp_ill_refresh_active_timer_start(ill);
1357 }
1358 
1359 /*
1360  * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1361  */
1362 static void
1363 ipmp_ill_refresh_active_timer(void *ill_arg)
1364 {
1365 	ill_t *ill = ill_arg;
1366 	boolean_t refreshed = B_FALSE;
1367 
1368 	/*
1369 	 * Clear ill_refresh_tid to indicate that no timeout is pending
1370 	 * (another thread could schedule a new timeout while we're still
1371 	 * running, but that's harmless).  If the ill is going away, bail.
1372 	 */
1373 	mutex_enter(&ill->ill_lock);
1374 	ill->ill_refresh_tid = 0;
1375 	if (ill->ill_state_flags & ILL_CONDEMNED) {
1376 		mutex_exit(&ill->ill_lock);
1377 		return;
1378 	}
1379 	mutex_exit(&ill->ill_lock);
1380 
1381 	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
1382 		refreshed = ipmp_ill_try_refresh_active(ill);
1383 		ipsq_exit(ill->ill_phyint->phyint_ipsq);
1384 	}
1385 
1386 	/*
1387 	 * If the refresh failed, schedule another attempt.
1388 	 */
1389 	if (!refreshed)
1390 		ipmp_ill_refresh_active_timer_start(ill);
1391 }
1392 
1393 /*
1394  * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'.
1395  */
1396 static void
1397 ipmp_ill_refresh_active_timer_start(ill_t *ill)
1398 {
1399 	mutex_enter(&ill->ill_lock);
1400 
1401 	/*
1402 	 * If the ill is going away or a refresh is already scheduled, bail.
1403 	 */
1404 	if (ill->ill_refresh_tid != 0 ||
1405 	    (ill->ill_state_flags & ILL_CONDEMNED)) {
1406 		mutex_exit(&ill->ill_lock);
1407 		return;
1408 	}
1409 
1410 	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
1411 	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
1412 
1413 	mutex_exit(&ill->ill_lock);
1414 }
1415 
1416 /*
1417  * Activate `ill' so it will be used to send and receive data traffic.  Return
1418  * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
1419  * needed to deactivate `ill' here as well so that deactivation cannot fail.
1420  */
1421 static boolean_t
1422 ipmp_ill_activate(ill_t *ill)
1423 {
1424 	ipif_t		*ipif;
1425 	mblk_t		*linkupmp = NULL, *linkdownmp = NULL;
1426 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1427 	ipmp_illgrp_t	*illg = ill->ill_grp;
1428 	ill_t		*maxill;
1429 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1430 
1431 	ASSERT(IAM_WRITER_ILL(ill));
1432 	ASSERT(IS_UNDER_IPMP(ill));
1433 
1434 	/*
1435 	 * If this will be the first active interface in the group, allocate
1436 	 * the link-up and link-down messages.
1437 	 */
1438 	if (grp->gr_nactif == 0) {
1439 		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
1440 		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
1441 		if (linkupmp == NULL || linkdownmp == NULL)
1442 			goto fail;
1443 	}
1444 
1445 	if (list_is_empty(&illg->ig_actif)) {
1446 		/*
1447 		 * Now that we have an active ill, nominate it for multicast
1448 		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
1449 		 * since that may need to send multicast packets (e.g., IPv6
1450 		 * neighbor discovery probes).
1451 		 */
1452 		ipmp_illgrp_set_cast(illg, ill);
1453 
1454 		/*
1455 		 * This is the first active ill in the illgrp -- add 'em all.
1456 		 * We can access/walk ig_ipmp_ill's ipif list since we're
1457 		 * writer on its IPSQ as well.
1458 		 */
1459 		ipif = illg->ig_ipmp_ill->ill_ipif;
1460 		for (; ipif != NULL; ipif = ipif->ipif_next)
1461 			if (ipmp_ipif_is_up_dataaddr(ipif))
1462 				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
1463 	} else {
1464 		/*
1465 		 * Redistribute the addresses by moving them from the ill with
1466 		 * the most addresses until the ill being activated is at the
1467 		 * same level as the rest of the ills.
1468 		 */
1469 		for (;;) {
1470 			maxill = ipmp_illgrp_max_ill(illg);
1471 			ASSERT(maxill != NULL);
1472 			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
1473 				break;
1474 			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
1475 			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
1476 		}
1477 	}
1478 
1479 	/*
1480 	 * Put the interface in the active list.
1481 	 */
1482 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1483 	list_insert_tail(&illg->ig_actif, ill);
1484 	illg->ig_nactif++;
1485 	illg->ig_next_ill = ill;
1486 	rw_exit(&ipst->ips_ipmp_lock);
1487 
1488 	/*
1489 	 * Refresh static/proxy ARP entries to use `ill', if need be.
1490 	 */
1491 	if (!ill->ill_isv6)
1492 		ipmp_illgrp_refresh_arpent(illg);
1493 
1494 	/*
1495 	 * Finally, mark the group link up, if necessary.
1496 	 */
1497 	if (grp->gr_nactif++ == 0) {
1498 		ASSERT(grp->gr_linkdownmp == NULL);
1499 		grp->gr_linkdownmp = linkdownmp;
1500 		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
1501 	}
1502 	return (B_TRUE);
1503 fail:
1504 	freemsg(linkupmp);
1505 	freemsg(linkdownmp);
1506 	return (B_FALSE);
1507 }
1508 
1509 /*
1510  * Deactivate `ill' so it will not be used to send or receive data traffic.
1511  */
1512 static void
1513 ipmp_ill_deactivate(ill_t *ill)
1514 {
1515 	ill_t		*minill;
1516 	ipif_t		*ipif, *ubnextipif, *ubheadipif = NULL;
1517 	mblk_t		*mp;
1518 	ipmp_grp_t	*grp = ill->ill_phyint->phyint_grp;
1519 	ipmp_illgrp_t	*illg = ill->ill_grp;
1520 	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);
1521 
1522 	ASSERT(IAM_WRITER_ILL(ill));
1523 	ASSERT(IS_UNDER_IPMP(ill));
1524 
1525 	/*
1526 	 * Pull the interface out of the active list.
1527 	 */
1528 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1529 	list_remove(&illg->ig_actif, ill);
1530 	illg->ig_nactif--;
1531 	illg->ig_next_ill = list_head(&illg->ig_actif);
1532 	rw_exit(&ipst->ips_ipmp_lock);
1533 
1534 	/*
1535 	 * If the ill that's being deactivated had been nominated for
1536 	 * multicast/broadcast, nominate a new one.
1537 	 */
1538 	if (ill == illg->ig_cast_ill)
1539 		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
1540 
1541 	/*
1542 	 * Delete all nce_t entries using this ill, so that the next attempt
1543 	 * to send data traffic will revalidate cached nce's.
1544 	 */
1545 	nce_flush(ill, B_TRUE);
1546 
1547 	/*
1548 	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
1549 	 * we'll rebind them after we tell the resolver the ill is no longer
1550 	 * active.  We must do things in this order or the resolver could
1551 	 * accidentally rebind to the ill we're trying to remove if multiple
1552 	 * ills in the group have the same hardware address (which is
1553 	 * unsupported, but shouldn't lead to a wedged machine).
1554 	 */
1555 	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
1556 		ipif->ipif_bound_next = ubheadipif;
1557 		ubheadipif = ipif;
1558 	}
1559 	if (!ill->ill_isv6) {
1560 
1561 		/*
1562 		 * Refresh static/proxy ARP entries that had been using `ill'.
1563 		 */
1564 		ipmp_illgrp_refresh_arpent(illg);
1565 	}
1566 
1567 	/*
1568 	 * Rebind each ipif from the deactivated ill to the active ill with
1569 	 * the fewest ipifs.  If there are no active ills, the ipifs will
1570 	 * remain unbound.
1571 	 */
1572 	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
1573 		ubnextipif = ipif->ipif_bound_next;
1574 		ipif->ipif_bound_next = NULL;
1575 
1576 		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
1577 			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
1578 	}
1579 
1580 	if (list_is_empty(&illg->ig_actif)) {
1581 		ill_t *ipmp_ill = illg->ig_ipmp_ill;
1582 
1583 		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
1584 		    (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
1585 	}
1586 
1587 	/*
1588 	 * Remove any IRE_IF_CLONE for this ill since they might have
1589 	 * an ire_nce_cache/nce_common which refers to another ill in the group.
1590 	 */
1591 	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
1592 	    ill, ill);
1593 
1594 	/*
1595 	 * Finally, mark the group link down, if necessary.
1596 	 */
1597 	if (--grp->gr_nactif == 0) {
1598 		mp = grp->gr_linkdownmp;
1599 		grp->gr_linkdownmp = NULL;
1600 		ASSERT(mp != NULL);
1601 		put(illg->ig_ipmp_ill->ill_rq, mp);
1602 	}
1603 }
1604 
1605 /*
1606  * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1607  * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1608  */
1609 static void
1610 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1611 {
1612 	ipif_t *ipif;
1613 
1614 	ASSERT(IAM_WRITER_ILL(ill));
1615 	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1616 
1617 	/*
1618 	 * If `ill' is truly down, there are no messages to generate since:
1619 	 *
1620 	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1621 	 *    and its addresses by bringing them down.  But that's already
1622 	 *    true, so there's nothing to hide.
1623 	 *
1624 	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1625 	 *    indicating that any previously-hidden up addresses are again
1626 	 *    back up (along with the interface).  But they aren't, so
1627 	 *    there's nothing to expose.
1628 	 */
1629 	if (ill->ill_ipif_up_count == 0)
1630 		return;
1631 
1632 	if (cmd == RTM_ADD)
1633 		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1634 
1635 	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1636 		if (ipif->ipif_flags & IPIF_UP)
1637 			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1638 
1639 	if (cmd == RTM_DELETE)
1640 		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1641 }
1642 
1643 /*
1644  * Bind the address named by `ipif' to the underlying ill named by `ill'.
1645  * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
1646  * will indicate to the resolver whether this is an initial bringup of
1647  * `ipif', or just a rebind to another ill.
1648  */
1649 static void
1650 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
1651 {
1652 	int err = 0;
1653 	ip_stack_t *ipst = ill->ill_ipst;
1654 
1655 	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
1656 	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
1657 	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
1658 	ASSERT(ipif->ipif_bound_ill == NULL);
1659 	ASSERT(ipif->ipif_bound_next == NULL);
1660 
1661 	ipif->ipif_bound_next = ill->ill_bound_ipif;
1662 	ill->ill_bound_ipif = ipif;
1663 	ill->ill_bound_cnt++;
1664 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1665 	ipif->ipif_bound_ill = ill;
1666 	rw_exit(&ipst->ips_ipmp_lock);
1667 
1668 	/*
1669 	 * If necessary, tell ARP/NDP about the new mapping.  Note that
1670 	 * ipif_resolver_up() cannot fail for IPv6 ills.
1671 	 */
1672 	if (act != Res_act_none) {
1673 		if (ill->ill_isv6) {
1674 			VERIFY(ipif_resolver_up(ipif, act) == 0);
1675 			err = ipif_ndp_up(ipif, act == Res_act_initial);
1676 		} else {
1677 			err = ipif_resolver_up(ipif, act);
1678 		}
1679 
1680 		/*
1681 		 * Since ipif_ndp_up() never returns EINPROGRESS and
1682 		 * ipif_resolver_up() only returns EINPROGRESS when the
1683 		 * associated ill is not up, we should never be here with
1684 		 * EINPROGRESS.  We rely on this to simplify the design.
1685 		 */
1686 		ASSERT(err != EINPROGRESS);
1687 	}
1688 	/* TODO: retry binding on failure? when? */
1689 	ipif->ipif_bound = (err == 0);
1690 }
1691 
1692 /*
1693  * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1694  * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1695  * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
1696  * B_TRUE, notify the resolver about the change.
1697  */
1698 static ipif_t *
1699 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
1700 {
1701 	ipif_t *previpif;
1702 	ip_stack_t *ipst = ill->ill_ipst;
1703 
1704 	ASSERT(IAM_WRITER_ILL(ill));
1705 	ASSERT(IS_UNDER_IPMP(ill));
1706 
1707 	/*
1708 	 * If necessary, find an ipif to unbind.
1709 	 */
1710 	if (ipif == NULL) {
1711 		if ((ipif = ill->ill_bound_ipif) == NULL) {
1712 			ASSERT(ill->ill_bound_cnt == 0);
1713 			return (NULL);
1714 		}
1715 	}
1716 
1717 	ASSERT(IAM_WRITER_IPIF(ipif));
1718 	ASSERT(IS_IPMP(ipif->ipif_ill));
1719 	ASSERT(ipif->ipif_bound_ill == ill);
1720 	ASSERT(ill->ill_bound_cnt > 0);
1721 
1722 	/*
1723 	 * Unbind it.
1724 	 */
1725 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1726 	ipif->ipif_bound_ill = NULL;
1727 	rw_exit(&ipst->ips_ipmp_lock);
1728 	ill->ill_bound_cnt--;
1729 
1730 	if (ill->ill_bound_ipif == ipif) {
1731 		ill->ill_bound_ipif = ipif->ipif_bound_next;
1732 	} else {
1733 		previpif = ill->ill_bound_ipif;
1734 		while (previpif->ipif_bound_next != ipif)
1735 			previpif = previpif->ipif_bound_next;
1736 
1737 		previpif->ipif_bound_next = ipif->ipif_bound_next;
1738 	}
1739 	ipif->ipif_bound_next = NULL;
1740 
1741 	/*
1742 	 * If requested, notify the resolvers (provided we're bound).
1743 	 */
1744 	if (notifyres && ipif->ipif_bound) {
1745 		if (ill->ill_isv6)
1746 			ipif_ndp_down(ipif);
1747 		else
1748 			(void) ipif_arp_down(ipif);
1749 	}
1750 	ipif->ipif_bound = B_FALSE;
1751 
1752 	return (ipif);
1753 }
1754 
1755 /*
1756  * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
1757  * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
1758  * to determine whether an ill should be considered active, other consumers
1759  * may race and learn about an ill that should be deactivated/activated before
1760  * IPMP has performed the activation/deactivation.  This should be safe though
1761  * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1762  * would've been cleaned up by ipmp_ill_deactivate().
1763  */
1764 boolean_t
1765 ipmp_ill_is_active(ill_t *ill)
1766 {
1767 	phyint_t *phyi = ill->ill_phyint;
1768 
1769 	ASSERT(IS_UNDER_IPMP(ill));
1770 	ASSERT(IAM_WRITER_ILL(ill) ||
1771 	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1772 
1773 	/*
1774 	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1775 	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
1776 	 * link flapping logic to be just in in.mpathd and allows us to ignore
1777 	 * changes to PHYI_RUNNING.
1778 	 */
1779 	return (!(ill->ill_ipif_up_count == 0 ||
1780 	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1781 }
1782 
1783 /*
1784  * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
1785  * with `ill_arg'.
1786  */
1787 static void
1788 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1789 {
1790 	ill_t *ill = (ill_t *)ill_arg;
1791 
1792 	ASSERT(IAM_WRITER_ILL(ill));
1793 	ASSERT(!IS_IPMP(ill));
1794 
1795 	if (ire->ire_ill != ill)
1796 		return;
1797 
1798 	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
1799 		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1800 		ire->ire_testhidden = B_TRUE;
1801 	}
1802 }
1803 
1804 /*
1805  * IRE walker callback: clear ire_testhidden if the IRE has a source address
1806  * on `ill_arg'.
1807  */
1808 static void
1809 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1810 {
1811 	ill_t *ill = (ill_t *)ill_arg;
1812 
1813 	ASSERT(IAM_WRITER_ILL(ill));
1814 	ASSERT(!IS_IPMP(ill));
1815 
1816 	if (ire->ire_ill == ill) {
1817 		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1818 		ire->ire_testhidden = B_FALSE;
1819 	}
1820 }
1821 
1822 /*
1823  * Return a held pointer to the IPMP ill for underlying interface `ill', or
1824  * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
1825  * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1826  * ill_grp pointer may become stale when not inside an IPSQ and not holding
1827  * ipmp_lock.)  Caller need not be inside the IPSQ.
1828  */
1829 ill_t *
1830 ipmp_ill_hold_ipmp_ill(ill_t *ill)
1831 {
1832 	ip_stack_t *ipst = ill->ill_ipst;
1833 	ipmp_illgrp_t *illg;
1834 
1835 	ASSERT(!IS_IPMP(ill));
1836 
1837 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1838 	illg = ill->ill_grp;
1839 	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
1840 		rw_exit(&ipst->ips_ipmp_lock);
1841 		return (illg->ig_ipmp_ill);
1842 	}
1843 	/*
1844 	 * Assume `ill' was removed from the illgrp in the meantime.
1845 	 */
1846 	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1847 	return (NULL);
1848 }
1849 
1850 /*
1851  * Return the interface index for the IPMP ill tied to underlying interface
1852  * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
1853  */
1854 uint_t
1855 ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1856 {
1857 	uint_t ifindex = 0;
1858 	ip_stack_t *ipst = ill->ill_ipst;
1859 	ipmp_grp_t *grp;
1860 
1861 	ASSERT(!IS_IPMP(ill));
1862 
1863 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1864 	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1865 		ifindex = grp->gr_phyint->phyint_ifindex;
1866 	rw_exit(&ipst->ips_ipmp_lock);
1867 	return (ifindex);
1868 }
1869 
1870 /*
1871  * Place phyint `phyi' into IPMP group `grp'.
1872  */
1873 void
1874 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
1875 {
1876 	ill_t *ill;
1877 	ipsq_t *ipsq = phyi->phyint_ipsq;
1878 	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
1879 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1880 
1881 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1882 	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
1883 
1884 	/*
1885 	 * Send routing socket messages indicating that the phyint's ills
1886 	 * and ipifs vanished.
1887 	 */
1888 	if (phyi->phyint_illv4 != NULL) {
1889 		ill = phyi->phyint_illv4;
1890 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1891 	}
1892 
1893 	if (phyi->phyint_illv6 != NULL) {
1894 		ill = phyi->phyint_illv6;
1895 		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1896 	}
1897 
1898 	/*
1899 	 * Snapshot the phyint's initial kstats as a baseline.
1900 	 */
1901 	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
1902 
1903 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1904 
1905 	phyi->phyint_grp = grp;
1906 	if (++grp->gr_nif == 1)
1907 		grp->gr_mactype = ill->ill_mactype;
1908 	else
1909 		ASSERT(grp->gr_mactype == ill->ill_mactype);
1910 
1911 	/*
1912 	 * Now that we're in the group, request a switch to the group's xop
1913 	 * when we ipsq_exit().  All future operations will be exclusive on
1914 	 * the group xop until ipmp_phyint_leave_grp() is called.
1915 	 */
1916 	ASSERT(ipsq->ipsq_swxop == NULL);
1917 	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
1918 	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
1919 
1920 	rw_exit(&ipst->ips_ipmp_lock);
1921 }
1922 
1923 /*
1924  * Remove phyint `phyi' from its current IPMP group.
1925  */
1926 void
1927 ipmp_phyint_leave_grp(phyint_t *phyi)
1928 {
1929 	uint_t i;
1930 	ipsq_t *ipsq = phyi->phyint_ipsq;
1931 	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1932 	uint64_t phyi_kstats[IPMP_KSTAT_MAX];
1933 
1934 	ASSERT(IAM_WRITER_IPSQ(ipsq));
1935 
1936 	/*
1937 	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
1938 	 */
1939 	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
1940 		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
1941 	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
1942 		ipmp_ill_leave_illgrp(phyi->phyint_illv6);
1943 
1944 	/*
1945 	 * Send routing socket messages indicating that the phyint's ills
1946 	 * and ipifs have reappeared.
1947 	 */
1948 	if (phyi->phyint_illv4 != NULL)
1949 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
1950 	if (phyi->phyint_illv6 != NULL)
1951 		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
1952 
1953 	/*
1954 	 * Calculate the phyint's cumulative kstats while it was in the group,
1955 	 * and add that to the group's baseline.
1956 	 */
1957 	ipmp_phyint_get_kstats(phyi, phyi_kstats);
1958 	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
1959 		phyi_kstats[i] -= phyi->phyint_kstats0[i];
1960 		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
1961 	}
1962 
1963 	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1964 
1965 	phyi->phyint_grp->gr_nif--;
1966 	phyi->phyint_grp = NULL;
1967 
1968 	/*
1969 	 * As our final act in leaving the group, request a switch back to our
1970 	 * IPSQ's own xop when we ipsq_exit().
1971 	 */
1972 	ASSERT(ipsq->ipsq_swxop == NULL);
1973 	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
1974 
1975 	rw_exit(&ipst->ips_ipmp_lock);
1976 }
1977 
1978 /*
1979  * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
1980  * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
1981  */
1982 static void
1983 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
1984 {
1985 	uint_t		i, j;
1986 	const char	*name;
1987 	kstat_t		*ksp;
1988 	kstat_named_t	*kn;
1989 	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
1990 	zoneid_t	zoneid;
1991 
1992 	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
1993 	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
1994 	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
1995 	if (ksp == NULL)
1996 		return;
1997 
1998 	KSTAT_ENTER(ksp);
1999 
2000 	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
2001 		/*
2002 		 * Bring kstats up-to-date before recording.
2003 		 */
2004 		(void) KSTAT_UPDATE(ksp, KSTAT_READ);
2005 
2006 		kn = KSTAT_NAMED_PTR(ksp);
2007 		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2008 			name = ipmp_kstats[i].name;
2009 			kstats[i] = 0;
2010 			for (j = 0; j < ksp->ks_ndata; j++) {
2011 				if (strcmp(kn[j].name, name) != 0)
2012 					continue;
2013 
2014 				switch (kn[j].data_type) {
2015 				case KSTAT_DATA_INT32:
2016 				case KSTAT_DATA_UINT32:
2017 					kstats[i] = kn[j].value.ui32;
2018 					break;
2019 #ifdef	_LP64
2020 				case KSTAT_DATA_LONG:
2021 				case KSTAT_DATA_ULONG:
2022 					kstats[i] = kn[j].value.ul;
2023 					break;
2024 #endif
2025 				case KSTAT_DATA_INT64:
2026 				case KSTAT_DATA_UINT64:
2027 					kstats[i] = kn[j].value.ui64;
2028 					break;
2029 				}
2030 				break;
2031 			}
2032 		}
2033 	}
2034 
2035 	KSTAT_EXIT(ksp);
2036 	kstat_rele(ksp);
2037 }
2038 
2039 /*
2040  * Refresh the active state of all ills on `phyi'.
2041  */
2042 void
2043 ipmp_phyint_refresh_active(phyint_t *phyi)
2044 {
2045 	if (phyi->phyint_illv4 != NULL)
2046 		ipmp_ill_refresh_active(phyi->phyint_illv4);
2047 	if (phyi->phyint_illv6 != NULL)
2048 		ipmp_ill_refresh_active(phyi->phyint_illv6);
2049 }
2050 
2051 /*
2052  * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2053  * doesn't exist.  Caller need not be inside the IPSQ.
2054  */
2055 ill_t *
2056 ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2057 {
2058 	ill_t *boundill;
2059 	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2060 
2061 	ASSERT(IS_IPMP(ipif->ipif_ill));
2062 
2063 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2064 	boundill = ipif->ipif_bound_ill;
2065 	if (boundill != NULL && ill_check_and_refhold(boundill)) {
2066 		rw_exit(&ipst->ips_ipmp_lock);
2067 		return (boundill);
2068 	}
2069 	rw_exit(&ipst->ips_ipmp_lock);
2070 	return (NULL);
2071 }
2072 
2073 /*
2074  * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2075  * doesn't exist.  Caller must be inside the IPSQ.
2076  */
2077 ill_t *
2078 ipmp_ipif_bound_ill(const ipif_t *ipif)
2079 {
2080 	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2081 	ASSERT(IS_IPMP(ipif->ipif_ill));
2082 
2083 	return (ipif->ipif_bound_ill);
2084 }
2085 
2086 /*
2087  * Check if `ipif' is a "stub" (placeholder address not being used).
2088  */
2089 boolean_t
2090 ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2091 {
2092 	if (ipif->ipif_flags & IPIF_UP)
2093 		return (B_FALSE);
2094 	if (ipif->ipif_ill->ill_isv6)
2095 		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2096 	else
2097 		return (ipif->ipif_lcl_addr == INADDR_ANY);
2098 }
2099 
2100 /*
2101  * Check if `ipif' is an IPMP data address.
2102  */
2103 boolean_t
2104 ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2105 {
2106 	if (ipif->ipif_flags & IPIF_NOFAILOVER)
2107 		return (B_FALSE);
2108 	if (ipif->ipif_ill->ill_isv6)
2109 		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2110 	else
2111 		return (ipif->ipif_lcl_addr != INADDR_ANY);
2112 }
2113 
2114 /*
2115  * Check if `ipif' is an IPIF_UP IPMP data address.
2116  */
2117 static boolean_t
2118 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2119 {
2120 	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2121 }
2122 
2123 /*
2124  * Check if `mp' contains a probe packet by verifying if the IP source address
2125  * is a test address on an underlying interface `ill'. Caller need not be inside
2126  * the IPSQ.
2127  */
2128 boolean_t
2129 ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
2130 {
2131 	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2132 	ipha_t *ipha = (ipha_t *)mp->b_rptr;
2133 
2134 	ASSERT(DB_TYPE(mp) != M_CTL);
2135 
2136 	if (!IS_UNDER_IPMP(ill))
2137 		return (B_FALSE);
2138 
2139 	if (ill->ill_isv6) {
2140 		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
2141 		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
2142 			return (B_TRUE);
2143 	} else {
2144 		if ((ipha->ipha_src != INADDR_ANY) &&
2145 		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
2146 			return (B_TRUE);
2147 	}
2148 	return (B_FALSE);
2149 }
2150 
2151 /*
2152  * Pick out an appropriate underlying interface for packet transmit.  This
2153  * function may be called from the data path, so we need to verify that the
2154  * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
2155  * Caller need not be inside the IPSQ.
2156  */
2157 ill_t *
2158 ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
2159 {
2160 	ill_t *xmit_ill;
2161 	ip_stack_t *ipst = ill->ill_ipst;
2162 
2163 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2164 	if (ill->ill_grp == NULL) {
2165 		/*
2166 		 * The interface was taken out of the group. Return ill itself,
2167 		 * but take a ref so that callers will always be able to do
2168 		 * ill_refrele(ill);
2169 		 */
2170 		rw_exit(&ipst->ips_ill_g_lock);
2171 		ill_refhold(ill);
2172 		return (ill);
2173 	}
2174 	if (!is_unicast)
2175 		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
2176 	else
2177 		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
2178 	rw_exit(&ipst->ips_ill_g_lock);
2179 	return (xmit_ill);
2180 }
2181 
2182 /*
2183  * Flush out any nce that points at `ncec' from an underlying interface
2184  */
2185 void
2186 ipmp_ncec_flush_nce(ncec_t *ncec)
2187 {
2188 	ill_t		*ncec_ill = ncec->ncec_ill;
2189 	ill_t		*ill;
2190 	ipmp_illgrp_t	*illg;
2191 	ip_stack_t	*ipst = ncec_ill->ill_ipst;
2192 	list_t		dead;
2193 	nce_t		*nce;
2194 
2195 	if (!IS_IPMP(ncec_ill))
2196 		return;
2197 
2198 	illg = ncec_ill->ill_grp;
2199 	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
2200 
2201 	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2202 	ill = list_head(&illg->ig_if);
2203 	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
2204 		nce_fastpath_list_delete(ill, ncec, &dead);
2205 	}
2206 	rw_exit(&ipst->ips_ill_g_lock);
2207 
2208 	/*
2209 	 * we may now nce_refrele() all dead entries since all locks have been
2210 	 * dropped.
2211 	 */
2212 	while ((nce = list_head(&dead)) != NULL) {
2213 		list_remove(&dead, nce);
2214 		nce_refrele(nce);
2215 	}
2216 	ASSERT(list_is_empty(&dead));
2217 	list_destroy(&dead);
2218 }
2219 
2220 /*
2221  * For each interface in the IPMP group, if there are nce_t entries for the IP
2222  * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
2223  * information must be updated to match the link-layer address information in
2224  * `ncec'.
2225  */
2226 void
2227 ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
2228 {
2229 	ill_t		*ill;
2230 	ipmp_illgrp_t	*illg = ipmp_ill->ill_grp;
2231 	ip_stack_t	*ipst = ipmp_ill->ill_ipst;
2232 	nce_t		*nce, *nce_next;
2233 	list_t		replace;
2234 
2235 	ASSERT(IS_IPMP(ipmp_ill));
2236 
2237 	/*
2238 	 * if ncec itself is not reachable, there is no use in creating nce_t
2239 	 * entries on the underlying interfaces in the group.
2240 	 */
2241 	if (!NCE_ISREACHABLE(ncec))
2242 		return;
2243 
2244 	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
2245 	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2246 	ill = list_head(&illg->ig_actif);
2247 	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
2248 		/*
2249 		 * For each underlying interface, we first check if there is an
2250 		 * nce_t for the address in ncec->ncec_addr. If one exists,
2251 		 * we should trigger nce_fastpath for that nce_t. However, the
2252 		 * catch is that we are holding the ips_ipmp_lock to prevent
2253 		 * changes to the IPMP group membership, so that we cannot
2254 		 * putnext() to the driver.  So we nce_delete the
2255 		 * list nce_t entries that need to be updated into the
2256 		 * `replace' list, and then process the `replace' list
2257 		 * after dropping the ips_ipmp_lock.
2258 		 */
2259 		mutex_enter(&ill->ill_lock);
2260 		for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
2261 			nce_next = list_next(&ill->ill_nce, nce);
2262 			if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
2263 			    &ncec->ncec_addr)) {
2264 				nce = nce_next;
2265 				continue;
2266 			}
2267 			nce_refhold(nce);
2268 			nce_delete(nce);
2269 			list_insert_tail(&replace, nce);
2270 			nce = nce_next;
2271 		}
2272 		mutex_exit(&ill->ill_lock);
2273 	}
2274 	rw_exit(&ipst->ips_ipmp_lock);
2275 	/*
2276 	 * `replace' now has the list of nce's on which we should be triggering
2277 	 * nce_fastpath(). We now retrigger fastpath by setting up the nce
2278 	 * again. The code in nce_lookup_then_add_v* ensures that nce->nce_ill
2279 	 * is still in the group for ncec->ncec_ill
2280 	 */
2281 	while ((nce = list_head(&replace)) != NULL) {
2282 		list_remove(&replace, nce);
2283 		if (ncec->ncec_ill->ill_isv6) {
2284 			(void) nce_lookup_then_add_v6(nce->nce_ill,
2285 			    ncec->ncec_lladdr,  ncec->ncec_lladdr_length,
2286 			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
2287 			    NULL);
2288 		} else {
2289 			ipaddr_t ipaddr;
2290 
2291 			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
2292 			(void) nce_lookup_then_add_v4(nce->nce_ill,
2293 			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2294 			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
2295 		}
2296 		nce_refrele(nce);
2297 	}
2298 	ASSERT(list_is_empty(&replace));
2299 	list_destroy(&replace);
2300 }
2301