1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 *
21 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
22 */
23
24 #include <inet/ip.h>
25 #include <inet/ip6.h>
26 #include <inet/ip_if.h>
27 #include <inet/ip_ire.h>
28 #include <inet/ip_multi.h>
29 #include <inet/ip_ndp.h>
30 #include <inet/ip_rts.h>
31 #include <inet/mi.h>
32 #include <net/if_types.h>
33 #include <sys/dlpi.h>
34 #include <sys/kmem.h>
35 #include <sys/modhash.h>
36 #include <sys/sdt.h>
37 #include <sys/strsun.h>
38 #include <sys/sunddi.h>
39 #include <sys/types.h>
40
41 /*
42 * Convenience macros for getting the ip_stack_t associated with an
43 * ipmp_illgrp_t or ipmp_grp_t.
44 */
45 #define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint)
46 #define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst)
47
48 /*
49 * Assorted constants that aren't important enough to be tunable.
50 */
51 #define IPMP_GRP_HASH_SIZE 64
52 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */
53
54 /*
55 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
56 */
57 static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
58 { "obytes", KSTAT_DATA_UINT32 },
59 { "obytes64", KSTAT_DATA_UINT64 },
60 { "rbytes", KSTAT_DATA_UINT32 },
61 { "rbytes64", KSTAT_DATA_UINT64 },
62 { "opackets", KSTAT_DATA_UINT32 },
63 { "opackets64", KSTAT_DATA_UINT64 },
64 { "oerrors", KSTAT_DATA_UINT32 },
65 { "ipackets", KSTAT_DATA_UINT32 },
66 { "ipackets64", KSTAT_DATA_UINT64 },
67 { "ierrors", KSTAT_DATA_UINT32 },
68 { "multircv", KSTAT_DATA_UINT32 },
69 { "multixmt", KSTAT_DATA_UINT32 },
70 { "brdcstrcv", KSTAT_DATA_UINT32 },
71 { "brdcstxmt", KSTAT_DATA_UINT32 },
72 { "link_up", KSTAT_DATA_UINT32 }
73 };
74
75 static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
76 static int ipmp_grp_create_kstats(ipmp_grp_t *);
77 static int ipmp_grp_update_kstats(kstat_t *, int);
78 static void ipmp_grp_destroy_kstats(ipmp_grp_t *);
79 static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *);
80 static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *);
81 static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
82 static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t);
83 static boolean_t ipmp_ill_activate(ill_t *);
84 static void ipmp_ill_deactivate(ill_t *);
85 static void ipmp_ill_ire_mark_testhidden(ire_t *, char *);
86 static void ipmp_ill_ire_clear_testhidden(ire_t *, char *);
87 static void ipmp_ill_refresh_active_timer_start(ill_t *);
88 static void ipmp_ill_rtsaddrmsg(ill_t *, int);
89 static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
90 static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
91 static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
92 static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
93 static void ipmp_ncec_delete_nonlocal(ncec_t *, uchar_t *);
94
95 /*
96 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
97 */
98 void
99 ipmp_init(ip_stack_t *ipst)
100 {
101 ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
102 IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
103 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
104 rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
105 }
106
107 /*
108 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
109 */
110 void
111 ipmp_destroy(ip_stack_t *ipst)
112 {
113 mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
114 rw_destroy(&ipst->ips_ipmp_lock);
115 }
116
117 /*
118 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
119 * and add it to the hash. On success, return a pointer to the created group.
120 * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP
121 * meta-interface associated with the group also has the same name (but they
122 * may differ later via ipmp_grp_rename()).
123 */
124 ipmp_grp_t *
125 ipmp_grp_create(const char *grname, phyint_t *phyi)
126 {
127 ipmp_grp_t *grp;
128 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
129 mod_hash_hndl_t mh;
130
131 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
132
133 if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
134 return (NULL);
135
136 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
137 (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));
138
139 /*
140 * Cache the group's phyint. This is safe since a phyint_t will
141 * outlive its ipmp_grp_t.
142 */
143 grp->gr_phyint = phyi;
144
145 /*
146 * Create IPMP group kstats.
147 */
148 if (ipmp_grp_create_kstats(grp) != 0) {
149 kmem_free(grp, sizeof (ipmp_grp_t));
150 return (NULL);
151 }
152
153 /*
154 * Insert the group into the hash.
155 */
156 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
157 ipmp_grp_destroy_kstats(grp);
158 kmem_free(grp, sizeof (ipmp_grp_t));
159 return (NULL);
160 }
161 ipmp_grp_insert(grp, mh);
162
163 return (grp);
164 }
165
166 /*
167 * Create IPMP kstat structures for `grp'. Return an errno upon failure.
168 */
169 static int
170 ipmp_grp_create_kstats(ipmp_grp_t *grp)
171 {
172 kstat_t *ksp;
173 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
174
175 ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
176 KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
177 if (ksp == NULL)
178 return (ENOMEM);
179
180 ksp->ks_update = ipmp_grp_update_kstats;
181 ksp->ks_private = grp;
182 bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));
183
184 kstat_install(ksp);
185 grp->gr_ksp = ksp;
186 return (0);
187 }
188
189 /*
190 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
191 */
192 static int
193 ipmp_grp_update_kstats(kstat_t *ksp, int rw)
194 {
195 uint_t i;
196 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
197 ipmp_grp_t *grp = ksp->ks_private;
198 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
199 ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
200 phyint_t *phyi;
201 uint64_t phyi_kstats[IPMP_KSTAT_MAX];
202
203 if (rw == KSTAT_WRITE)
204 return (EACCES);
205
206 /*
207 * Start with the group's baseline values.
208 */
209 for (i = 0; i < IPMP_KSTAT_MAX; i++) {
210 if (kn[i].data_type == KSTAT_DATA_UINT32) {
211 kn[i].value.ui32 = grp->gr_kstats0[i];
212 } else {
213 ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
214 kn[i].value.ui64 = grp->gr_kstats0[i];
215 }
216 }
217
218 /*
219 * Add in the stats of each phyint currently in the group. Since we
220 * don't directly track the phyints in a group, we cheat by walking
221 * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while
222 * ill_g_lock is held.)
223 */
224 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
225 ipsq = grp_ipsq->ipsq_next;
226 for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
227 phyi = ipsq->ipsq_phyint;
228
229 /*
230 * If a phyint in a group is being unplumbed, it's possible
231 * that ill_glist_delete() -> phyint_free() already freed the
232 * phyint (and set ipsq_phyint to NULL), but the unplumb
233 * operation has yet to complete (and thus ipsq_dq() has yet
234 * to remove the phyint's IPSQ from the group IPSQ's phyint
235 * list). We skip those phyints here (note that their kstats
236 * have already been added to gr_kstats0[]).
237 */
238 if (phyi == NULL)
239 continue;
240
241 ipmp_phyint_get_kstats(phyi, phyi_kstats);
242
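		/*
		 * Add this phyint's contribution since it joined the group:
		 * its current counters minus the phyint_kstats0[] baseline
		 * captured at join time.
		 */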
243 for (i = 0; i < IPMP_KSTAT_MAX; i++) {
244 phyi_kstats[i] -= phyi->phyint_kstats0[i];
245 if (kn[i].data_type == KSTAT_DATA_UINT32)
246 kn[i].value.ui32 += phyi_kstats[i];
247 else
248 kn[i].value.ui64 += phyi_kstats[i];
249 }
250 }
251
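	/*
	 * Finally, report the group link state based on whether the IPMP
	 * phyint is currently marked PHYI_RUNNING.
	 */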
252 kn[IPMP_KSTAT_LINK_UP].value.ui32 =
253 (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;
254
255 rw_exit(&ipst->ips_ill_g_lock);
256 return (0);
257 }
258
259 /*
260 * Destroy IPMP kstat structures for `grp'.
261 */
262 static void
263 ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
264 {
265 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;
266
267 kstat_delete_netstack(grp->gr_ksp, id);
268 bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
269 grp->gr_ksp = NULL;
270 }
271
272 /*
273 * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it
274 * does not exist.
275 */
276 ipmp_grp_t *
277 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
278 {
279 ipmp_grp_t *grp;
280
281 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
282
283 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
284 (mod_hash_val_t *)&grp) == 0)
285 return (grp);
286
287 return (NULL);
288 }
289
290 /*
291 * Place information about group `grp' into `lifgr'.
292 */
293 void
294 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
295 {
296 ill_t *ill;
297 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
298
299 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
300
301 lifgr->gi_v4 = (grp->gr_v4 != NULL);
302 lifgr->gi_v6 = (grp->gr_v6 != NULL);
303 lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
304 lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
305 lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
306 (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
307 lifgr->gi_m4ifname[0] = '\0';
308 lifgr->gi_m6ifname[0] = '\0';
309 lifgr->gi_bcifname[0] = '\0';
310
311 if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
312 (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
313 (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
314 }
315
316 if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
317 (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
318 }
319
320 /*
321 * Insert `grp' into the hash using the reserved hash entry `mh'.
322 * Caller must ensure `grp' is not yet in the hash.
323 */
324 static void
325 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
326 {
327 int err;
328 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
329
330 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
331
332 /*
333 * Since grp->gr_name will exist at least as long as `grp' is in the
334 * hash, we use it directly as the key.
335 */
336 err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
337 (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
338 if (err != 0) {
339 /*
340 * This should never happen since `mh' was preallocated.
341 */
342 panic("cannot insert IPMP group \"%s\" (err %d)",
343 grp->gr_name, err);
344 }
345 }
346
347 /*
348 * Remove `grp' from the hash. Caller must ensure `grp' is in it.
349 */
350 static void
351 ipmp_grp_remove(ipmp_grp_t *grp)
352 {
353 int err;
354 mod_hash_val_t val;
355 mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
356 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
357
358 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
359
360 err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
361 if (err != 0 || val != grp) {
362 panic("cannot remove IPMP group \"%s\" (err %d)",
363 grp->gr_name, err);
364 }
365 }
366
367 /*
368 * Attempt to rename `grp' to new name `grname'. Return an errno if the new
369 * group name already exists or is invalid, or if there isn't enough memory.
370 */
371 int
372 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
373 {
374 mod_hash_hndl_t mh;
375 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
376
377 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
378
379 if (grname[0] == '\0')
380 return (EINVAL);
381
382 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
383 (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
384 return (EEXIST);
385
386 /*
387 * Before we remove the group from the hash, ensure we'll be able to
388 * re-insert it by reserving space.
389 */
390 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
391 return (ENOMEM);
392
393 ipmp_grp_remove(grp);
394 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
395 ipmp_grp_insert(grp, mh);
396
397 return (0);
398 }
399
400 /*
401 * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in
402 * the hash, and that there are no interfaces on it.
403 */
404 void
405 ipmp_grp_destroy(ipmp_grp_t *grp)
406 {
407 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
408
409 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
410
411 /*
412 * If there are still interfaces using this group, panic before things
413 * go really off the rails.
414 */
415 if (grp->gr_nif != 0)
416 panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);
417
418 ipmp_grp_remove(grp);
419 ipmp_grp_destroy_kstats(grp);
420
421 ASSERT(grp->gr_v4 == NULL);
422 ASSERT(grp->gr_v6 == NULL);
423 ASSERT(grp->gr_nv4 == 0);
424 ASSERT(grp->gr_nv6 == 0);
425 ASSERT(grp->gr_nactif == 0);
426 ASSERT(grp->gr_linkdownmp == NULL);
427 grp->gr_phyint = NULL;
428
429 kmem_free(grp, sizeof (ipmp_grp_t));
430 }
431
432 /*
433 * Check whether `ill' is suitable for inclusion into `grp', and return an
434 * errno describing the problem (if any). NOTE: many of these errno values
435 * are interpreted by ifconfig, which will take corrective action and retry
436 * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
437 */
438 static int
439 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
440 {
441 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
442
443 ASSERT(IAM_WRITER_ILL(ill));
444 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
445
446 /*
447 * To sidestep complicated address migration logic in the kernel and
448 * to force the kernel's all-hosts multicast memberships to be blown
449 * away, all addresses that had been brought up must be brought back
450 * down prior to adding an interface to a group. (This includes
451 * addresses currently down due to DAD.) Once the interface has been
452 * added to the group, its addresses can then be brought back up, at
453 * which point they will be moved to the IPMP meta-interface.
454 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
455 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
456 */
457 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
458 return (EADDRINUSE);
459
460 /*
461 * To avoid confusing applications by changing addresses that are
462 * under their control, all such control must be removed prior to
463 * adding an interface into a group.
464 */
465 if (ill_appaddr_cnt(ill) != 0)
466 return (EADDRNOTAVAIL);
467
468 /*
469 * Since PTP addresses do not share the same broadcast domain, they
470 * are not allowed to be in an IPMP group.
471 */
472 if (ill_ptpaddr_cnt(ill) != 0)
473 return (EINVAL);
474
475 /*
476 * An ill must support multicast to be allowed into a group.
477 */
478 if (!(ill->ill_flags & ILLF_MULTICAST))
479 return (ENOTSUP);
480
481 /*
482 * An ill must strictly be using ARP and/or ND for address
483 * resolution for it to be allowed into a group.
484 */
485 if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
486 return (ENOTSUP);
487
488 /*
489 * An ill cannot also be using usesrc groups. (Although usesrc uses
490 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
491 * all its modifications as writer.)
492 */
493 if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
494 return (ENOTSUP);
495
496 /*
497 * All ills in a group must be the same mactype.
498 */
499 if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
500 return (EINVAL);
501
502 return (0);
503 }
504
505 /*
506 * Check whether `phyi' is suitable for inclusion into `grp', and return an
507 * errno describing the problem (if any). See comment above ipmp_grp_vet_ill()
508 * regarding errno values.
509 */
510 int
511 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
512 {
513 int err = 0;
514 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
515
516 ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
517 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));
518
519 /*
520 * An interface cannot have address families plumbed that are not
521 * configured in the group.
522 */
523 if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
524 phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
525 return (EAFNOSUPPORT);
526
527 if (phyi->phyint_illv4 != NULL)
528 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
529 if (err == 0 && phyi->phyint_illv6 != NULL)
530 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);
531
532 return (err);
533 }
534
535 /*
536 * Create a new illgrp on IPMP meta-interface `ill'.
537 */
538 ipmp_illgrp_t *
539 ipmp_illgrp_create(ill_t *ill)
540 {
541 uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
542 ipmp_illgrp_t *illg;
543
544 ASSERT(IAM_WRITER_ILL(ill));
545 ASSERT(IS_IPMP(ill));
546 ASSERT(ill->ill_grp == NULL);
547
548 if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
549 return (NULL);
550
551 list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
552 list_create(&illg->ig_actif, sizeof (ill_t),
553 offsetof(ill_t, ill_actnode));
554 list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
555 offsetof(ipmp_arpent_t, ia_node));
556
557 illg->ig_ipmp_ill = ill;
558 ill->ill_grp = illg;
559 ipmp_illgrp_set_mtu(illg, mtu, mtu);
560
561 return (illg);
562 }
563
564 /*
565 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
566 */
567 void
568 ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
569 {
570 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
571 ASSERT(IS_IPMP(illg->ig_ipmp_ill));
572
573 /*
574 * Verify `illg' is empty.
575 */
576 ASSERT(illg->ig_next_ill == NULL);
577 ASSERT(illg->ig_cast_ill == NULL);
578 ASSERT(list_is_empty(&illg->ig_arpent));
579 ASSERT(list_is_empty(&illg->ig_if));
580 ASSERT(list_is_empty(&illg->ig_actif));
581 ASSERT(illg->ig_nactif == 0);
582
583 /*
584 * Destroy `illg'.
585 */
586 illg->ig_ipmp_ill->ill_grp = NULL;
587 illg->ig_ipmp_ill = NULL;
588 list_destroy(&illg->ig_if);
589 list_destroy(&illg->ig_actif);
590 list_destroy(&illg->ig_arpent);
591 kmem_free(illg, sizeof (ipmp_illgrp_t));
592 }
593
594 /*
595 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
596 * bind it to an underlying ill, while keeping an even address distribution.
597 * If the bind is successful, return a pointer to the bound ill.
598 */
599 ill_t *
600 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
601 {
602 ill_t *minill;
603 ipmp_arpent_t *entp;
604
605 ASSERT(IAM_WRITER_IPIF(ipif));
606 ASSERT(ipmp_ipif_is_dataaddr(ipif));
607
608 /*
609 * IPMP data address mappings are internally managed by IP itself, so
610 * delete any existing ARP entries associated with the address.
611 */
612 if (!ipif->ipif_isv6) {
613 entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
614 if (entp != NULL)
615 ipmp_illgrp_destroy_arpent(illg, entp);
616 }
617
618 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
619 ipmp_ill_bind_ipif(minill, ipif, Res_act_none);
620
621 return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
622 }
623
624 /*
625 * Delete `ipif' from the pool of usable data addresses on `illg'. If it's
626 * bound, unbind it from the underlying ill while keeping an even address
627 * distribution.
628 */
629 void
630 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
631 {
632 ill_t *maxill, *boundill = ipif->ipif_bound_ill;
633
634 ASSERT(IAM_WRITER_IPIF(ipif));
635
636 if (boundill != NULL) {
637 (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);
638
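		/*
		 * If the most-loaded active ill now has at least two more
		 * bound addresses than `boundill', move one address over to
		 * keep the distribution even.
		 */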
639 maxill = ipmp_illgrp_max_ill(illg);
640 if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
641 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
642 ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
643 }
644 }
645 }
646
647 /*
648 * Return the active ill with the greatest number of data addresses in `illg'.
649 */
650 static ill_t *
651 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
652 {
653 ill_t *ill, *bestill = NULL;
654
655 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
656
657 ill = list_head(&illg->ig_actif);
658 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
659 if (bestill == NULL ||
660 ill->ill_bound_cnt > bestill->ill_bound_cnt) {
661 bestill = ill;
662 }
663 }
664 return (bestill);
665 }
666
667 /*
668  * Return the active ill with the fewest data addresses in `illg'.
669 */
670 static ill_t *
671 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
672 {
673 ill_t *ill, *bestill = NULL;
674
675 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
676
677 ill = list_head(&illg->ig_actif);
678 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
679 if (bestill == NULL ||
680 ill->ill_bound_cnt < bestill->ill_bound_cnt) {
681 if (ill->ill_bound_cnt == 0)
682 return (ill); /* can't get better */
683 bestill = ill;
684 }
685 }
686 return (bestill);
687 }
688
689 /*
690  * Return a pointer to the IPMP meta-interface for `illg' (which must exist).
691 * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
692 */
693 ill_t *
694 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
695 {
696 return (illg->ig_ipmp_ill);
697 }
698
699 /*
700 * Return a pointer to the next available underlying ill in `illg', or NULL if
701 * one doesn't exist. Caller must be inside the IPSQ.
702 */
703 ill_t *
704 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
705 {
706 ill_t *ill;
707 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
708
709 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
710
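	/*
	 * Round-robin: hand back the current ig_next_ill and advance the
	 * cursor to the next active ill, wrapping to the head of the list.
	 */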
711 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
712 if ((ill = illg->ig_next_ill) != NULL) {
713 illg->ig_next_ill = list_next(&illg->ig_actif, ill);
714 if (illg->ig_next_ill == NULL)
715 illg->ig_next_ill = list_head(&illg->ig_actif);
716 }
717 rw_exit(&ipst->ips_ipmp_lock);
718
719 return (ill);
720 }
721
722 /*
723 * Return a held pointer to the next available underlying ill in `illg', or
724 * NULL if one doesn't exist. Caller need not be inside the IPSQ.
725 */
726 ill_t *
727 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
728 {
729 ill_t *ill;
730 uint_t i;
731 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
732
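	/*
	 * Walk at most ig_nactif entries so we terminate even if every
	 * active ill fails ill_check_and_refhold() (e.g., is condemned).
	 */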
733 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
734 for (i = 0; i < illg->ig_nactif; i++) {
735 ill = illg->ig_next_ill;
736 illg->ig_next_ill = list_next(&illg->ig_actif, ill);
737 if (illg->ig_next_ill == NULL)
738 illg->ig_next_ill = list_head(&illg->ig_actif);
739
740 if (ill_check_and_refhold(ill)) {
741 rw_exit(&ipst->ips_ipmp_lock);
742 return (ill);
743 }
744 }
745 rw_exit(&ipst->ips_ipmp_lock);
746
747 return (NULL);
748 }
749
750 /*
751 * Return a held pointer to the nominated multicast ill in `illg', or NULL if
752 * one doesn't exist. Caller need not be inside the IPSQ.
753 */
754 ill_t *
755 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
756 {
757 ill_t *castill;
758 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
759
760 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
761 castill = illg->ig_cast_ill;
762 if (castill != NULL && ill_check_and_refhold(castill)) {
763 rw_exit(&ipst->ips_ipmp_lock);
764 return (castill);
765 }
766 rw_exit(&ipst->ips_ipmp_lock);
767 return (NULL);
768 }
769
770 /*
771 * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
772 * any existing nomination is removed. Caller must be inside the IPSQ.
773 */
774 static void
775 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
776 {
777 ill_t *ocastill = illg->ig_cast_ill;
778 ill_t *ipmp_ill = illg->ig_ipmp_ill;
779 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
780
781 ASSERT(IAM_WRITER_ILL(ipmp_ill));
782
783 /*
784 * Disable old nominated ill (if any).
785 */
786 if (ocastill != NULL) {
787 DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
788 illg, ill_t *, ocastill);
789 ASSERT(ocastill->ill_nom_cast);
790 ocastill->ill_nom_cast = B_FALSE;
791 /*
792 * If the IPMP meta-interface is down, we never did the join,
793 * so we must not try to leave.
794 */
795 if (ipmp_ill->ill_dl_up)
796 ill_leave_multicast(ipmp_ill);
797
798 /*
799 * Delete any NCEs tied to the old nomination. We must do this
800 * last since ill_leave_multicast() may trigger IREs to be
801 * built using ig_cast_ill.
802 */
803 ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
804 ocastill->ill_ipst);
805 }
806
807 /*
808 * Set new nomination.
809 */
810 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
811 illg->ig_cast_ill = castill;
812 rw_exit(&ipst->ips_ipmp_lock);
813
814 /*
815 * Enable new nominated ill (if any).
816 */
817 if (castill != NULL) {
818 DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
819 illg, ill_t *, castill);
820 ASSERT(!castill->ill_nom_cast);
821 castill->ill_nom_cast = B_TRUE;
822 /*
823 * If the IPMP meta-interface is down, the attempt to recover
824 * will silently fail but ill_need_recover_multicast will be
825 * erroneously cleared -- so check first.
826 */
827 if (ipmp_ill->ill_dl_up)
828 ill_recover_multicast(ipmp_ill);
829 }
830 }
831
832 /*
833 * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
834 * entry for the same IP address already exists, destroy it first. Return the
835 * created IPMP ARP entry, or NULL on failure.
836 */
837 ipmp_arpent_t *
838 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
839 ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
840 {
841 ipmp_arpent_t *entp, *oentp;
842
843 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
844
845 if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
846 KM_NOSLEEP)) == NULL)
847 return (NULL);
848
849 /*
850 * Delete any existing ARP entry for this address.
851 */
852 	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &ipaddr)) != NULL)
853 ipmp_illgrp_destroy_arpent(illg, oentp);
854
855 /*
856 * Prepend the new entry.
857 */
858 entp->ia_ipaddr = ipaddr;
859 entp->ia_flags = flags;
860 entp->ia_lladdr_len = lladdr_len;
861 entp->ia_lladdr = (uchar_t *)&entp[1];
862 bcopy(lladdr, entp->ia_lladdr, lladdr_len);
863 entp->ia_proxyarp = proxyarp;
864 entp->ia_notified = B_TRUE;
865 list_insert_head(&illg->ig_arpent, entp);
866 return (entp);
867 }
868
869 /*
870 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
871 */
872 void
873 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
874 {
875 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
876
877 list_remove(&illg->ig_arpent, entp);
878 kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
879 }
880
881 /*
882 * Mark that ARP has been notified about the IP address on `entp'; `illg' is
883 * taken as a debugging aid for DTrace FBT probes.
884 */
885 /* ARGSUSED */
886 void
887 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
888 {
889 entp->ia_notified = B_TRUE;
890 }
891
892 /*
893 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
894 * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist.
895 */
896 ipmp_arpent_t *
897 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
898 {
899 ipmp_arpent_t *entp = list_head(&illg->ig_arpent);
900
901 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
902
903 if (addrp == NULL)
904 return (entp);
905
906 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
907 if (entp->ia_ipaddr == *addrp)
908 break;
909 return (entp);
910 }
911
912 /*
913 * Refresh ARP entries on `illg' to be distributed across its active
914 * interfaces. Entries that cannot be refreshed (e.g., because there are no
915 * active interfaces) are marked so that subsequent calls can try again.
916 */
917 void
918 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
919 {
920 ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
921 uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
922 ipmp_arpent_t *entp;
923 ncec_t *ncec;
924 nce_t *nce;
925
926 ASSERT(IAM_WRITER_ILL(ipmp_ill));
927 ASSERT(!ipmp_ill->ill_isv6);
928
929 ill = list_head(&illg->ig_actif);
930 entp = list_head(&illg->ig_arpent);
931 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
932 if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
933 entp->ia_notified = B_FALSE;
934 continue;
935 }
936
937 ASSERT(paddrlen == ill->ill_phys_addr_length);
938
939 /*
940 * If this is a proxy ARP entry, we can skip notifying ARP if
941 * the entry is already up-to-date. If it has changed, we
942 * update the entry's hardware address before notifying ARP.
943 */
944 if (entp->ia_proxyarp) {
945 if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
946 paddrlen) == 0 && entp->ia_notified)
947 continue;
948 bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
949 }
950
951 (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
952 paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
953 &nce);
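		/*
		 * If we couldn't create the nce, or this isn't a proxy ARP
		 * entry, there's nothing further to update for this entry.
		 */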
954 if (nce == NULL || !entp->ia_proxyarp) {
955 if (nce != NULL)
956 nce_refrele(nce);
957 continue;
958 }
959 ncec = nce->nce_common;
960 mutex_enter(&ncec->ncec_lock);
961 nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
962 mutex_exit(&ncec->ncec_lock);
963 nce_refrele(nce);
964 ipmp_illgrp_mark_arpent(illg, entp);
965
966 if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
967 ill = list_head(&illg->ig_actif);
968 }
969 }
970
971 /*
972 * Return an interface in `illg' with the specified `physaddr', or NULL if one
973 * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ.
974 */
975 ill_t *
976 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
977 {
978 ill_t *ill;
979 ill_t *ipmp_ill = illg->ig_ipmp_ill;
980 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
981
982 ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));
983
984 ill = list_head(&illg->ig_if);
985 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
986 if (ill->ill_phys_addr_length == paddrlen &&
987 bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
988 return (ill);
989 }
990 return (NULL);
991 }
992
993 /*
994 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
995 * Caller must be inside the IPSQ unless this is initialization.
996 */
997 static void
998 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu)
999 {
1000 ill_t *ill = illg->ig_ipmp_ill;
1001 mblk_t *mp;
1002
1003 ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));
1004
1005 /*
1006 * If allocation fails, we have bigger problems than MTU.
1007 */
1008 if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) {
1009 illg->ig_mtu = mtu;
1010 illg->ig_mc_mtu = mc_mtu;
1011 put(ill->ill_rq, mp);
1012 }
1013 }
1014
1015 /*
1016 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
1017 * ill MTU if necessary.
1018 */
1019 void
1020 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
1021 {
1022 ill_t *ill;
1023 ill_t *ipmp_ill = illg->ig_ipmp_ill;
1024 uint_t mtu = 0;
1025 uint_t mc_mtu = 0;
1026
1027 ASSERT(IAM_WRITER_ILL(ipmp_ill));
1028
1029 /*
1030 * Since ill_mtu can only change under ill_lock, we hold ill_lock
1031 * for each ill as we iterate through the list. Any changes to the
1032 * ill_mtu will also trigger an update, so even if we missed it
1033 * this time around, the update will catch it.
1034 */
1035 ill = list_head(&illg->ig_if);
1036 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
1037 mutex_enter(&ill->ill_lock);
1038 if (mtu == 0 || ill->ill_mtu < mtu)
1039 mtu = ill->ill_mtu;
1040 if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu)
1041 mc_mtu = ill->ill_mc_mtu;
1042 mutex_exit(&ill->ill_lock);
1043 }
1044
1045 /*
1046 * MTU must be at least the minimum MTU.
1047 */
1048 mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1049 mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
1050 if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu)
1051 ipmp_illgrp_set_mtu(illg, mtu, mc_mtu);
1052 }
1053
1054 /*
1055 * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently
1056 * allow the same link to be established more than once.
1057 */
1058 void
1059 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
1060 {
1061 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1062
1063 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1064
1065 if (illg->ig_ipmp_ill->ill_isv6) {
1066 ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
1067 grp->gr_v6 = illg;
1068 } else {
1069 ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
1070 grp->gr_v4 = illg;
1071 }
1072 }
1073
1074 /*
1075 * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp
1076 * cannot be unlinked (e.g., because there are still interfaces using it).
1077 */
1078 int
1079 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
1080 {
1081 ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
1082 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1083
1084 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));
1085
1086 if (illg->ig_ipmp_ill->ill_isv6) {
1087 if (grp->gr_nv6 + grp->gr_pendv6 != 0)
1088 return (EBUSY);
1089 grp->gr_v6 = NULL;
1090 } else {
1091 if (grp->gr_nv4 + grp->gr_pendv4 != 0)
1092 return (EBUSY);
1093 grp->gr_v4 = NULL;
1094 }
1095 return (0);
1096 }
1097
1098 /*
1099 * Place `ill' into `illg', and rebalance the data addresses on `illg'
1100 * to be spread evenly across the ills now in it. Also, adjust the IPMP
1101 * ill as necessary to account for `ill' (e.g., MTU).
1102 */
1103 void
1104 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
1105 {
1106 ill_t *ipmp_ill;
1107 ipif_t *ipif;
1108 ip_stack_t *ipst = ill->ill_ipst;
1109
1110 /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
1111 ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
1112 ASSERT(IAM_WRITER_ILL(ill));
1113 ASSERT(ill->ill_grp == NULL);
1114
1115 ipmp_ill = illg->ig_ipmp_ill;
1116
1117 /*
1118 * Account for `ill' joining the illgrp.
1119 */
1120 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1121 if (ill->ill_isv6)
1122 ill->ill_phyint->phyint_grp->gr_nv6++;
1123 else
1124 ill->ill_phyint->phyint_grp->gr_nv4++;
1125 rw_exit(&ipst->ips_ipmp_lock);
1126
1127 /*
1128 * Ensure the ILLF_ROUTER flag remains consistent across the group.
1129 */
1130 mutex_enter(&ill->ill_lock);
1131 if (ipmp_ill->ill_flags & ILLF_ROUTER)
1132 ill->ill_flags |= ILLF_ROUTER;
1133 else
1134 ill->ill_flags &= ~ILLF_ROUTER;
1135 mutex_exit(&ill->ill_lock);
1136
1137 /*
1138 * Blow away all multicast memberships that currently exist on `ill'.
1139 * This may seem odd, but it's consistent with the application view
1140 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
1141 * The ill_grp_pending bit prevents multicast group joins after
1142 * update_conn_ill() and before ill_grp assignment.
1143 */
1144 mutex_enter(&ill->ill_mcast_serializer);
1145 ill->ill_grp_pending = 1;
1146 mutex_exit(&ill->ill_mcast_serializer);
1147 update_conn_ill(ill, ill->ill_ipst);
1148 if (ill->ill_isv6) {
1149 reset_mrt_ill(ill);
1150 } else {
1151 ipif = ill->ill_ipif;
1152 for (; ipif != NULL; ipif = ipif->ipif_next) {
1153 reset_mrt_vif_ipif(ipif);
1154 }
1155 }
1156 ip_purge_allmulti(ill);
1157
1158 /*
1159 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
1160 * physical address length. All other ills must have the same value,
1161 * since they are required to all be the same mactype. Also update
1162 * the IPMP ill's MTU and CoS marking, if necessary.
1163 */
1164 if (list_is_empty(&illg->ig_if)) {
1165 ASSERT(ipmp_ill->ill_phys_addr_length == 0);
1166 /*
1167 * NOTE: we leave ill_phys_addr NULL since the IPMP group
1168 * doesn't have a physical address. This means that code must
1169 * not assume that ill_phys_addr is non-NULL just because
1170 * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
1171 */
1172 ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
1173 ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
1174 ipmp_ill->ill_type = ill->ill_type;
1175
1176 if (ill->ill_flags & ILLF_COS_ENABLED) {
1177 mutex_enter(&ipmp_ill->ill_lock);
1178 ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1179 mutex_exit(&ipmp_ill->ill_lock);
1180 }
1181 ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu);
1182 } else {
1183 ASSERT(ipmp_ill->ill_phys_addr_length ==
1184 ill->ill_phys_addr_length);
1185 ASSERT(ipmp_ill->ill_type == ill->ill_type);
1186
1187 if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1188 mutex_enter(&ipmp_ill->ill_lock);
1189 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1190 mutex_exit(&ipmp_ill->ill_lock);
1191 }
1192 if (illg->ig_mtu > ill->ill_mtu ||
1193 illg->ig_mc_mtu > ill->ill_mc_mtu) {
1194 ipmp_illgrp_set_mtu(illg, ill->ill_mtu,
1195 ill->ill_mc_mtu);
1196 }
1197 }
1198
1199 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1200 list_insert_tail(&illg->ig_if, ill);
1201 ill->ill_grp = illg;
1202 rw_exit(&ipst->ips_ill_g_lock);
1203
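	/*
	 * Now that ill_grp is set, clear ill_grp_pending so that multicast
	 * joins on `ill' may proceed again.
	 */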
1204 mutex_enter(&ill->ill_mcast_serializer);
1205 ill->ill_grp_pending = 0;
1206 mutex_exit(&ill->ill_mcast_serializer);
1207
1208 /*
1209 * Hide the IREs on `ill' so that we don't accidentally find them when
1210 * sending data traffic.
1211 */
1212 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);
1213
1214 ipmp_ill_refresh_active(ill);
1215 }
1216
1217 /*
1218 * Remove `ill' from its illgrp, and rebalance the data addresses in that
1219 * illgrp to be spread evenly across the remaining ills. Also, adjust the
1220 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
1221 */
1222 void
1223 ipmp_ill_leave_illgrp(ill_t *ill)
1224 {
1225 ill_t *ipmp_ill;
1226 ipif_t *ipif;
1227 ipmp_arpent_t *entp;
1228 ipmp_illgrp_t *illg = ill->ill_grp;
1229 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1230
1231 ASSERT(IS_UNDER_IPMP(ill));
1232 ASSERT(IAM_WRITER_ILL(ill));
1233 ASSERT(illg != NULL);
1234
1235 ipmp_ill = illg->ig_ipmp_ill;
1236
1237 /*
1238 * Cancel IPMP-specific ill timeouts.
1239 */
1240 (void) untimeout(ill->ill_refresh_tid);
1241
1242 /*
1243 * Expose any previously-hidden IREs on `ill'.
1244 */
1245 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);
1246
1247 /*
1248 * Ensure the multicast state for each ipif on `ill' is down so that
1249 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
1250 * all eligible groups.
1251 */
1252 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1253 if (ipif->ipif_flags & IPIF_UP)
1254 ipif_multicast_down(ipif);
1255
1256 /*
1257 * Account for `ill' leaving the illgrp.
1258 */
1259 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1260 if (ill->ill_isv6)
1261 ill->ill_phyint->phyint_grp->gr_nv6--;
1262 else
1263 ill->ill_phyint->phyint_grp->gr_nv4--;
1264 rw_exit(&ipst->ips_ipmp_lock);
1265
1266 /*
1267 * Pull `ill' out of the interface lists.
1268 */
1269 if (list_link_active(&ill->ill_actnode))
1270 ipmp_ill_deactivate(ill);
1271 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
1272 list_remove(&illg->ig_if, ill);
1273 ill->ill_grp = NULL;
1274 rw_exit(&ipst->ips_ill_g_lock);
1275
1276 /*
1277 * Re-establish multicast memberships that were previously being
1278 * handled by the IPMP meta-interface.
1279 */
1280 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1281 if (ipif->ipif_flags & IPIF_UP)
1282 ipif_multicast_up(ipif);
1283
1284 /*
1285 * Refresh the group MTU based on the new interface list.
1286 */
1287 ipmp_illgrp_refresh_mtu(illg);
1288
1289 if (list_is_empty(&illg->ig_if)) {
1290 /*
1291 * No ills left in the illgrp; we no longer have a physical
1292 * address length, nor can we support ARP, CoS, or anything
1293 * else that depends on knowing the link layer type.
1294 */
1295 while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
1296 ipmp_illgrp_destroy_arpent(illg, entp);
1297
1298 ipmp_ill->ill_phys_addr_length = 0;
1299 ipmp_ill->ill_nd_lla_len = 0;
1300 ipmp_ill->ill_type = IFT_OTHER;
1301 mutex_enter(&ipmp_ill->ill_lock);
1302 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
1303 mutex_exit(&ipmp_ill->ill_lock);
1304 } else {
1305 /*
1306 * If `ill' didn't support CoS, see if it can now be enabled.
1307 */
1308 if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
1309 ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));
1310
1311 ill = list_head(&illg->ig_if);
1312 do {
1313 if (!(ill->ill_flags & ILLF_COS_ENABLED))
1314 break;
1315 } while ((ill = list_next(&illg->ig_if, ill)) != NULL);
1316
1317 if (ill == NULL) {
1318 mutex_enter(&ipmp_ill->ill_lock);
1319 ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
1320 mutex_exit(&ipmp_ill->ill_lock);
1321 }
1322 }
1323 }
1324 }
1325
1326 /*
1327 * Check if `ill' should be active, and activate or deactivate if need be.
1328 * Return B_FALSE if a refresh was necessary but could not be performed.
1329 */
1330 static boolean_t
1331 ipmp_ill_try_refresh_active(ill_t *ill)
1332 {
1333 boolean_t refreshed = B_TRUE;
1334
1335 ASSERT(IAM_WRITER_ILL(ill));
1336 ASSERT(IS_UNDER_IPMP(ill));
1337
1338 if (ipmp_ill_is_active(ill)) {
1339 if (!list_link_active(&ill->ill_actnode))
1340 refreshed = ipmp_ill_activate(ill);
1341 } else {
1342 if (list_link_active(&ill->ill_actnode))
1343 ipmp_ill_deactivate(ill);
1344 }
1345
1346 return (refreshed);
1347 }
1348
1349 /*
1350 * Check if `ill' should be active, and activate or deactivate if need be.
1351 * If the refresh fails, schedule a timer to try again later.
1352 */
1353 void
1354 ipmp_ill_refresh_active(ill_t *ill)
1355 {
1356 if (!ipmp_ill_try_refresh_active(ill))
1357 ipmp_ill_refresh_active_timer_start(ill);
1358 }
1359
1360 /*
1361 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
1362 */
1363 static void
1364 ipmp_ill_refresh_active_timer(void *ill_arg)
1365 {
1366 ill_t *ill = ill_arg;
1367 boolean_t refreshed = B_FALSE;
1368
1369 /*
1370 * Clear ill_refresh_tid to indicate that no timeout is pending
1371 * (another thread could schedule a new timeout while we're still
1372 * running, but that's harmless). If the ill is going away, bail.
1373 */
1374 mutex_enter(&ill->ill_lock);
1375 ill->ill_refresh_tid = 0;
1376 if (ill->ill_state_flags & ILL_CONDEMNED) {
1377 mutex_exit(&ill->ill_lock);
1378 return;
1379 }
1380 mutex_exit(&ill->ill_lock);
1381
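	/*
	 * Try to become exclusive on the ill; if that fails, `refreshed'
	 * stays false and another attempt is scheduled below.
	 */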
1382 if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
1383 refreshed = ipmp_ill_try_refresh_active(ill);
1384 ipsq_exit(ill->ill_phyint->phyint_ipsq);
1385 }
1386
1387 /*
1388 * If the refresh failed, schedule another attempt.
1389 */
1390 if (!refreshed)
1391 ipmp_ill_refresh_active_timer_start(ill);
1392 }
1393
1394 /*
1395  * Schedule a timer to retry ipmp_ill_try_refresh_active() on `ill'.
1396 */
1397 static void
1398 ipmp_ill_refresh_active_timer_start(ill_t *ill)
1399 {
1400 mutex_enter(&ill->ill_lock);
1401
1402 /*
1403 * If the ill is going away or a refresh is already scheduled, bail.
1404 */
1405 if (ill->ill_refresh_tid != 0 ||
1406 (ill->ill_state_flags & ILL_CONDEMNED)) {
1407 mutex_exit(&ill->ill_lock);
1408 return;
1409 }
1410
1411 ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
1412 SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));
1413
1414 mutex_exit(&ill->ill_lock);
1415 }
1416
1417 /*
1418 * Activate `ill' so it will be used to send and receive data traffic. Return
1419 * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
1420 * needed to deactivate `ill' here as well so that deactivation cannot fail.
1421 */
1422 static boolean_t
1423 ipmp_ill_activate(ill_t *ill)
1424 {
1425 ipif_t *ipif;
1426 mblk_t *linkupmp = NULL, *linkdownmp = NULL;
1427 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
1428 ipmp_illgrp_t *illg = ill->ill_grp;
1429 ill_t *maxill;
1430 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1431
1432 ASSERT(IAM_WRITER_ILL(ill));
1433 ASSERT(IS_UNDER_IPMP(ill));
1434
1435 /*
1436 * If this will be the first active interface in the group, allocate
1437 * the link-up and link-down messages.
1438 */
1439 if (grp->gr_nactif == 0) {
1440 linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
1441 linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
1442 if (linkupmp == NULL || linkdownmp == NULL)
1443 goto fail;
1444 }
1445
1446 if (list_is_empty(&illg->ig_actif)) {
1447 /*
1448 * Now that we have an active ill, nominate it for multicast
1449 * and broadcast duties. Do this before ipmp_ill_bind_ipif()
1450 * since that may need to send multicast packets (e.g., IPv6
1451 * neighbor discovery probes).
1452 */
1453 ipmp_illgrp_set_cast(illg, ill);
1454
1455 /*
1456 * This is the first active ill in the illgrp -- add 'em all.
1457 * We can access/walk ig_ipmp_ill's ipif list since we're
1458 * writer on its IPSQ as well.
1459 */
1460 ipif = illg->ig_ipmp_ill->ill_ipif;
1461 for (; ipif != NULL; ipif = ipif->ipif_next)
1462 if (ipmp_ipif_is_up_dataaddr(ipif))
1463 ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
1464 } else {
1465 /*
1466 * Redistribute the addresses by moving them from the ill with
1467 * the most addresses until the ill being activated is at the
1468 * same level as the rest of the ills.
1469 */
1470 for (;;) {
1471 maxill = ipmp_illgrp_max_ill(illg);
1472 ASSERT(maxill != NULL);
1473 if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
1474 break;
1475 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
1476 ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
1477 }
1478 }
1479
1480 /*
1481 * Put the interface in the active list.
1482 */
1483 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1484 list_insert_tail(&illg->ig_actif, ill);
1485 illg->ig_nactif++;
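	/* Start round-robin unicast selection from the newly activated ill. */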
1486 illg->ig_next_ill = ill;
1487 rw_exit(&ipst->ips_ipmp_lock);
1488
1489 /*
1490 * Refresh static/proxy ARP entries to use `ill', if need be.
1491 */
1492 if (!ill->ill_isv6)
1493 ipmp_illgrp_refresh_arpent(illg);
1494
1495 /*
1496 * Finally, mark the group link up, if necessary.
1497 */
1498 if (grp->gr_nactif++ == 0) {
1499 ASSERT(grp->gr_linkdownmp == NULL);
1500 grp->gr_linkdownmp = linkdownmp;
1501 put(illg->ig_ipmp_ill->ill_rq, linkupmp);
1502 }
1503 return (B_TRUE);
1504 fail:
1505 freemsg(linkupmp);
1506 freemsg(linkdownmp);
1507 return (B_FALSE);
1508 }
1509
1510 /*
1511 * Deactivate `ill' so it will not be used to send or receive data traffic.
1512 */
1513 static void
1514 ipmp_ill_deactivate(ill_t *ill)
1515 {
1516 ill_t *minill, *ipmp_ill;
1517 ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
1518 mblk_t *mp;
1519 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
1520 ipmp_illgrp_t *illg = ill->ill_grp;
1521 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);
1522
1523 ASSERT(IAM_WRITER_ILL(ill));
1524 ASSERT(IS_UNDER_IPMP(ill));
1525
1526 ipmp_ill = illg->ig_ipmp_ill;
1527
1528 /*
1529 * Pull the interface out of the active list.
1530 */
1531 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1532 list_remove(&illg->ig_actif, ill);
1533 illg->ig_nactif--;
1534 illg->ig_next_ill = list_head(&illg->ig_actif);
1535 rw_exit(&ipst->ips_ipmp_lock);
1536
1537 /*
1538 * If the ill that's being deactivated had been nominated for
1539 * multicast/broadcast, nominate a new one.
1540 */
1541 if (ill == illg->ig_cast_ill)
1542 ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));
1543
1544 /*
1545 * Delete all nce_t entries using this ill, so that the next attempt
1546 * to send data traffic will revalidate cached nce's.
1547 */
1548 nce_flush(ill, B_TRUE);
1549
1550 /*
1551 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
1552 * we'll rebind them after we tell the resolver the ill is no longer
1553 * active. We must do things in this order or the resolver could
1554 * accidentally rebind to the ill we're trying to remove if multiple
1555 * ills in the group have the same hardware address (which is
1556 * unsupported, but shouldn't lead to a wedged machine).
1557 */
1558 while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
1559 ipif->ipif_bound_next = ubheadipif;
1560 ubheadipif = ipif;
1561 }
1562
1563 if (!ill->ill_isv6) {
1564 /*
1565 * Refresh static/proxy ARP entries that had been using `ill'.
1566 */
1567 ipmp_illgrp_refresh_arpent(illg);
1568 }
1569
1570 /*
1571 * Rebind each ipif from the deactivated ill to the active ill with
1572 * the fewest ipifs. If there are no active ills, the ipifs will
1573 * remain unbound.
1574 */
1575 for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
1576 ubnextipif = ipif->ipif_bound_next;
1577 ipif->ipif_bound_next = NULL;
1578
1579 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
1580 ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
1581 }
1582
1583 /*
1584 * Remove any IRE_IF_CLONEs for this ill since they might have an
1585 * ire_nce_cache/nce_common which refers to another ill in the group.
1586 */
1587 ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, ill,
1588 ill);
1589
1590 /*
1591 * Finally, if there are no longer any active interfaces, then delete
1592 * any NCECs associated with the group and mark the group link down.
1593 */
1594 if (--grp->gr_nactif == 0) {
1595 ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill, ipmp_ill, ipst);
1596 mp = grp->gr_linkdownmp;
1597 grp->gr_linkdownmp = NULL;
1598 ASSERT(mp != NULL);
1599 put(ipmp_ill->ill_rq, mp);
1600 }
1601 }
1602
1603 /*
1604 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
1605 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
1606 */
1607 static void
1608 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
1609 {
1610 ipif_t *ipif;
1611
1612 ASSERT(IAM_WRITER_ILL(ill));
1613 ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);
1614
1615 /*
1616 * If `ill' is truly down, there are no messages to generate since:
1617 *
1618 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
1619 * and its addresses by bringing them down. But that's already
1620 * true, so there's nothing to hide.
1621 *
1622 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
1623 * indicating that any previously-hidden up addresses are again
1624 * back up (along with the interface). But they aren't, so
1625 * there's nothing to expose.
1626 */
1627 if (ill->ill_ipif_up_count == 0)
1628 return;
1629
1630 if (cmd == RTM_ADD)
1631 ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);
1632
1633 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
1634 if (ipif->ipif_flags & IPIF_UP)
1635 ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);
1636
1637 if (cmd == RTM_DELETE)
1638 ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
1639 }
1640
1641 /*
1642 * Bind the address named by `ipif' to the underlying ill named by `ill'.
1643 * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act'
1644 * will indicate to the resolver whether this is an initial bringup of
1645 * `ipif', or just a rebind to another ill.
1646 */
1647 static void
1648 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
1649 {
1650 int err = 0;
1651 ip_stack_t *ipst = ill->ill_ipst;
1652
1653 ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
1654 ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
1655 ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
1656 ASSERT(ipif->ipif_bound_ill == NULL);
1657 ASSERT(ipif->ipif_bound_next == NULL);
1658
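	/*
	 * Prepend `ipif' to the ill's list of bound ipifs; ipif_bound_ill
	 * itself is updated under ipmp_lock so that readers outside the
	 * IPSQ see a consistent value.
	 */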
1659 ipif->ipif_bound_next = ill->ill_bound_ipif;
1660 ill->ill_bound_ipif = ipif;
1661 ill->ill_bound_cnt++;
1662 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1663 ipif->ipif_bound_ill = ill;
1664 rw_exit(&ipst->ips_ipmp_lock);
1665
1666 /*
1667 * If necessary, tell ARP/NDP about the new mapping. Note that
1668 * ipif_resolver_up() cannot fail for IPv6 ills.
1669 */
1670 if (act != Res_act_none) {
1671 if (ill->ill_isv6) {
1672 VERIFY(ipif_resolver_up(ipif, act) == 0);
1673 err = ipif_ndp_up(ipif, act == Res_act_initial);
1674 } else {
1675 err = ipif_resolver_up(ipif, act);
1676 }
1677
1678 /*
1679 * Since ipif_ndp_up() never returns EINPROGRESS and
1680 * ipif_resolver_up() only returns EINPROGRESS when the
1681 * associated ill is not up, we should never be here with
1682 * EINPROGRESS. We rely on this to simplify the design.
1683 */
1684 ASSERT(err != EINPROGRESS);
1685 }
1686 /* TODO: retry binding on failure? when? */
1687 ipif->ipif_bound = (err == 0);
1688 }
1689
1690 /*
1691 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
1692 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
1693 * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is
1694 * B_TRUE, notify the resolver about the change.
1695 */
1696 static ipif_t *
1697 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
1698 {
1699 ipif_t *previpif;
1700 ip_stack_t *ipst = ill->ill_ipst;
1701
1702 ASSERT(IAM_WRITER_ILL(ill));
1703 ASSERT(IS_UNDER_IPMP(ill));
1704
1705 /*
1706 * If necessary, find an ipif to unbind.
1707 */
1708 if (ipif == NULL) {
1709 if ((ipif = ill->ill_bound_ipif) == NULL) {
1710 ASSERT(ill->ill_bound_cnt == 0);
1711 return (NULL);
1712 }
1713 }
1714
1715 ASSERT(IAM_WRITER_IPIF(ipif));
1716 ASSERT(IS_IPMP(ipif->ipif_ill));
1717 ASSERT(ipif->ipif_bound_ill == ill);
1718 ASSERT(ill->ill_bound_cnt > 0);
1719
1720 /*
1721 * Unbind it.
1722 */
1723 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1724 ipif->ipif_bound_ill = NULL;
1725 rw_exit(&ipst->ips_ipmp_lock);
1726 ill->ill_bound_cnt--;
1727
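	/*
	 * Unlink `ipif' from the ill's singly-linked list of bound ipifs.
	 */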
1728 if (ill->ill_bound_ipif == ipif) {
1729 ill->ill_bound_ipif = ipif->ipif_bound_next;
1730 } else {
1731 previpif = ill->ill_bound_ipif;
1732 while (previpif->ipif_bound_next != ipif)
1733 previpif = previpif->ipif_bound_next;
1734
1735 previpif->ipif_bound_next = ipif->ipif_bound_next;
1736 }
1737 ipif->ipif_bound_next = NULL;
1738
1739 /*
1740 * If requested, notify the resolvers (provided we're bound).
1741 */
1742 if (notifyres && ipif->ipif_bound) {
1743 if (ill->ill_isv6)
1744 ipif_ndp_down(ipif);
1745 else
1746 (void) ipif_arp_down(ipif);
1747 }
1748 ipif->ipif_bound = B_FALSE;
1749
1750 return (ipif);
1751 }
1752
1753 /*
1754 * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if
1755 * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this
1756 * to determine whether an ill should be considered active, other consumers
1757 * may race and learn about an ill that should be deactivated/activated before
1758 * IPMP has performed the activation/deactivation. This should be safe though
1759 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
1760 * would've been cleaned up by ipmp_ill_deactivate().
1761 */
1762 boolean_t
1763 ipmp_ill_is_active(ill_t *ill)
1764 {
1765 phyint_t *phyi = ill->ill_phyint;
1766
1767 ASSERT(IS_UNDER_IPMP(ill));
1768 ASSERT(IAM_WRITER_ILL(ill) ||
1769 (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));
1770
1771 /*
1772 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
1773 * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This keeps the
1774 * link-flapping logic entirely in in.mpathd and lets us ignore
1775 * changes to PHYI_RUNNING.
1776 */
1777 return (!(ill->ill_ipif_up_count == 0 ||
1778 (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
1779 }
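
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a consumer that is not the exclusive IPSQ writer must take both locks named
 * above before asking whether the ill is active, e.g.:
 *
 *     boolean_t active;
 *
 *     mutex_enter(&ill->ill_lock);
 *     mutex_enter(&ill->ill_phyint->phyint_lock);
 *     active = ipmp_ill_is_active(ill);
 *     mutex_exit(&ill->ill_phyint->phyint_lock);
 *     mutex_exit(&ill->ill_lock);
 *
 * As the comment above notes, the answer may already be stale by the time the
 * locks are dropped, so callers must tolerate that race.
 */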
1780
1781 /*
1782 * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
1783 * with `ill_arg'.
1784 */
1785 static void
1786 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
1787 {
1788 ill_t *ill = (ill_t *)ill_arg;
1789
1790 ASSERT(IAM_WRITER_ILL(ill));
1791 ASSERT(!IS_IPMP(ill));
1792
1793 if (ire->ire_ill != ill)
1794 return;
1795
1796 if (IRE_HIDDEN_TYPE(ire->ire_type)) {
1797 DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
1798 ire->ire_testhidden = B_TRUE;
1799 }
1800 }
1801
1802 /*
1803 * IRE walker callback: clear ire_testhidden if the IRE has a source address
1804 * on `ill_arg'.
1805 */
1806 static void
1807 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
1808 {
1809 ill_t *ill = (ill_t *)ill_arg;
1810
1811 ASSERT(IAM_WRITER_ILL(ill));
1812 ASSERT(!IS_IPMP(ill));
1813
1814 if (ire->ire_ill == ill) {
1815 DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
1816 ire->ire_testhidden = B_FALSE;
1817 }
1818 }
1819
1820 /*
1821 * Return a held pointer to the IPMP ill for underlying interface `ill', or
1822 * NULL if one doesn't exist. (Unfortunately, this function needs to take an
1823 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1824 * ill_grp pointer may become stale when not inside an IPSQ and not holding
1825 * ipmp_lock.) Caller need not be inside the IPSQ.
1826 */
1827 ill_t *
1828 ipmp_ill_hold_ipmp_ill(ill_t *ill)
1829 {
1830 ip_stack_t *ipst = ill->ill_ipst;
1831 ipmp_illgrp_t *illg;
1832
1833 ASSERT(!IS_IPMP(ill));
1834
1835 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1836 illg = ill->ill_grp;
1837 if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
1838 rw_exit(&ipst->ips_ipmp_lock);
1839 return (illg->ig_ipmp_ill);
1840 }
1841 /*
1842 * Assume `ill' was removed from the illgrp in the meantime.
1843 */
1844 rw_exit(&ill->ill_ipst->ips_ipmp_lock);
1845 return (NULL);
1846 }
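
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * the IPMP ill returned above is held, so a hypothetical caller outside the
 * IPSQ would bracket its use with ill_refrele():
 *
 *     ill_t *ipmp_ill;
 *
 *     if ((ipmp_ill = ipmp_ill_hold_ipmp_ill(ill)) != NULL) {
 *             ... use ipmp_ill; it cannot be freed while held ...
 *             ill_refrele(ipmp_ill);
 *     }
 */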
1847
1848 /*
1849 * Return a held pointer to the appropriate underlying ill for sending the
1850 * specified type of packet. (Unfortunately, this function needs to take an
1851 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
1852 * ill_grp pointer may become stale when not inside an IPSQ and not holding
1853 * ipmp_lock.) Caller need not be inside the IPSQ.
1854 */
1855 ill_t *
1856 ipmp_ill_hold_xmit_ill(ill_t *ill, boolean_t is_unicast)
1857 {
1858 ill_t *xmit_ill;
1859 ip_stack_t *ipst = ill->ill_ipst;
1860
1861 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
1862 if (ill->ill_grp == NULL) {
1863 /*
1864 * The ill was taken out of the group, so just send on it.
1865 */
1866 rw_exit(&ipst->ips_ill_g_lock);
1867 ill_refhold(ill);
1868 return (ill);
1869 }
1870 if (is_unicast)
1871 xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
1872 else
1873 xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
1874 rw_exit(&ipst->ips_ill_g_lock);
1875
1876 return (xmit_ill);
1877 }
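
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * a transmit path that has resolved to underlying ill `ill' might spread
 * unicast load across the group while sending multicast/broadcast traffic on
 * the designated cast ill, releasing the hold once the packet is queued
 * (`is_mcast' is an assumed name for this example):
 *
 *     ill_t *xmit_ill;
 *
 *     if ((xmit_ill = ipmp_ill_hold_xmit_ill(ill, !is_mcast)) != NULL) {
 *             ... transmit on xmit_ill ...
 *             ill_refrele(xmit_ill);
 *     }
 *
 * Note that NULL can come back if the group currently has no usable
 * interface, so callers must be prepared to handle that case.
 */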
1878
1879 /*
1880 * Return the interface index for the IPMP ill tied to underlying interface
1881 * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ.
1882 */
1883 uint_t
1884 ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
1885 {
1886 uint_t ifindex = 0;
1887 ip_stack_t *ipst = ill->ill_ipst;
1888 ipmp_grp_t *grp;
1889
1890 ASSERT(!IS_IPMP(ill));
1891
1892 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
1893 if ((grp = ill->ill_phyint->phyint_grp) != NULL)
1894 ifindex = grp->gr_phyint->phyint_ifindex;
1895 rw_exit(&ipst->ips_ipmp_lock);
1896 return (ifindex);
1897 }
1898
1899 /*
1900 * Place phyint `phyi' into IPMP group `grp'.
1901 */
1902 void
1903 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
1904 {
1905 ill_t *ill;
1906 ipsq_t *ipsq = phyi->phyint_ipsq;
1907 ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
1908 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1909
1910 ASSERT(IAM_WRITER_IPSQ(ipsq));
1911 ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);
1912
1913 /*
1914 * Send routing socket messages indicating that the phyint's ills
1915 * and ipifs vanished.
1916 */
1917 if (phyi->phyint_illv4 != NULL) {
1918 ill = phyi->phyint_illv4;
1919 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1920 }
1921
1922 if (phyi->phyint_illv6 != NULL) {
1923 ill = phyi->phyint_illv6;
1924 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
1925 }
1926
1927 /*
1928 * Snapshot the phyint's initial kstats as a baseline.
1929 */
1930 ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);
1931
1932 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1933
1934 phyi->phyint_grp = grp;
1935 if (++grp->gr_nif == 1)
1936 grp->gr_mactype = ill->ill_mactype;
1937 else
1938 ASSERT(grp->gr_mactype == ill->ill_mactype);
1939
1940 /*
1941 * Now that we're in the group, request a switch to the group's xop
1942 * when we ipsq_exit(). All future operations will be exclusive on
1943 * the group xop until ipmp_phyint_leave_grp() is called.
1944 */
1945 ASSERT(ipsq->ipsq_swxop == NULL);
1946 ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
1947 ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;
1948
1949 rw_exit(&ipst->ips_ipmp_lock);
1950 }
1951
1952 /*
1953 * Remove phyint `phyi' from its current IPMP group.
1954 */
1955 void
1956 ipmp_phyint_leave_grp(phyint_t *phyi)
1957 {
1958 uint_t i;
1959 ipsq_t *ipsq = phyi->phyint_ipsq;
1960 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
1961 uint64_t phyi_kstats[IPMP_KSTAT_MAX];
1962
1963 ASSERT(IAM_WRITER_IPSQ(ipsq));
1964
1965 /*
1966 * If any of the phyint's ills are still in an illgrp, kick 'em out.
1967 */
1968 if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
1969 ipmp_ill_leave_illgrp(phyi->phyint_illv4);
1970 if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
1971 ipmp_ill_leave_illgrp(phyi->phyint_illv6);
1972
1973 /*
1974 * Send routing socket messages indicating that the phyint's ills
1975 * and ipifs have reappeared.
1976 */
1977 if (phyi->phyint_illv4 != NULL)
1978 ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
1979 if (phyi->phyint_illv6 != NULL)
1980 ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);
1981
1982 /*
1983 * Calculate the phyint's cumulative kstats while it was in the group,
1984 * and add that to the group's baseline.
1985 */
1986 ipmp_phyint_get_kstats(phyi, phyi_kstats);
1987 for (i = 0; i < IPMP_KSTAT_MAX; i++) {
1988 phyi_kstats[i] -= phyi->phyint_kstats0[i];
1989 atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
1990 }
1991
1992 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
1993
1994 phyi->phyint_grp->gr_nif--;
1995 phyi->phyint_grp = NULL;
1996
1997 /*
1998 * As our final act in leaving the group, request a switch back to our
1999 * IPSQ's own xop when we ipsq_exit().
2000 */
2001 ASSERT(ipsq->ipsq_swxop == NULL);
2002 ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;
2003
2004 rw_exit(&ipst->ips_ipmp_lock);
2005 }
2006
2007 /*
2008 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
2009 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
2010 */
2011 static void
2012 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
2013 {
2014 uint_t i, j;
2015 const char *name;
2016 kstat_t *ksp;
2017 kstat_named_t *kn;
2018 ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
2019 zoneid_t zoneid;
2020
2021 bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
2022 zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
2023 ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
2024 if (ksp == NULL)
2025 return;
2026
2027 KSTAT_ENTER(ksp);
2028
2029 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
2030 /*
2031 * Bring kstats up-to-date before recording.
2032 */
2033 (void) KSTAT_UPDATE(ksp, KSTAT_READ);
2034
2035 kn = KSTAT_NAMED_PTR(ksp);
2036 for (i = 0; i < IPMP_KSTAT_MAX; i++) {
2037 name = ipmp_kstats[i].name;
2038 kstats[i] = 0;
2039 for (j = 0; j < ksp->ks_ndata; j++) {
2040 if (strcmp(kn[j].name, name) != 0)
2041 continue;
2042
2043 switch (kn[j].data_type) {
2044 case KSTAT_DATA_INT32:
2045 case KSTAT_DATA_UINT32:
2046 kstats[i] = kn[j].value.ui32;
2047 break;
2048 #ifdef _LP64
2049 case KSTAT_DATA_LONG:
2050 case KSTAT_DATA_ULONG:
2051 kstats[i] = kn[j].value.ul;
2052 break;
2053 #endif
2054 case KSTAT_DATA_INT64:
2055 case KSTAT_DATA_UINT64:
2056 kstats[i] = kn[j].value.ui64;
2057 break;
2058 }
2059 break;
2060 }
2061 }
2062 }
2063
2064 KSTAT_EXIT(ksp);
2065 kstat_rele(ksp);
2066 }
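
/*
 * Editorial note (not part of the original source): together with
 * ipmp_phyint_join_grp() and ipmp_phyint_leave_grp() above, this routine
 * gives the group a simple accounting invariant, sketched here for a single
 * counter index `i' (pseudo-notation, not code from this file):
 *
 *     group_total[i] = gr_kstats0[i] +
 *         sum over each current member `m' of
 *             (ipmp_phyint_get_kstats(m)[i] - m->phyint_kstats0[i])
 *
 * That is, gr_kstats0[] accumulates the traffic of members that have since
 * left, while each current member contributes only what it has counted since
 * joining; a group-level kstat update routine would presumably combine the
 * two, though that code is outside this section.
 */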
2067
2068 /*
2069 * Refresh the active state of all ills on `phyi'.
2070 */
2071 void
2072 ipmp_phyint_refresh_active(phyint_t *phyi)
2073 {
2074 if (phyi->phyint_illv4 != NULL)
2075 ipmp_ill_refresh_active(phyi->phyint_illv4);
2076 if (phyi->phyint_illv6 != NULL)
2077 ipmp_ill_refresh_active(phyi->phyint_illv6);
2078 }
2079
2080 /*
2081 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
2082 * doesn't exist. Caller need not be inside the IPSQ.
2083 */
2084 ill_t *
2085 ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
2086 {
2087 ill_t *boundill;
2088 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;
2089
2090 ASSERT(IS_IPMP(ipif->ipif_ill));
2091
2092 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2093 boundill = ipif->ipif_bound_ill;
2094 if (boundill != NULL && ill_check_and_refhold(boundill)) {
2095 rw_exit(&ipst->ips_ipmp_lock);
2096 return (boundill);
2097 }
2098 rw_exit(&ipst->ips_ipmp_lock);
2099 return (NULL);
2100 }
2101
2102 /*
2103 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
2104 * doesn't exist. Caller must be inside the IPSQ.
2105 */
2106 ill_t *
2107 ipmp_ipif_bound_ill(const ipif_t *ipif)
2108 {
2109 ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
2110 ASSERT(IS_IPMP(ipif->ipif_ill));
2111
2112 return (ipif->ipif_bound_ill);
2113 }
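
/*
 * Editorial note (not part of the original source): the two routines above
 * are the unlocked-context and IPSQ flavors of the same lookup. Outside the
 * IPSQ the binding can change underfoot, so a hypothetical caller must use
 * the held variant and release it when done:
 *
 *     ill_t *bound_ill;
 *
 *     if ((bound_ill = ipmp_ipif_hold_bound_ill(ipif)) != NULL) {
 *             ... use bound_ill ...
 *             ill_refrele(bound_ill);
 *     }
 *
 * Inside the IPSQ the binding cannot change, so ipmp_ipif_bound_ill() can
 * safely return the raw pointer without a hold.
 */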
2114
2115 /*
2116 * Check if `ipif' is a "stub" (placeholder address not being used).
2117 */
2118 boolean_t
2119 ipmp_ipif_is_stubaddr(const ipif_t *ipif)
2120 {
2121 if (ipif->ipif_flags & IPIF_UP)
2122 return (B_FALSE);
2123 if (ipif->ipif_ill->ill_isv6)
2124 return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2125 else
2126 return (ipif->ipif_lcl_addr == INADDR_ANY);
2127 }
2128
2129 /*
2130 * Check if `ipif' is an IPMP data address.
2131 */
2132 boolean_t
2133 ipmp_ipif_is_dataaddr(const ipif_t *ipif)
2134 {
2135 if (ipif->ipif_flags & IPIF_NOFAILOVER)
2136 return (B_FALSE);
2137 if (ipif->ipif_ill->ill_isv6)
2138 return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
2139 else
2140 return (ipif->ipif_lcl_addr != INADDR_ANY);
2141 }
2142
2143 /*
2144 * Check if `ipif' is an IPIF_UP IPMP data address.
2145 */
2146 static boolean_t
2147 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
2148 {
2149 return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
2150 }
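
/*
 * Editorial example (not part of the original source), using the
 * documentation prefix 192.0.2.0/24 to show how the three predicates above
 * classify an IPv4 ipif:
 *
 *     addr == INADDR_ANY, !IPIF_UP             -> stub address
 *     addr == 192.0.2.1, !IPIF_NOFAILOVER      -> data address
 *         (also an "up" data address once IPIF_UP is set)
 *     addr == 192.0.2.2, IPIF_NOFAILOVER       -> test address, never a
 *                                                 data address
 */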
2151
2152 /*
2153 * Check if `mp' contains a probe packet by checking if the IP source address
2154 * is a test address on underlying interface `ill'. Caller need not be inside
2155 * the IPSQ.
2156 */
2157 boolean_t
2158 ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
2159 {
2160 ip6_t *ip6h = (ip6_t *)mp->b_rptr;
2161 ipha_t *ipha = (ipha_t *)mp->b_rptr;
2162
2163 ASSERT(DB_TYPE(mp) != M_CTL);
2164
2165 if (!IS_UNDER_IPMP(ill))
2166 return (B_FALSE);
2167
2168 if (ill->ill_isv6) {
2169 if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
2170 ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
2171 return (B_TRUE);
2172 } else {
2173 if (ipha->ipha_src != INADDR_ANY &&
2174 ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
2175 return (B_TRUE);
2176 }
2177 return (B_FALSE);
2178 }
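
/*
 * Illustrative sketch (editorial addition, not part of the original source):
 * the point of this check is that probe traffic built around a test address
 * must leave on the underlying interface it was generated for, whereas other
 * traffic may be spread across the group. A hypothetical output path might
 * therefore do something like the following (`xmit_ill' and `is_unicast' are
 * assumed names for this example):
 *
 *     if (IS_UNDER_IPMP(ill) && !ipmp_packet_is_probe(mp, ill))
 *             xmit_ill = ipmp_ill_hold_xmit_ill(ill, is_unicast);
 *     else
 *             ... keep transmitting on `ill' itself ...
 */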
2179
2180 /*
2181 * NCEC walker callback: delete `ncec' if it is associated with `ill_arg' and
2182 * is not one of our local addresses. Caller must be inside the IPSQ.
2183 */
2184 static void
2185 ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *ill_arg)
2186 {
2187 if (!NCE_MYADDR(ncec) && ncec->ncec_ill == (ill_t *)ill_arg)
2188 ncec_delete(ncec);
2189 }
2190
2191 /*
2192 * Delete any NCEs tied to the illgrp associated with `ncec'. Caller need not
2193 * be inside the IPSQ.
2194 */
2195 void
2196 ipmp_ncec_delete_nce(ncec_t *ncec)
2197 {
2198 ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp;
2199 ip_stack_t *ipst = ncec->ncec_ipst;
2200 ill_t *ill;
2201 nce_t *nce;
2202 list_t dead;
2203
2204 ASSERT(IS_IPMP(ncec->ncec_ill));
2205
2206 /*
2207 * For each underlying interface, delete `ncec' from its ill_nce list
2208 * via nce_fastpath_list_delete(). Defer the actual nce_refrele()
2209 * until we've dropped ill_g_lock.
2210 */
2211 list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));
2212
2213 rw_enter(&ipst->ips_ill_g_lock, RW_READER);
2214 ill = list_head(&illg->ig_if);
2215 for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
2216 nce_fastpath_list_delete(ill, ncec, &dead);
2217 rw_exit(&ipst->ips_ill_g_lock);
2218
2219 while ((nce = list_remove_head(&dead)) != NULL)
2220 nce_refrele(nce);
2221
2222 list_destroy(&dead);
2223 }
2224
2225 /*
2226 * Refresh any NCE entries tied to the illgrp associated with `ncec' to
2227 * use the information in `ncec'. Caller need not be inside the IPSQ.
2228 */
2229 void
2230 ipmp_ncec_refresh_nce(ncec_t *ncec)
2231 {
2232 ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp;
2233 ip_stack_t *ipst = ncec->ncec_ipst;
2234 ill_t *ill;
2235 nce_t *nce, *nce_next;
2236 list_t replace;
2237
2238 ASSERT(IS_IPMP(ncec->ncec_ill));
2239
2240 /*
2241 * If `ncec' is not reachable, there is no use in refreshing NCEs.
2242 */
2243 if (!NCE_ISREACHABLE(ncec))
2244 return;
2245
2246 /*
2247 * Find all the NCEs matching ncec->ncec_addr. We cannot update them
2248 * in situ because we hold ipmp_lock (to prevent changes to IPMP group
2249 * membership) and updating indirectly calls nce_fastpath_probe() ->
2250 * putnext(), which must not be done while holding locks. Thus, move
2251 * the NCEs to a separate list and process it after dropping ipmp_lock.
2252 */
2253 list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
2254 rw_enter(&ipst->ips_ipmp_lock, RW_READER);
2255 ill = list_head(&illg->ig_actif);
2256 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
2257 mutex_enter(&ill->ill_lock);
2258 nce = list_head(&ill->ill_nce);
2259 for (; nce != NULL; nce = nce_next) {
2260 nce_next = list_next(&ill->ill_nce, nce);
2261 if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
2262 &ncec->ncec_addr)) {
2263 nce_refhold(nce);
2264 nce_delete(nce);
2265 list_insert_tail(&replace, nce);
2266 }
2267 }
2268 mutex_exit(&ill->ill_lock);
2269 }
2270 rw_exit(&ipst->ips_ipmp_lock);
2271
2272 /*
2273 * Process the list; nce_lookup_then_add_v* ensures that nce->nce_ill
2274 * is still in the group for ncec->ncec_ill.
2275 */
2276 while ((nce = list_remove_head(&replace)) != NULL) {
2277 if (ncec->ncec_ill->ill_isv6) {
2278 (void) nce_lookup_then_add_v6(nce->nce_ill,
2279 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2280 &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
2281 NULL);
2282 } else {
2283 ipaddr_t ipaddr;
2284
2285 IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
2286 (void) nce_lookup_then_add_v4(nce->nce_ill,
2287 ncec->ncec_lladdr, ncec->ncec_lladdr_length,
2288 &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
2289 }
2290 nce_refrele(nce);
2291 }
2292
2293 list_destroy(&replace);
2294 }
2295