/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
#include <inet/ip_rts.h>
#include <inet/mi.h>
#include <net/if_types.h>
#include <sys/dlpi.h>
#include <sys/kmem.h>
#include <sys/modhash.h>
#include <sys/sdt.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/types.h>

/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.
 */
#define IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 */
#define IPMP_GRP_HASH_SIZE		64
#define IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */

/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
        { "obytes",	KSTAT_DATA_UINT32 },
        { "obytes64",	KSTAT_DATA_UINT64 },
        { "rbytes",	KSTAT_DATA_UINT32 },
        { "rbytes64",	KSTAT_DATA_UINT64 },
        { "opackets",	KSTAT_DATA_UINT32 },
        { "opackets64",	KSTAT_DATA_UINT64 },
        { "oerrors",	KSTAT_DATA_UINT32 },
        { "ipackets",	KSTAT_DATA_UINT32 },
        { "ipackets64",	KSTAT_DATA_UINT64 },
        { "ierrors",	KSTAT_DATA_UINT32 },
        { "multircv",	KSTAT_DATA_UINT32 },
        { "multixmt",	KSTAT_DATA_UINT32 },
        { "brdcstrcv",	KSTAT_DATA_UINT32 },
        { "brdcstxmt",	KSTAT_DATA_UINT32 },
        { "link_up",	KSTAT_DATA_UINT32 }
};

static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int ipmp_grp_create_kstats(ipmp_grp_t *);
static int ipmp_grp_update_kstats(kstat_t *, int);
static void ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void ipmp_ill_deactivate(ill_t *);
static void ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void ipmp_ill_refresh_active_timer_start(ill_t *);
static void ipmp_ill_rtsaddrmsg(ill_t *, int);
static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);
static void ipmp_ncec_delete_nonlocal(ncec_t *, uchar_t *);

/*
 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
 */
void
ipmp_init(ip_stack_t *ipst)
{
        ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
            IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
            mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
        rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
}

/*
 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
 */
void
ipmp_destroy(ip_stack_t *ipst)
{
        mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
        rw_destroy(&ipst->ips_ipmp_lock);
}

/*
 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
 * and add it to the hash. On success, return a pointer to the created group.
 * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP
 * meta-interface associated with the group also has the same name (but they
 * may differ later via ipmp_grp_rename()).
 */
ipmp_grp_t *
ipmp_grp_create(const char *grname, phyint_t *phyi)
{
        ipmp_grp_t *grp;
        ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
        mod_hash_hndl_t mh;

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
                return (NULL);

        (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
        (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));

        /*
         * Cache the group's phyint. This is safe since a phyint_t will
         * outlive its ipmp_grp_t.
         */
        grp->gr_phyint = phyi;

        /*
         * Create IPMP group kstats.
         */
        if (ipmp_grp_create_kstats(grp) != 0) {
                kmem_free(grp, sizeof (ipmp_grp_t));
                return (NULL);
        }

        /*
         * Insert the group into the hash.
         */
        if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
                ipmp_grp_destroy_kstats(grp);
                kmem_free(grp, sizeof (ipmp_grp_t));
                return (NULL);
        }
        ipmp_grp_insert(grp, mh);

        return (grp);
}

/*
 * Create IPMP kstat structures for `grp'. Return an errno upon failure.
 */
static int
ipmp_grp_create_kstats(ipmp_grp_t *grp)
{
        kstat_t *ksp;
        netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

        ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
            KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
        if (ksp == NULL)
                return (ENOMEM);

        ksp->ks_update = ipmp_grp_update_kstats;
        ksp->ks_private = grp;
        bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));

        kstat_install(ksp);
        grp->gr_ksp = ksp;
        return (0);
}

/*
 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
 */
static int
ipmp_grp_update_kstats(kstat_t *ksp, int rw)
{
        uint_t i;
        kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
        ipmp_grp_t *grp = ksp->ks_private;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
        ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
        phyint_t *phyi;
        uint64_t phyi_kstats[IPMP_KSTAT_MAX];

        if (rw == KSTAT_WRITE)
                return (EACCES);

        /*
         * Start with the group's baseline values.
         */
        for (i = 0; i < IPMP_KSTAT_MAX; i++) {
                if (kn[i].data_type == KSTAT_DATA_UINT32) {
                        kn[i].value.ui32 = grp->gr_kstats0[i];
                } else {
                        ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
                        kn[i].value.ui64 = grp->gr_kstats0[i];
                }
        }

        /*
         * Add in the stats of each phyint currently in the group. Since we
         * don't directly track the phyints in a group, we cheat by walking
         * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while
         * ill_g_lock is held.)
         */
        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        ipsq = grp_ipsq->ipsq_next;
        for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
                phyi = ipsq->ipsq_phyint;

                /*
                 * If a phyint in a group is being unplumbed, it's possible
                 * that ill_glist_delete() -> phyint_free() already freed the
                 * phyint (and set ipsq_phyint to NULL), but the unplumb
                 * operation has yet to complete (and thus ipsq_dq() has yet
                 * to remove the phyint's IPSQ from the group IPSQ's phyint
                 * list). We skip those phyints here (note that their kstats
                 * have already been added to gr_kstats0[]).
                 */
                if (phyi == NULL)
                        continue;

                ipmp_phyint_get_kstats(phyi, phyi_kstats);

                for (i = 0; i < IPMP_KSTAT_MAX; i++) {
                        phyi_kstats[i] -= phyi->phyint_kstats0[i];
                        if (kn[i].data_type == KSTAT_DATA_UINT32)
                                kn[i].value.ui32 += phyi_kstats[i];
                        else
                                kn[i].value.ui64 += phyi_kstats[i];
                }
        }

        kn[IPMP_KSTAT_LINK_UP].value.ui32 =
            (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;

        rw_exit(&ipst->ips_ill_g_lock);
        return (0);
}
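
/*
 * Editorial note (added for illustration; not from the original source):
 * the update function above reports each phyint's contribution as a delta --
 * its current counters minus the phyint_kstats0[] baseline snapshotted by
 * ipmp_phyint_join_grp(). For example, a phyint that had 500 "opackets" when
 * it joined the group and has 800 now contributes 300 to the group's
 * "opackets" kstat, on top of the group's gr_kstats0[] baseline.
 */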

/*
 * Destroy IPMP kstat structures for `grp'.
 */
static void
ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
{
        netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

        kstat_delete_netstack(grp->gr_ksp, id);
        bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
        grp->gr_ksp = NULL;
}

/*
 * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it
 * does not exist.
 */
ipmp_grp_t *
ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
{
        ipmp_grp_t *grp;

        ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

        if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
            (mod_hash_val_t *)&grp) == 0)
                return (grp);

        return (NULL);
}

/*
 * Place information about group `grp' into `lifgr'.
 */
void
ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
{
        ill_t *ill;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

        lifgr->gi_v4 = (grp->gr_v4 != NULL);
        lifgr->gi_v6 = (grp->gr_v6 != NULL);
        lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
        lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
        lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
        (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
        lifgr->gi_m4ifname[0] = '\0';
        lifgr->gi_m6ifname[0] = '\0';
        lifgr->gi_bcifname[0] = '\0';

        if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
                (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
                (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
        }

        if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
                (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
}

/*
 * Insert `grp' into the hash using the reserved hash entry `mh'.
 * Caller must ensure `grp' is not yet in the hash.
 */
static void
ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
{
        int err;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        /*
         * Since grp->gr_name will exist at least as long as `grp' is in the
         * hash, we use it directly as the key.
         */
        err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
            (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
        if (err != 0) {
                /*
                 * This should never happen since `mh' was preallocated.
                 */
                panic("cannot insert IPMP group \"%s\" (err %d)",
                    grp->gr_name, err);
        }
}

/*
 * Remove `grp' from the hash. Caller must ensure `grp' is in it.
 */
static void
ipmp_grp_remove(ipmp_grp_t *grp)
{
        int err;
        mod_hash_val_t val;
        mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
        if (err != 0 || val != grp) {
                panic("cannot remove IPMP group \"%s\" (err %d)",
                    grp->gr_name, err);
        }
}
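
/*
 * Usage sketch (added for illustration; not in the original source): callers
 * look up a group via ipmp_grp_lookup() while holding ips_ipmp_lock, e.g.:
 *
 *	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
 *	grp = ipmp_grp_lookup(grname, ipst);
 *	if (grp != NULL) {
 *		... use `grp' while the lock is held ...
 *	}
 *	rw_exit(&ipst->ips_ipmp_lock);
 */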

/*
 * Attempt to rename `grp' to new name `grname'. Return an errno if the new
 * group name already exists or is invalid, or if there isn't enough memory.
 */
int
ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
{
        mod_hash_hndl_t mh;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        if (grname[0] == '\0')
                return (EINVAL);

        if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
            (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
                return (EEXIST);

        /*
         * Before we remove the group from the hash, ensure we'll be able to
         * re-insert it by reserving space.
         */
        if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
                return (ENOMEM);

        ipmp_grp_remove(grp);
        (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
        ipmp_grp_insert(grp, mh);

        return (0);
}

/*
 * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in
 * the hash, and that there are no interfaces on it.
 */
void
ipmp_grp_destroy(ipmp_grp_t *grp)
{
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        /*
         * If there are still interfaces using this group, panic before things
         * go really off the rails.
         */
        if (grp->gr_nif != 0)
                panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);

        ipmp_grp_remove(grp);
        ipmp_grp_destroy_kstats(grp);

        ASSERT(grp->gr_v4 == NULL);
        ASSERT(grp->gr_v6 == NULL);
        ASSERT(grp->gr_nv4 == 0);
        ASSERT(grp->gr_nv6 == 0);
        ASSERT(grp->gr_nactif == 0);
        ASSERT(grp->gr_linkdownmp == NULL);
        grp->gr_phyint = NULL;

        kmem_free(grp, sizeof (ipmp_grp_t));
}

/*
 * Check whether `ill' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any). NOTE: many of these errno values
 * are interpreted by ifconfig, which will take corrective action and retry
 * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
 */
static int
ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
{
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

        /*
         * To sidestep complicated address migration logic in the kernel and
         * to force the kernel's all-hosts multicast memberships to be blown
         * away, all addresses that had been brought up must be brought back
         * down prior to adding an interface to a group. (This includes
         * addresses currently down due to DAD.) Once the interface has been
         * added to the group, its addresses can then be brought back up, at
         * which point they will be moved to the IPMP meta-interface.
         * NOTE: we do this before ill_appaddr_cnt() since bringing down the
         * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
         */
        if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
                return (EADDRINUSE);

        /*
         * To avoid confusing applications by changing addresses that are
         * under their control, all such control must be removed prior to
         * adding an interface into a group.
         */
        if (ill_appaddr_cnt(ill) != 0)
                return (EADDRNOTAVAIL);

        /*
         * Since PTP addresses do not share the same broadcast domain, they
         * are not allowed to be in an IPMP group.
         */
        if (ill_ptpaddr_cnt(ill) != 0)
                return (EINVAL);

        /*
         * An ill must support multicast to be allowed into a group.
         */
        if (!(ill->ill_flags & ILLF_MULTICAST))
                return (ENOTSUP);

        /*
         * An ill must strictly be using ARP and/or ND for address
         * resolution for it to be allowed into a group.
         */
        if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
                return (ENOTSUP);

        /*
         * An ill cannot also be using usesrc groups. (Although usesrc uses
         * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
         * all its modifications as writer.)
         */
        if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
                return (ENOTSUP);

        /*
         * All ills in a group must be the same mactype.
         */
        if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
                return (EINVAL);

        return (0);
}

/*
 * Check whether `phyi' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any). See comment above ipmp_grp_vet_ill()
 * regarding errno values.
 */
int
ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
{
        int err = 0;
        ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

        ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
        ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

        /*
         * An interface cannot have address families plumbed that are not
         * configured in the group.
         */
        if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
            phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
                return (EAFNOSUPPORT);

        if (phyi->phyint_illv4 != NULL)
                err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
        if (err == 0 && phyi->phyint_illv6 != NULL)
                err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);

        return (err);
}

/*
 * Create a new illgrp on IPMP meta-interface `ill'.
 */
ipmp_illgrp_t *
ipmp_illgrp_create(ill_t *ill)
{
        uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
        ipmp_illgrp_t *illg;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(IS_IPMP(ill));
        ASSERT(ill->ill_grp == NULL);

        if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
                return (NULL);

        list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
        list_create(&illg->ig_actif, sizeof (ill_t),
            offsetof(ill_t, ill_actnode));
        list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
            offsetof(ipmp_arpent_t, ia_node));

        illg->ig_ipmp_ill = ill;
        ill->ill_grp = illg;
        ipmp_illgrp_set_mtu(illg, mtu, mtu);

        return (illg);
}

/*
 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
 */
void
ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
{
        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
        ASSERT(IS_IPMP(illg->ig_ipmp_ill));

        /*
         * Verify `illg' is empty.
         */
        ASSERT(illg->ig_next_ill == NULL);
        ASSERT(illg->ig_cast_ill == NULL);
        ASSERT(list_is_empty(&illg->ig_arpent));
        ASSERT(list_is_empty(&illg->ig_if));
        ASSERT(list_is_empty(&illg->ig_actif));
        ASSERT(illg->ig_nactif == 0);

        /*
         * Destroy `illg'.
         */
        illg->ig_ipmp_ill->ill_grp = NULL;
        illg->ig_ipmp_ill = NULL;
        list_destroy(&illg->ig_if);
        list_destroy(&illg->ig_actif);
        list_destroy(&illg->ig_arpent);
        kmem_free(illg, sizeof (ipmp_illgrp_t));
}

/*
 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
 * bind it to an underlying ill, while keeping an even address distribution.
 * If the bind is successful, return a pointer to the bound ill.
 */
ill_t *
ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
        ill_t *minill;
        ipmp_arpent_t *entp;

        ASSERT(IAM_WRITER_IPIF(ipif));
        ASSERT(ipmp_ipif_is_dataaddr(ipif));

        /*
         * IPMP data address mappings are internally managed by IP itself, so
         * delete any existing ARP entries associated with the address.
         */
        if (!ipif->ipif_isv6) {
                entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
                if (entp != NULL)
                        ipmp_illgrp_destroy_arpent(illg, entp);
        }

        if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
                ipmp_ill_bind_ipif(minill, ipif, Res_act_none);

        return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
}

/*
 * Delete `ipif' from the pool of usable data addresses on `illg'. If it's
 * bound, unbind it from the underlying ill while keeping an even address
 * distribution.
 */
void
ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
        ill_t *maxill, *boundill = ipif->ipif_bound_ill;

        ASSERT(IAM_WRITER_IPIF(ipif));

        if (boundill != NULL) {
                (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);

                maxill = ipmp_illgrp_max_ill(illg);
                if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
                        ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
                        ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
                }
        }
}

/*
 * Return the active ill with the greatest number of data addresses in `illg'.
 */
static ill_t *
ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
{
        ill_t *ill, *bestill = NULL;

        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        ill = list_head(&illg->ig_actif);
        for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
                if (bestill == NULL ||
                    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
                        bestill = ill;
                }
        }
        return (bestill);
}

/*
 * Return the active ill with the fewest number of data addresses in `illg'.
 */
static ill_t *
ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
{
        ill_t *ill, *bestill = NULL;

        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        ill = list_head(&illg->ig_actif);
        for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
                if (bestill == NULL ||
                    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
                        if (ill->ill_bound_cnt == 0)
                                return (ill);	/* can't get better */
                        bestill = ill;
                }
        }
        return (bestill);
}

/*
 * Return a pointer to IPMP meta-interface for `illg' (which must exist).
 * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
 */
ill_t *
ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
{
        return (illg->ig_ipmp_ill);
}

/*
 * Return a pointer to the next available underlying ill in `illg', or NULL if
 * one doesn't exist. Caller must be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
{
        ill_t *ill;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        if ((ill = illg->ig_next_ill) != NULL) {
                illg->ig_next_ill = list_next(&illg->ig_actif, ill);
                if (illg->ig_next_ill == NULL)
                        illg->ig_next_ill = list_head(&illg->ig_actif);
        }
        rw_exit(&ipst->ips_ipmp_lock);

        return (ill);
}

/*
 * Return a held pointer to the next available underlying ill in `illg', or
 * NULL if one doesn't exist. Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
{
        ill_t *ill;
        uint_t i;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        for (i = 0; i < illg->ig_nactif; i++) {
                ill = illg->ig_next_ill;
                illg->ig_next_ill = list_next(&illg->ig_actif, ill);
                if (illg->ig_next_ill == NULL)
                        illg->ig_next_ill = list_head(&illg->ig_actif);

                if (ill_check_and_refhold(ill)) {
                        rw_exit(&ipst->ips_ipmp_lock);
                        return (ill);
                }
        }
        rw_exit(&ipst->ips_ipmp_lock);

        return (NULL);
}

/*
 * Return a held pointer to the nominated multicast ill in `illg', or NULL if
 * one doesn't exist. Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
{
        ill_t *castill;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        rw_enter(&ipst->ips_ipmp_lock, RW_READER);
        castill = illg->ig_cast_ill;
        if (castill != NULL && ill_check_and_refhold(castill)) {
                rw_exit(&ipst->ips_ipmp_lock);
                return (castill);
        }
        rw_exit(&ipst->ips_ipmp_lock);
        return (NULL);
}

/*
 * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL,
 * any existing nomination is removed. Caller must be inside the IPSQ.
 */
static void
ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
{
        ill_t *ocastill = illg->ig_cast_ill;
        ill_t *ipmp_ill = illg->ig_ipmp_ill;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IAM_WRITER_ILL(ipmp_ill));

        /*
         * Disable old nominated ill (if any).
         */
        if (ocastill != NULL) {
                DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
                    illg, ill_t *, ocastill);
                ASSERT(ocastill->ill_nom_cast);
                ocastill->ill_nom_cast = B_FALSE;
                /*
                 * If the IPMP meta-interface is down, we never did the join,
                 * so we must not try to leave.
                 */
                if (ipmp_ill->ill_dl_up)
                        ill_leave_multicast(ipmp_ill);

                /*
                 * Delete any NCEs tied to the old nomination. We must do this
                 * last since ill_leave_multicast() may trigger IREs to be
                 * built using ig_cast_ill.
                 */
                ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
                    ocastill->ill_ipst);
        }

        /*
         * Set new nomination.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        illg->ig_cast_ill = castill;
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * Enable new nominated ill (if any).
         */
        if (castill != NULL) {
                DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
                    illg, ill_t *, castill);
                ASSERT(!castill->ill_nom_cast);
                castill->ill_nom_cast = B_TRUE;
                /*
                 * If the IPMP meta-interface is down, the attempt to recover
                 * will silently fail but ill_need_recover_multicast will be
                 * erroneously cleared -- so check first.
                 */
                if (ipmp_ill->ill_dl_up)
                        ill_recover_multicast(ipmp_ill);
        }
}
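
/*
 * Usage sketch (added for illustration; not in the original source): callers
 * that are not inside the IPSQ use the _hold_ variants above and drop the
 * reference with ill_refrele() when done, e.g.:
 *
 *	if ((ill = ipmp_illgrp_hold_next_ill(illg)) != NULL) {
 *		... transmit on `ill' ...
 *		ill_refrele(ill);
 *	}
 */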

/*
 * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an
 * entry for the same IP address already exists, destroy it first. Return the
 * created IPMP ARP entry, or NULL on failure.
 */
ipmp_arpent_t *
ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
    ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
{
        ipmp_arpent_t *entp, *oentp;

        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
            KM_NOSLEEP)) == NULL)
                return (NULL);

        /*
         * Delete any existing ARP entry for this address. (Look up `ipaddr'
         * directly; `entp' has not been initialized yet.)
         */
        if ((oentp = ipmp_illgrp_lookup_arpent(illg, &ipaddr)) != NULL)
                ipmp_illgrp_destroy_arpent(illg, oentp);

        /*
         * Prepend the new entry.
         */
        entp->ia_ipaddr = ipaddr;
        entp->ia_flags = flags;
        entp->ia_lladdr_len = lladdr_len;
        entp->ia_lladdr = (uchar_t *)&entp[1];
        bcopy(lladdr, entp->ia_lladdr, lladdr_len);
        entp->ia_proxyarp = proxyarp;
        entp->ia_notified = B_TRUE;
        list_insert_head(&illg->ig_arpent, entp);
        return (entp);
}

/*
 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
 */
void
ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        list_remove(&illg->ig_arpent, entp);
        kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
}

/*
 * Mark that ARP has been notified about the IP address on `entp'; `illg' is
 * taken as a debugging aid for DTrace FBT probes.
 */
/* ARGSUSED */
void
ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
        entp->ia_notified = B_TRUE;
}

/*
 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
 * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist.
 */
ipmp_arpent_t *
ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
{
        ipmp_arpent_t *entp = list_head(&illg->ig_arpent);

        ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

        if (addrp == NULL)
                return (entp);

        for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
                if (entp->ia_ipaddr == *addrp)
                        break;
        return (entp);
}

/*
 * Refresh ARP entries on `illg' to be distributed across its active
 * interfaces. Entries that cannot be refreshed (e.g., because there are no
 * active interfaces) are marked so that subsequent calls can try again.
 */
void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
        ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
        uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
        ipmp_arpent_t *entp;
        ncec_t *ncec;
        nce_t *nce;

        ASSERT(IAM_WRITER_ILL(ipmp_ill));
        ASSERT(!ipmp_ill->ill_isv6);

        ill = list_head(&illg->ig_actif);
        entp = list_head(&illg->ig_arpent);
        for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
                if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
                        entp->ia_notified = B_FALSE;
                        continue;
                }

                ASSERT(paddrlen == ill->ill_phys_addr_length);

                /*
                 * If this is a proxy ARP entry, we can skip notifying ARP if
                 * the entry is already up-to-date. If it has changed, we
                 * update the entry's hardware address before notifying ARP.
                 */
                if (entp->ia_proxyarp) {
                        if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
                            paddrlen) == 0 && entp->ia_notified)
                                continue;
                        bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
                }

                (void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
                    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
                    &nce);
                if (nce == NULL || !entp->ia_proxyarp) {
                        if (nce != NULL)
                                nce_refrele(nce);
                        continue;
                }
                ncec = nce->nce_common;
                mutex_enter(&ncec->ncec_lock);
                nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
                mutex_exit(&ncec->ncec_lock);
                nce_refrele(nce);
                ipmp_illgrp_mark_arpent(illg, entp);

                if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
                        ill = list_head(&illg->ig_actif);
        }
}

/*
 * Return an interface in `illg' with the specified `physaddr', or NULL if one
 * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ.
 */
ill_t *
ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
{
        ill_t *ill;
        ill_t *ipmp_ill = illg->ig_ipmp_ill;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));

        ill = list_head(&illg->ig_if);
        for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
                if (ill->ill_phys_addr_length == paddrlen &&
                    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
                        return (ill);
        }
        return (NULL);
}

/*
 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
 * Caller must be inside the IPSQ unless this is initialization.
 */
static void
ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu, uint_t mc_mtu)
{
        ill_t *ill = illg->ig_ipmp_ill;
        mblk_t *mp;

        ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));

        /*
         * If allocation fails, we have bigger problems than MTU.
         */
        if ((mp = ip_dlnotify_alloc2(DL_NOTE_SDU_SIZE2, mtu, mc_mtu)) != NULL) {
                illg->ig_mtu = mtu;
                illg->ig_mc_mtu = mc_mtu;
                put(ill->ill_rq, mp);
        }
}

/*
 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
 * ill MTU if necessary.
 */
void
ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
{
        ill_t *ill;
        ill_t *ipmp_ill = illg->ig_ipmp_ill;
        uint_t mtu = 0;
        uint_t mc_mtu = 0;

        ASSERT(IAM_WRITER_ILL(ipmp_ill));

        /*
         * Since ill_mtu can only change under ill_lock, we hold ill_lock
         * for each ill as we iterate through the list. Any changes to the
         * ill_mtu will also trigger an update, so even if we missed it
         * this time around, the update will catch it.
         */
        ill = list_head(&illg->ig_if);
        for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
                mutex_enter(&ill->ill_lock);
                if (mtu == 0 || ill->ill_mtu < mtu)
                        mtu = ill->ill_mtu;
                if (mc_mtu == 0 || ill->ill_mc_mtu < mc_mtu)
                        mc_mtu = ill->ill_mc_mtu;
                mutex_exit(&ill->ill_lock);
        }

        /*
         * MTU must be at least the minimum MTU.
         */
        mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
        mc_mtu = MAX(mc_mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);
        if (illg->ig_mtu != mtu || illg->ig_mc_mtu != mc_mtu)
                ipmp_illgrp_set_mtu(illg, mtu, mc_mtu);
}
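
/*
 * Editorial note (added for illustration; not from the original source): the
 * group MTU computed above is the minimum across all underlying interfaces,
 * clamped to at least IP_MIN_MTU/IPV6_MIN_MTU. For example, a group whose
 * members have MTUs of 9000 and 1500 is assigned a group MTU of 1500.
 */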

/*
 * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently
 * allow the same link to be established more than once.
 */
void
ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
{
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        if (illg->ig_ipmp_ill->ill_isv6) {
                ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
                grp->gr_v6 = illg;
        } else {
                ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
                grp->gr_v4 = illg;
        }
}

/*
 * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp
 * cannot be unlinked (e.g., because there are still interfaces using it).
 */
int
ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
{
        ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

        if (illg->ig_ipmp_ill->ill_isv6) {
                if (grp->gr_nv6 + grp->gr_pendv6 != 0)
                        return (EBUSY);
                grp->gr_v6 = NULL;
        } else {
                if (grp->gr_nv4 + grp->gr_pendv4 != 0)
                        return (EBUSY);
                grp->gr_v4 = NULL;
        }
        return (0);
}

/*
 * Place `ill' into `illg', and rebalance the data addresses on `illg'
 * to be spread evenly across the ills now in it. Also, adjust the IPMP
 * ill as necessary to account for `ill' (e.g., MTU).
 */
void
ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
{
        ill_t *ipmp_ill;
        ipif_t *ipif;
        ip_stack_t *ipst = ill->ill_ipst;

        /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
        ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(ill->ill_grp == NULL);

        ipmp_ill = illg->ig_ipmp_ill;

        /*
         * Account for `ill' joining the illgrp.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        if (ill->ill_isv6)
                ill->ill_phyint->phyint_grp->gr_nv6++;
        else
                ill->ill_phyint->phyint_grp->gr_nv4++;
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * Ensure the ILLF_ROUTER flag remains consistent across the group.
         */
        mutex_enter(&ill->ill_lock);
        if (ipmp_ill->ill_flags & ILLF_ROUTER)
                ill->ill_flags |= ILLF_ROUTER;
        else
                ill->ill_flags &= ~ILLF_ROUTER;
        mutex_exit(&ill->ill_lock);

        /*
         * Blow away all multicast memberships that currently exist on `ill'.
         * This may seem odd, but it's consistent with the application view
         * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
         * The ill_grp_pending bit prevents multicast group joins after
         * update_conn_ill() and before ill_grp assignment.
         */
        mutex_enter(&ill->ill_mcast_serializer);
        ill->ill_grp_pending = 1;
        mutex_exit(&ill->ill_mcast_serializer);
        update_conn_ill(ill, ill->ill_ipst);
        if (ill->ill_isv6) {
                reset_mrt_ill(ill);
        } else {
                ipif = ill->ill_ipif;
                for (; ipif != NULL; ipif = ipif->ipif_next) {
                        reset_mrt_vif_ipif(ipif);
                }
        }
        ip_purge_allmulti(ill);

        /*
         * Borrow the first ill's ill_phys_addr_length value for the illgrp's
         * physical address length. All other ills must have the same value,
         * since they are required to all be the same mactype. Also update
         * the IPMP ill's MTU and CoS marking, if necessary.
         */
        if (list_is_empty(&illg->ig_if)) {
                ASSERT(ipmp_ill->ill_phys_addr_length == 0);
                /*
                 * NOTE: we leave ill_phys_addr NULL since the IPMP group
                 * doesn't have a physical address. This means that code must
                 * not assume that ill_phys_addr is non-NULL just because
                 * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla.
                 */
                ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
                ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
                ipmp_ill->ill_type = ill->ill_type;

                if (ill->ill_flags & ILLF_COS_ENABLED) {
                        mutex_enter(&ipmp_ill->ill_lock);
                        ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
                        mutex_exit(&ipmp_ill->ill_lock);
                }
                ipmp_illgrp_set_mtu(illg, ill->ill_mtu, ill->ill_mc_mtu);
        } else {
                ASSERT(ipmp_ill->ill_phys_addr_length ==
                    ill->ill_phys_addr_length);
                ASSERT(ipmp_ill->ill_type == ill->ill_type);

                if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
                        mutex_enter(&ipmp_ill->ill_lock);
                        ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
                        mutex_exit(&ipmp_ill->ill_lock);
                }
                if (illg->ig_mtu > ill->ill_mtu ||
                    illg->ig_mc_mtu > ill->ill_mc_mtu) {
                        ipmp_illgrp_set_mtu(illg, ill->ill_mtu,
                            ill->ill_mc_mtu);
                }
        }

        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
        list_insert_tail(&illg->ig_if, ill);
        ill->ill_grp = illg;
        rw_exit(&ipst->ips_ill_g_lock);

        mutex_enter(&ill->ill_mcast_serializer);
        ill->ill_grp_pending = 0;
        mutex_exit(&ill->ill_mcast_serializer);

        /*
         * Hide the IREs on `ill' so that we don't accidentally find them when
         * sending data traffic.
         */
        ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

        ipmp_ill_refresh_active(ill);
}

/*
 * Remove `ill' from its illgrp, and rebalance the data addresses in that
 * illgrp to be spread evenly across the remaining ills. Also, adjust the
 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
        ill_t *ipmp_ill;
        ipif_t *ipif;
        ipmp_arpent_t *entp;
        ipmp_illgrp_t *illg = ill->ill_grp;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IS_UNDER_IPMP(ill));
        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(illg != NULL);

        ipmp_ill = illg->ig_ipmp_ill;

        /*
         * Cancel IPMP-specific ill timeouts.
         */
        (void) untimeout(ill->ill_refresh_tid);

        /*
         * Expose any previously-hidden IREs on `ill'.
         */
        ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

        /*
         * Ensure the multicast state for each ipif on `ill' is down so that
         * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
         * all eligible groups.
         */
        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
                if (ipif->ipif_flags & IPIF_UP)
                        ipif_multicast_down(ipif);

        /*
         * Account for `ill' leaving the illgrp.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        if (ill->ill_isv6)
                ill->ill_phyint->phyint_grp->gr_nv6--;
        else
                ill->ill_phyint->phyint_grp->gr_nv4--;
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * Pull `ill' out of the interface lists.
         */
        if (list_link_active(&ill->ill_actnode))
                ipmp_ill_deactivate(ill);
        rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
        list_remove(&illg->ig_if, ill);
        ill->ill_grp = NULL;
        rw_exit(&ipst->ips_ill_g_lock);

        /*
         * Re-establish multicast memberships that were previously being
         * handled by the IPMP meta-interface.
         */
        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
                if (ipif->ipif_flags & IPIF_UP)
                        ipif_multicast_up(ipif);

        /*
         * Refresh the group MTU based on the new interface list.
         */
        ipmp_illgrp_refresh_mtu(illg);

        if (list_is_empty(&illg->ig_if)) {
                /*
                 * No ills left in the illgrp; we no longer have a physical
                 * address length, nor can we support ARP, CoS, or anything
                 * else that depends on knowing the link layer type.
                 */
                while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
                        ipmp_illgrp_destroy_arpent(illg, entp);

                ipmp_ill->ill_phys_addr_length = 0;
                ipmp_ill->ill_nd_lla_len = 0;
                ipmp_ill->ill_type = IFT_OTHER;
                mutex_enter(&ipmp_ill->ill_lock);
                ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
                mutex_exit(&ipmp_ill->ill_lock);
        } else {
                /*
                 * If `ill' didn't support CoS, see if it can now be enabled.
                 */
                if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
                        ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

                        ill = list_head(&illg->ig_if);
                        do {
                                if (!(ill->ill_flags & ILLF_COS_ENABLED))
                                        break;
                        } while ((ill = list_next(&illg->ig_if, ill)) != NULL);

                        if (ill == NULL) {
                                mutex_enter(&ipmp_ill->ill_lock);
                                ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
                                mutex_exit(&ipmp_ill->ill_lock);
                        }
                }
        }
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * Return B_FALSE if a refresh was necessary but could not be performed.
 */
static boolean_t
ipmp_ill_try_refresh_active(ill_t *ill)
{
        boolean_t refreshed = B_TRUE;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(IS_UNDER_IPMP(ill));

        if (ipmp_ill_is_active(ill)) {
                if (!list_link_active(&ill->ill_actnode))
                        refreshed = ipmp_ill_activate(ill);
        } else {
                if (list_link_active(&ill->ill_actnode))
                        ipmp_ill_deactivate(ill);
        }

        return (refreshed);
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * If the refresh fails, schedule a timer to try again later.
 */
void
ipmp_ill_refresh_active(ill_t *ill)
{
        if (!ipmp_ill_try_refresh_active(ill))
                ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
 */
static void
ipmp_ill_refresh_active_timer(void *ill_arg)
{
        ill_t *ill = ill_arg;
        boolean_t refreshed = B_FALSE;

        /*
         * Clear ill_refresh_tid to indicate that no timeout is pending
         * (another thread could schedule a new timeout while we're still
         * running, but that's harmless). If the ill is going away, bail.
         */
        mutex_enter(&ill->ill_lock);
        ill->ill_refresh_tid = 0;
        if (ill->ill_state_flags & ILL_CONDEMNED) {
                mutex_exit(&ill->ill_lock);
                return;
        }
        mutex_exit(&ill->ill_lock);

        if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
                refreshed = ipmp_ill_try_refresh_active(ill);
                ipsq_exit(ill->ill_phyint->phyint_ipsq);
        }

        /*
         * If the refresh failed, schedule another attempt.
         */
        if (!refreshed)
                ipmp_ill_refresh_active_timer_start(ill);
}
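
/*
 * Editorial note (added for illustration; not from the original source): a
 * refresh attempt can fail transiently -- either ipmp_ill_activate() could
 * not allocate its link-up/link-down messages, or the IPSQ could not be
 * entered without blocking -- so the timer above simply retries every
 * IPMP_ILL_REFRESH_TIMEOUT (120) seconds until the refresh succeeds or the
 * ill is condemned.
 */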

/*
 * Schedule a retry of ipmp_ill_try_refresh_active() on `ill' after
 * IPMP_ILL_REFRESH_TIMEOUT seconds.
 */
static void
ipmp_ill_refresh_active_timer_start(ill_t *ill)
{
        mutex_enter(&ill->ill_lock);

        /*
         * If the ill is going away or a refresh is already scheduled, bail.
         */
        if (ill->ill_refresh_tid != 0 ||
            (ill->ill_state_flags & ILL_CONDEMNED)) {
                mutex_exit(&ill->ill_lock);
                return;
        }

        ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
            SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));

        mutex_exit(&ill->ill_lock);
}

/*
 * Activate `ill' so it will be used to send and receive data traffic. Return
 * B_FALSE if `ill' cannot be activated. Note that we allocate any messages
 * needed to deactivate `ill' here as well so that deactivation cannot fail.
 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
        ipif_t *ipif;
        mblk_t *linkupmp = NULL, *linkdownmp = NULL;
        ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
        ipmp_illgrp_t *illg = ill->ill_grp;
        ill_t *maxill;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(IS_UNDER_IPMP(ill));

        /*
         * If this will be the first active interface in the group, allocate
         * the link-up and link-down messages.
         */
        if (grp->gr_nactif == 0) {
                linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
                linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
                if (linkupmp == NULL || linkdownmp == NULL)
                        goto fail;
        }

        if (list_is_empty(&illg->ig_actif)) {
                /*
                 * Now that we have an active ill, nominate it for multicast
                 * and broadcast duties. Do this before ipmp_ill_bind_ipif()
                 * since that may need to send multicast packets (e.g., IPv6
                 * neighbor discovery probes).
                 */
                ipmp_illgrp_set_cast(illg, ill);

                /*
                 * This is the first active ill in the illgrp -- add 'em all.
                 * We can access/walk ig_ipmp_ill's ipif list since we're
                 * writer on its IPSQ as well.
                 */
                ipif = illg->ig_ipmp_ill->ill_ipif;
                for (; ipif != NULL; ipif = ipif->ipif_next)
                        if (ipmp_ipif_is_up_dataaddr(ipif))
                                ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
        } else {
                /*
                 * Redistribute the addresses by moving them from the ill with
                 * the most addresses until the ill being activated is at the
                 * same level as the rest of the ills.
                 */
                for (;;) {
                        maxill = ipmp_illgrp_max_ill(illg);
                        ASSERT(maxill != NULL);
                        if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
                                break;
                        ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
                        ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
                }
        }

        /*
         * Put the interface in the active list.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        list_insert_tail(&illg->ig_actif, ill);
        illg->ig_nactif++;
        illg->ig_next_ill = ill;
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * Refresh static/proxy ARP entries to use `ill', if need be.
         */
        if (!ill->ill_isv6)
                ipmp_illgrp_refresh_arpent(illg);

        /*
         * Finally, mark the group link up, if necessary.
         */
        if (grp->gr_nactif++ == 0) {
                ASSERT(grp->gr_linkdownmp == NULL);
                grp->gr_linkdownmp = linkdownmp;
                put(illg->ig_ipmp_ill->ill_rq, linkupmp);
        }
        return (B_TRUE);
fail:
        freemsg(linkupmp);
        freemsg(linkdownmp);
        return (B_FALSE);
}
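
/*
 * Editorial note (added for illustration; not from the original source): the
 * rebalancing loop above moves one data address at a time from the
 * most-loaded active ill. For example, if the active ills have 3, 3, and
 * (the newly activated) 0 bound addresses, the loop rebinds addresses until
 * the counts are 2, 2, and 2.
 */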

/*
 * Deactivate `ill' so it will not be used to send or receive data traffic.
 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
        ill_t *minill, *ipmp_ill;
        ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
        mblk_t *mp;
        ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
        ipmp_illgrp_t *illg = ill->ill_grp;
        ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(IS_UNDER_IPMP(ill));

        ipmp_ill = illg->ig_ipmp_ill;

        /*
         * Pull the interface out of the active list.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        list_remove(&illg->ig_actif, ill);
        illg->ig_nactif--;
        illg->ig_next_ill = list_head(&illg->ig_actif);
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * If the ill that's being deactivated had been nominated for
         * multicast/broadcast, nominate a new one.
         */
        if (ill == illg->ig_cast_ill)
                ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

        /*
         * Delete all nce_t entries using this ill, so that the next attempt
         * to send data traffic will revalidate cached nce's.
         */
        nce_flush(ill, B_TRUE);

        /*
         * Unbind all of the ipifs bound to this ill, and save 'em in a list;
         * we'll rebind them after we tell the resolver the ill is no longer
         * active. We must do things in this order or the resolver could
         * accidentally rebind to the ill we're trying to remove if multiple
         * ills in the group have the same hardware address (which is
         * unsupported, but shouldn't lead to a wedged machine).
         */
        while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
                ipif->ipif_bound_next = ubheadipif;
                ubheadipif = ipif;
        }

        if (!ill->ill_isv6) {
                /*
                 * Refresh static/proxy ARP entries that had been using `ill'.
                 */
                ipmp_illgrp_refresh_arpent(illg);
        }

        /*
         * Rebind each ipif from the deactivated ill to the active ill with
         * the fewest ipifs. If there are no active ills, the ipifs will
         * remain unbound.
         */
        for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
                ubnextipif = ipif->ipif_bound_next;
                ipif->ipif_bound_next = NULL;

                if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
                        ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
        }

        /*
         * Remove any IRE_IF_CLONEs for this ill since they might have an
         * ire_nce_cache/nce_common which refers to another ill in the group.
         */
        ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone, ill,
            ill);

        /*
         * Finally, if there are no longer any active interfaces, then delete
         * any NCECs associated with the group and mark the group link down.
         */
        if (--grp->gr_nactif == 0) {
                ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill, ipmp_ill, ipst);
                mp = grp->gr_linkdownmp;
                grp->gr_linkdownmp = NULL;
                ASSERT(mp != NULL);
                put(ipmp_ill->ill_rq, mp);
        }
}

/*
 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
 */
static void
ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
{
        ipif_t *ipif;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);

        /*
         * If `ill' is truly down, there are no messages to generate since:
         *
         * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
         *    and its addresses by bringing them down. But that's already
         *    true, so there's nothing to hide.
         *
         * 2. If cmd == RTM_ADD, then we're supposed to generate messages
         *    indicating that any previously-hidden up addresses are again
         *    back up (along with the interface). But they aren't, so
         *    there's nothing to expose.
         */
        if (ill->ill_ipif_up_count == 0)
                return;

        if (cmd == RTM_ADD)
                ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);

        for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
                if (ipif->ipif_flags & IPIF_UP)
                        ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);

        if (cmd == RTM_DELETE)
                ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
}

/*
 * Bind the address named by `ipif' to the underlying ill named by `ill'.
 * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act'
 * will indicate to the resolver whether this is an initial bringup of
 * `ipif', or just a rebind to another ill.
 */
static void
ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
{
        int err = 0;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
        ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
        ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
        ASSERT(ipif->ipif_bound_ill == NULL);
        ASSERT(ipif->ipif_bound_next == NULL);

        ipif->ipif_bound_next = ill->ill_bound_ipif;
        ill->ill_bound_ipif = ipif;
        ill->ill_bound_cnt++;
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        ipif->ipif_bound_ill = ill;
        rw_exit(&ipst->ips_ipmp_lock);

        /*
         * If necessary, tell ARP/NDP about the new mapping. Note that
         * ipif_resolver_up() cannot fail for IPv6 ills.
         */
        if (act != Res_act_none) {
                if (ill->ill_isv6) {
                        VERIFY(ipif_resolver_up(ipif, act) == 0);
                        err = ipif_ndp_up(ipif, act == Res_act_initial);
                } else {
                        err = ipif_resolver_up(ipif, act);
                }

                /*
                 * Since ipif_ndp_up() never returns EINPROGRESS and
                 * ipif_resolver_up() only returns EINPROGRESS when the
                 * associated ill is not up, we should never be here with
                 * EINPROGRESS. We rely on this to simplify the design.
                 */
                ASSERT(err != EINPROGRESS);
        }
        /* TODO: retry binding on failure? when? */
        ipif->ipif_bound = (err == 0);
}

/*
 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
 * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is
 * B_TRUE, notify the resolver about the change.
 */
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
        ipif_t *previpif;
        ip_stack_t *ipst = ill->ill_ipst;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(IS_UNDER_IPMP(ill));

        /*
         * If necessary, find an ipif to unbind.
         */
        if (ipif == NULL) {
                if ((ipif = ill->ill_bound_ipif) == NULL) {
                        ASSERT(ill->ill_bound_cnt == 0);
                        return (NULL);
                }
        }

        ASSERT(IAM_WRITER_IPIF(ipif));
        ASSERT(IS_IPMP(ipif->ipif_ill));
        ASSERT(ipif->ipif_bound_ill == ill);
        ASSERT(ill->ill_bound_cnt > 0);

        /*
         * Unbind it.
         */
        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
        ipif->ipif_bound_ill = NULL;
        rw_exit(&ipst->ips_ipmp_lock);
        ill->ill_bound_cnt--;

        if (ill->ill_bound_ipif == ipif) {
                ill->ill_bound_ipif = ipif->ipif_bound_next;
        } else {
                previpif = ill->ill_bound_ipif;
                while (previpif->ipif_bound_next != ipif)
                        previpif = previpif->ipif_bound_next;

                previpif->ipif_bound_next = ipif->ipif_bound_next;
        }
        ipif->ipif_bound_next = NULL;

        /*
         * If requested, notify the resolvers (provided we're bound).
         */
        if (notifyres && ipif->ipif_bound) {
                if (ill->ill_isv6)
                        ipif_ndp_down(ipif);
                else
                        (void) ipif_arp_down(ipif);
        }
        ipif->ipif_bound = B_FALSE;

        return (ipif);
}

/*
 * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if
 * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this
 * to determine whether an ill should be considered active, other consumers
 * may race and learn about an ill that should be deactivated/activated before
 * IPMP has performed the activation/deactivation. This should be safe though
 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
 * would've been cleaned up by ipmp_ill_deactivate().
 */
boolean_t
ipmp_ill_is_active(ill_t *ill)
{
        phyint_t *phyi = ill->ill_phyint;

        ASSERT(IS_UNDER_IPMP(ill));
        ASSERT(IAM_WRITER_ILL(ill) ||
            (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));

        /*
         * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
         * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the
         * link flapping logic to be just in in.mpathd and allows us to ignore
         * changes to PHYI_RUNNING.
         */
        return (!(ill->ill_ipif_up_count == 0 ||
            (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
}

/*
 * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
 * with `ill_arg'.
 */
static void
ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
{
        ill_t *ill = (ill_t *)ill_arg;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(!IS_IPMP(ill));

        if (ire->ire_ill != ill)
                return;

        if (IRE_HIDDEN_TYPE(ire->ire_type)) {
                DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
                ire->ire_testhidden = B_TRUE;
        }
}

/*
 * IRE walker callback: clear ire_testhidden if the IRE has a source address
 * on `ill_arg'.
 */
static void
ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
{
        ill_t *ill = (ill_t *)ill_arg;

        ASSERT(IAM_WRITER_ILL(ill));
        ASSERT(!IS_IPMP(ill));

        if (ire->ire_ill == ill) {
                DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
                ire->ire_testhidden = B_FALSE;
        }
}

/*
 * Return a held pointer to the IPMP ill for underlying interface `ill', or
 * NULL if one doesn't exist. (Unfortunately, this function needs to take an
 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
 * ill_grp pointer may become stale when not inside an IPSQ and not holding
 * ipmp_lock.) Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_hold_ipmp_ill(ill_t *ill)
{
        ip_stack_t *ipst = ill->ill_ipst;
        ipmp_illgrp_t *illg;

        ASSERT(!IS_IPMP(ill));

        rw_enter(&ipst->ips_ipmp_lock, RW_READER);
        illg = ill->ill_grp;
        if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
                rw_exit(&ipst->ips_ipmp_lock);
                return (illg->ig_ipmp_ill);
        }
        /*
         * Assume `ill' was removed from the illgrp in the meantime.
         */
        rw_exit(&ill->ill_ipst->ips_ipmp_lock);
        return (NULL);
}

/*
 * Return a held pointer to the appropriate underlying ill for sending the
 * specified type of packet. (Unfortunately, this function needs to take an
 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
 * ill_grp pointer may become stale when not inside an IPSQ and not holding
 * ipmp_lock.) Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_hold_xmit_ill(ill_t *ill, boolean_t is_unicast)
{
        ill_t *xmit_ill;
        ip_stack_t *ipst = ill->ill_ipst;

        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        if (ill->ill_grp == NULL) {
                /*
                 * The ill was taken out of the group, so just send on it.
                 */
                rw_exit(&ipst->ips_ill_g_lock);
                ill_refhold(ill);
                return (ill);
        }
        if (is_unicast)
                xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
        else
                xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
        rw_exit(&ipst->ips_ill_g_lock);

        return (xmit_ill);
}

/*
 * Return the interface index for the IPMP ill tied to underlying interface
 * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ.
 */
uint_t
ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
{
        uint_t ifindex = 0;
        ip_stack_t *ipst = ill->ill_ipst;
        ipmp_grp_t *grp;

        ASSERT(!IS_IPMP(ill));

        rw_enter(&ipst->ips_ipmp_lock, RW_READER);
        if ((grp = ill->ill_phyint->phyint_grp) != NULL)
                ifindex = grp->gr_phyint->phyint_ifindex;
        rw_exit(&ipst->ips_ipmp_lock);
        return (ifindex);
}
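
/*
 * Usage sketch (added for illustration; not in the original source): a
 * transmit path typically selects an underlying ill for an IPMP interface
 * via ipmp_ill_hold_xmit_ill() above and releases it once the packet has
 * been sent:
 *
 *	ill_t *xmit_ill = ipmp_ill_hold_xmit_ill(ill, B_TRUE);
 *	if (xmit_ill != NULL) {
 *		... transmit on `xmit_ill' ...
 *		ill_refrele(xmit_ill);
 *	}
 */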
/*
 * Place phyint `phyi' into IPMP group `grp'.
 */
void
ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
{
	ill_t *ill;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs vanished.
	 */
	if (phyi->phyint_illv4 != NULL) {
		ill = phyi->phyint_illv4;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	if (phyi->phyint_illv6 != NULL) {
		ill = phyi->phyint_illv6;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	/*
	 * Snapshot the phyint's initial kstats as a baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp = grp;
	if (++grp->gr_nif == 1)
		grp->gr_mactype = ill->ill_mactype;
	else
		ASSERT(grp->gr_mactype == ill->ill_mactype);

	/*
	 * Now that we're in the group, request a switch to the group's xop
	 * when we ipsq_exit().  All future operations will be exclusive on
	 * the group xop until ipmp_phyint_leave_grp() is called.
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Remove phyint `phyi' from its current IPMP group.
 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
	uint_t i;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
	 */
	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
		ipmp_ill_leave_illgrp(phyi->phyint_illv6);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs have reappeared.
	 */
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);

	/*
	 * Calculate the phyint's cumulative kstats while it was in the group,
	 * and add that to the group's baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi_kstats);
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		phyi_kstats[i] -= phyi->phyint_kstats0[i];
		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
	}

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp->gr_nif--;
	phyi->phyint_grp = NULL;

	/*
	 * As our final act in leaving the group, request a switch back to our
	 * IPSQ's own xop when we ipsq_exit().
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
 */
static void
ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
{
	uint_t i, j;
	const char *name;
	kstat_t *ksp;
	kstat_named_t *kn;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	zoneid_t zoneid;

	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
	if (ksp == NULL)
		return;

	KSTAT_ENTER(ksp);

	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
		/*
		 * Bring kstats up-to-date before recording.
		 */
		(void) KSTAT_UPDATE(ksp, KSTAT_READ);

		kn = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			name = ipmp_kstats[i].name;
			kstats[i] = 0;
			for (j = 0; j < ksp->ks_ndata; j++) {
				if (strcmp(kn[j].name, name) != 0)
					continue;

				switch (kn[j].data_type) {
				case KSTAT_DATA_INT32:
				case KSTAT_DATA_UINT32:
					kstats[i] = kn[j].value.ui32;
					break;
#ifdef	_LP64
				case KSTAT_DATA_LONG:
				case KSTAT_DATA_ULONG:
					kstats[i] = kn[j].value.ul;
					break;
#endif
				case KSTAT_DATA_INT64:
				case KSTAT_DATA_UINT64:
					kstats[i] = kn[j].value.ui64;
					break;
				}
				break;
			}
		}
	}

	KSTAT_EXIT(ksp);
	kstat_rele(ksp);
}

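/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * compute the traffic counters accumulated by `phyi' since it joined its
 * group, using the same snapshot-minus-baseline arithmetic that
 * ipmp_phyint_leave_grp() applies above.  `deltas' is assumed to have at
 * least IPMP_KSTAT_MAX elements.
 */
static void
ipmp_example_phyint_kstat_delta(phyint_t *phyi, uint64_t deltas[])
{
	uint_t i;
	uint64_t now[IPMP_KSTAT_MAX];

	/* Current counters, minus the baseline saved at group-join time. */
	ipmp_phyint_get_kstats(phyi, now);
	for (i = 0; i < IPMP_KSTAT_MAX; i++)
		deltas[i] = now[i] - phyi->phyint_kstats0[i];
}
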
/*
 * Refresh the active state of all ills on `phyi'.
 */
void
ipmp_phyint_refresh_active(phyint_t *phyi)
{
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv6);
}

/*
 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
{
	ill_t *boundill;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IS_IPMP(ipif->ipif_ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	boundill = ipif->ipif_bound_ill;
	if (boundill != NULL && ill_check_and_refhold(boundill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (boundill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller must be inside the IPSQ.
 */
ill_t *
ipmp_ipif_bound_ill(const ipif_t *ipif)
{
	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
	ASSERT(IS_IPMP(ipif->ipif_ill));

	return (ipif->ipif_bound_ill);
}

/*
 * Check if `ipif' is a "stub" (placeholder address not being used).
 */
boolean_t
ipmp_ipif_is_stubaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_UP)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr == INADDR_ANY);
}

/*
 * Check if `ipif' is an IPMP data address.
 */
boolean_t
ipmp_ipif_is_dataaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_NOFAILOVER)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr != INADDR_ANY);
}

/*
 * Check if `ipif' is an IPIF_UP IPMP data address.
 */
static boolean_t
ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
{
	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
}

/*
 * Check if `mp' contains a probe packet by checking if the IP source address
 * is a test address on underlying interface `ill'.  Caller need not be inside
 * the IPSQ.
 */
boolean_t
ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
{
	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
	ipha_t *ipha = (ipha_t *)mp->b_rptr;

	ASSERT(DB_TYPE(mp) != M_CTL);

	if (!IS_UNDER_IPMP(ill))
		return (B_FALSE);

	if (ill->ill_isv6) {
		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
			return (B_TRUE);
	} else {
		if (ipha->ipha_src != INADDR_ANY &&
		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * NCEC walker callback: delete `ncec' if it is associated with `ill_arg' and
 * is not one of our local addresses.  Caller must be inside the IPSQ.
 */
static void
ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *ill_arg)
{
	if (!NCE_MYADDR(ncec) && ncec->ncec_ill == (ill_t *)ill_arg)
		ncec_delete(ncec);
}

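/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * finding the interface index of the ill currently bound to an IPMP data
 * address from outside the IPSQ.  The caller receives a held ill from
 * ipmp_ipif_hold_bound_ill() and must release it with ill_refrele().
 */
static uint_t
ipmp_example_bound_ifindex(const ipif_t *data_ipif)
{
	uint_t ifindex = 0;
	ill_t *bound_ill;

	if ((bound_ill = ipmp_ipif_hold_bound_ill(data_ipif)) != NULL) {
		ifindex = bound_ill->ill_phyint->phyint_ifindex;
		ill_refrele(bound_ill);
	}
	return (ifindex);
}
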
/*
 * Delete any NCEs tied to the illgrp associated with `ncec'.  Caller need not
 * be inside the IPSQ.
 */
void
ipmp_ncec_delete_nce(ncec_t *ncec)
{
	ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp;
	ip_stack_t *ipst = ncec->ncec_ipst;
	ill_t *ill;
	nce_t *nce;
	list_t dead;

	ASSERT(IS_IPMP(ncec->ncec_ill));

	/*
	 * For each underlying interface, delete `ncec' from its ill_nce list
	 * via nce_fastpath_list_delete().  Defer the actual nce_refrele()
	 * until we've dropped ill_g_lock.
	 */
	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill))
		nce_fastpath_list_delete(ill, ncec, &dead);
	rw_exit(&ipst->ips_ill_g_lock);

	while ((nce = list_remove_head(&dead)) != NULL)
		nce_refrele(nce);

	list_destroy(&dead);
}

/*
 * Refresh any NCE entries tied to the illgrp associated with `ncec' to
 * use the information in `ncec'.  Caller need not be inside the IPSQ.
 */
void
ipmp_ncec_refresh_nce(ncec_t *ncec)
{
	ipmp_illgrp_t *illg = ncec->ncec_ill->ill_grp;
	ip_stack_t *ipst = ncec->ncec_ipst;
	ill_t *ill;
	nce_t *nce, *nce_next;
	list_t replace;

	ASSERT(IS_IPMP(ncec->ncec_ill));

	/*
	 * If `ncec' is not reachable, there is no use in refreshing NCEs.
	 */
	if (!NCE_ISREACHABLE(ncec))
		return;

	/*
	 * Find all the NCEs matching ncec->ncec_addr.  We cannot update them
	 * in-situ because we're holding ipmp_lock to prevent changes to IPMP
	 * group membership, and updating indirectly calls
	 * nce_fastpath_probe() -> putnext(), which cannot be done while
	 * holding locks.  Thus, move the NCEs to a separate list and process
	 * that list after dropping ipmp_lock.
	 */
	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	ill = list_head(&illg->ig_actif);
	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
		mutex_enter(&ill->ill_lock);
		nce = list_head(&ill->ill_nce);
		for (; nce != NULL; nce = nce_next) {
			nce_next = list_next(&ill->ill_nce, nce);
			if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
			    &ncec->ncec_addr)) {
				nce_refhold(nce);
				nce_delete(nce);
				list_insert_tail(&replace, nce);
			}
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Process the list; nce_lookup_then_add_v* ensures that nce->nce_ill
	 * is still in the group for ncec->ncec_ill.
	 */
	while ((nce = list_remove_head(&replace)) != NULL) {
		if (ncec->ncec_ill->ill_isv6) {
			(void) nce_lookup_then_add_v6(nce->nce_ill,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
			    NULL);
		} else {
			ipaddr_t ipaddr;

			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
			(void) nce_lookup_then_add_v4(nce->nce_ill,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
		}
		nce_refrele(nce);
	}

	list_destroy(&replace);
}
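
/*
 * Illustrative sketch (hypothetical helper, not part of the original file):
 * walking the active interfaces of an IPMP group under ipmp_lock, the same
 * pattern ipmp_ncec_refresh_nce() uses above for ig_actif.  The callback
 * `cb' is a placeholder and must not block or acquire conflicting locks.
 */
static void
ipmp_example_walk_actif(ipmp_illgrp_t *illg, void (*cb)(ill_t *))
{
	ill_t *ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	ill = list_head(&illg->ig_actif);
	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill))
		cb(ill);
	rw_exit(&ipst->ips_ipmp_lock);
}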