1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #include <inet/ip.h> 26 #include <inet/ip6.h> 27 #include <inet/ip_if.h> 28 #include <inet/ip_ire.h> 29 #include <inet/ip_multi.h> 30 #include <inet/ip_ndp.h> 31 #include <inet/ip_rts.h> 32 #include <inet/mi.h> 33 #include <net/if_types.h> 34 #include <sys/dlpi.h> 35 #include <sys/kmem.h> 36 #include <sys/modhash.h> 37 #include <sys/sdt.h> 38 #include <sys/strsun.h> 39 #include <sys/sunddi.h> 40 #include <sys/types.h> 41 42 /* 43 * Convenience macros for getting the ip_stack_t associated with an 44 * ipmp_illgrp_t or ipmp_grp_t. 45 */ 46 #define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) 47 #define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) 48 49 /* 50 * Assorted constants that aren't important enough to be tunable. 51 */ 52 #define IPMP_GRP_HASH_SIZE 64 53 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ 54 55 56 /* 57 * IPMP meta-interface kstats (based on those in PSARC/1997/198). 
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};

/*
 * Forward declarations for file-local helpers.
 */
static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);

/*
 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
 */
void
ipmp_init(ip_stack_t *ipst)
{
	/* String-keyed hash of groups by group name; see ipmp_grp_insert(). */
	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
}

/*
 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
 */
void
ipmp_destroy(ip_stack_t *ipst)
{
	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
	rw_destroy(&ipst->ips_ipmp_lock);
}

/*
 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
 * and add it to the hash.  On success, return a pointer to the created group.
 * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
 * meta-interface associated with the group also has the same name (but they
 * may differ later via ipmp_grp_rename()).
 */
ipmp_grp_t *
ipmp_grp_create(const char *grname, phyint_t *phyi)
{
	ipmp_grp_t	*grp;
	ip_stack_t	*ipst = PHYINT_TO_IPST(phyi);
	mod_hash_hndl_t	mh;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/* KM_NOSLEEP: this runs under ips_ipmp_lock, so don't block. */
	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));

	/*
	 * Cache the group's phyint.  This is safe since a phyint_t will
	 * outlive its ipmp_grp_t.
	 */
	grp->gr_phyint = phyi;

	/*
	 * Create IPMP group kstats.
	 */
	if (ipmp_grp_create_kstats(grp) != 0) {
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}

	/*
	 * Insert the group into the hash.  Reserve the hash entry first so
	 * the subsequent insert cannot fail.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
		ipmp_grp_destroy_kstats(grp);
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}
	ipmp_grp_insert(grp, mh);

	return (grp);
}

/*
 * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
 */
static int
ipmp_grp_create_kstats(ipmp_grp_t *grp)
{
	kstat_t		*ksp;
	netstackid_t	id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
	if (ksp == NULL)
		return (ENOMEM);

	ksp->ks_update = ipmp_grp_update_kstats;
	ksp->ks_private = grp;
	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));

	kstat_install(ksp);
	grp->gr_ksp = ksp;
	return (0);
}

/*
 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
 */
static int
ipmp_grp_update_kstats(kstat_t *ksp, int rw)
{
	uint_t		i;
	kstat_named_t	*kn = KSTAT_NAMED_PTR(ksp);
	ipmp_grp_t	*grp = ksp->ks_private;
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);
	ipsq_t		*ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	phyint_t	*phyi;
	uint64_t	phyi_kstats[IPMP_KSTAT_MAX];

	if (rw == KSTAT_WRITE)
		return (EACCES);

	/*
	 * Start with the group's baseline values.
	 */
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		if (kn[i].data_type == KSTAT_DATA_UINT32) {
			kn[i].value.ui32 = grp->gr_kstats0[i];
		} else {
			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
			kn[i].value.ui64 = grp->gr_kstats0[i];
		}
	}

	/*
	 * Add in the stats of each phyint currently in the group.  Since we
	 * don't directly track the phyints in a group, we cheat by walking
	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
	 * ill_g_lock is held.)
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ipsq = grp_ipsq->ipsq_next;
	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
		phyi = ipsq->ipsq_phyint;

		/*
		 * If a phyint in a group is being unplumbed, it's possible
		 * that ill_glist_delete() -> phyint_free() already freed the
		 * phyint (and set ipsq_phyint to NULL), but the unplumb
		 * operation has yet to complete (and thus ipsq_dq() has yet
		 * to remove the phyint's IPSQ from the group IPSQ's phyint
		 * list).  We skip those phyints here (note that their kstats
		 * have already been added to gr_kstats0[]).
		 */
		if (phyi == NULL)
			continue;

		ipmp_phyint_get_kstats(phyi, phyi_kstats);

		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			/* Subtract the baseline captured when it joined. */
			phyi_kstats[i] -= phyi->phyint_kstats0[i];
			if (kn[i].data_type == KSTAT_DATA_UINT32)
				kn[i].value.ui32 += phyi_kstats[i];
			else
				kn[i].value.ui64 += phyi_kstats[i];
		}
	}

	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;

	rw_exit(&ipst->ips_ill_g_lock);
	return (0);
}

/*
 * Destroy IPMP kstat structures for `grp'.
 */
static void
ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
{
	netstackid_t	id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

	kstat_delete_netstack(grp->gr_ksp, id);
	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
	grp->gr_ksp = NULL;
}

/*
 * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
 * does not exist.
 */
ipmp_grp_t *
ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
{
	ipmp_grp_t	*grp;

	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
	    (mod_hash_val_t *)&grp) == 0)
		return (grp);

	return (NULL);
}

/*
 * Place information about group `grp' into `lifgr'.
293 */ 294 void 295 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) 296 { 297 ill_t *ill; 298 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 299 300 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 301 302 lifgr->gi_v4 = (grp->gr_v4 != NULL); 303 lifgr->gi_v6 = (grp->gr_v6 != NULL); 304 lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; 305 lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; 306 lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; 307 (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); 308 lifgr->gi_m4ifname[0] = '\0'; 309 lifgr->gi_m6ifname[0] = '\0'; 310 lifgr->gi_bcifname[0] = '\0'; 311 312 if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { 313 (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); 314 (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); 315 } 316 317 if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) 318 (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); 319 } 320 321 /* 322 * Insert `grp' into the hash using the reserved hash entry `mh'. 323 * Caller must ensure `grp' is not yet in the hash. 324 */ 325 static void 326 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) 327 { 328 int err; 329 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 330 331 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 332 333 /* 334 * Since grp->gr_name will exist at least as long as `grp' is in the 335 * hash, we use it directly as the key. 336 */ 337 err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, 338 (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); 339 if (err != 0) { 340 /* 341 * This should never happen since `mh' was preallocated. 342 */ 343 panic("cannot insert IPMP group \"%s\" (err %d)", 344 grp->gr_name, err); 345 } 346 } 347 348 /* 349 * Remove `grp' from the hash. Caller must ensure `grp' is in it. 
 */
static void
ipmp_grp_remove(ipmp_grp_t *grp)
{
	int		err;
	mod_hash_val_t	val;
	mod_hash_key_t	key = (mod_hash_key_t)grp->gr_name;
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/* A miss or a mismatched value means the hash is corrupt; panic. */
	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
	if (err != 0 || val != grp) {
		panic("cannot remove IPMP group \"%s\" (err %d)",
		    grp->gr_name, err);
	}
}

/*
 * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
 * group name already exists or is invalid, or if there isn't enough memory.
 */
int
ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
{
	mod_hash_hndl_t	mh;
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (grname[0] == '\0')
		return (EINVAL);

	/*
	 * NOTE: on a hit, mod_hash_find() overwrites `grp' with the found
	 * value, but we return EEXIST without using it; on a miss, `grp' is
	 * left untouched for the code below.
	 */
	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
		return (EEXIST);

	/*
	 * Before we remove the group from the hash, ensure we'll be able to
	 * re-insert it by reserving space.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
		return (ENOMEM);

	ipmp_grp_remove(grp);
	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	ipmp_grp_insert(grp, mh);

	return (0);
}

/*
 * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
 * the hash, and that there are no interfaces on it.
 */
void
ipmp_grp_destroy(ipmp_grp_t *grp)
{
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * If there are still interfaces using this group, panic before things
	 * go really off the rails.
	 */
	if (grp->gr_nif != 0)
		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);

	ipmp_grp_remove(grp);
	ipmp_grp_destroy_kstats(grp);

	ASSERT(grp->gr_v4 == NULL);
	ASSERT(grp->gr_v6 == NULL);
	ASSERT(grp->gr_nv4 == 0);
	ASSERT(grp->gr_nv6 == 0);
	ASSERT(grp->gr_nactif == 0);
	ASSERT(grp->gr_linkdownmp == NULL);
	grp->gr_phyint = NULL;

	kmem_free(grp, sizeof (ipmp_grp_t));
}

/*
 * Check whether `ill' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  NOTE: many of these errno values
 * are interpreted by ifconfig, which will take corrective action and retry
 * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
 */
static int
ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
{
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * To sidestep complicated address migration logic in the kernel and
	 * to force the kernel's all-hosts multicast memberships to be blown
	 * away, all addresses that had been brought up must be brought back
	 * down prior to adding an interface to a group.  (This includes
	 * addresses currently down due to DAD.)  Once the interface has been
	 * added to the group, its addresses can then be brought back up, at
	 * which point they will be moved to the IPMP meta-interface.
	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
	 */
	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
		return (EADDRINUSE);

	/*
	 * To avoid confusing applications by changing addresses that are
	 * under their control, all such control must be removed prior to
	 * adding an interface into a group.
	 */
	if (ill_appaddr_cnt(ill) != 0)
		return (EADDRNOTAVAIL);

	/*
	 * Since PTP addresses do not share the same broadcast domain, they
	 * are not allowed to be in an IPMP group.
	 */
	if (ill_ptpaddr_cnt(ill) != 0)
		return (EINVAL);

	/*
	 * An ill must support multicast to be allowed into a group.
	 */
	if (!(ill->ill_flags & ILLF_MULTICAST))
		return (ENOTSUP);

	/*
	 * An ill must strictly be using ARP and/or ND for address
	 * resolution for it to be allowed into a group.
	 */
	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
		return (ENOTSUP);

	/*
	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
	 * all its modifications as writer.)
	 */
	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
		return (ENOTSUP);

	/*
	 * All ills in a group must be the same mactype.
	 */
	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
		return (EINVAL);

	return (0);
}

/*
 * Check whether `phyi' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  See comment above ipmp_grp_vet_ill()
 * regarding errno values.
 */
int
ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
{
	int		err = 0;
	ip_stack_t	*ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * An interface cannot have address families plumbed that are not
	 * configured in the group.  (`&&' binds tighter than `||' here.)
	 */
	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
		return (EAFNOSUPPORT);

	if (phyi->phyint_illv4 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
	if (err == 0 && phyi->phyint_illv6 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);

	return (err);
}

/*
 * Create a new illgrp on IPMP meta-interface `ill'.
 */
ipmp_illgrp_t *
ipmp_illgrp_create(ill_t *ill)
{
	uint_t		mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
	ipmp_illgrp_t	*illg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_IPMP(ill));
	ASSERT(ill->ill_grp == NULL);

	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
	list_create(&illg->ig_actif, sizeof (ill_t),
	    offsetof(ill_t, ill_actnode));
	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
	    offsetof(ipmp_arpent_t, ia_node));

	illg->ig_ipmp_ill = ill;
	ill->ill_grp = illg;
	ipmp_illgrp_set_mtu(illg, mtu);

	return (illg);
}

/*
 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
 */
void
ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
{
	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
	ASSERT(IS_IPMP(illg->ig_ipmp_ill));

	/*
	 * Verify `illg' is empty.
	 */
	ASSERT(illg->ig_next_ill == NULL);
	ASSERT(illg->ig_cast_ill == NULL);
	ASSERT(list_is_empty(&illg->ig_arpent));
	ASSERT(list_is_empty(&illg->ig_if));
	ASSERT(list_is_empty(&illg->ig_actif));
	ASSERT(illg->ig_nactif == 0);

	/*
	 * Destroy `illg'.
	 */
	illg->ig_ipmp_ill->ill_grp = NULL;
	illg->ig_ipmp_ill = NULL;
	list_destroy(&illg->ig_if);
	list_destroy(&illg->ig_actif);
	list_destroy(&illg->ig_arpent);
	kmem_free(illg, sizeof (ipmp_illgrp_t));
}

/*
 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
 * bind it to an underlying ill, while keeping an even address distribution.
 * If the bind is successful, return a pointer to the bound ill.
 */
ill_t *
ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t		*minill;
	ipmp_arpent_t	*entp;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(ipmp_ipif_is_dataaddr(ipif));

	/*
	 * IPMP data address mappings are internally managed by IP itself, so
	 * delete any existing ARP entries associated with the address.
	 */
	if (!ipif->ipif_isv6) {
		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
		if (entp != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);
	}

	/* Bind to the least-loaded active ill to keep addresses even. */
	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);

	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
}

/*
 * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
 * bound, unbind it from the underlying ill while keeping an even address
 * distribution.
 */
void
ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t		*maxill, *boundill = ipif->ipif_bound_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (boundill != NULL) {
		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);

		/*
		 * Rebalance: if the most-loaded ill now has at least two
		 * more addresses than `boundill', migrate one to it.
		 * (`boundill' is active, so ig_actif is non-empty and
		 * maxill cannot be NULL.)
		 */
		maxill = ipmp_illgrp_max_ill(illg);
		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
		}
	}
}

/*
 * Return the active ill with the greatest number of data addresses in `illg'.
650 */ 651 static ill_t * 652 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) 653 { 654 ill_t *ill, *bestill = NULL; 655 656 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 657 658 ill = list_head(&illg->ig_actif); 659 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 660 if (bestill == NULL || 661 ill->ill_bound_cnt > bestill->ill_bound_cnt) { 662 bestill = ill; 663 } 664 } 665 return (bestill); 666 } 667 668 /* 669 * Return the active ill with the fewest number of data addresses in `illg'. 670 */ 671 static ill_t * 672 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) 673 { 674 ill_t *ill, *bestill = NULL; 675 676 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 677 678 ill = list_head(&illg->ig_actif); 679 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 680 if (bestill == NULL || 681 ill->ill_bound_cnt < bestill->ill_bound_cnt) { 682 if (ill->ill_bound_cnt == 0) 683 return (ill); /* can't get better */ 684 bestill = ill; 685 } 686 } 687 return (bestill); 688 } 689 690 /* 691 * Return a pointer to IPMP meta-interface for `illg' (which must exist). 692 * Since ig_ipmp_ill never changes for a given illg, no locks are needed. 693 */ 694 ill_t * 695 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) 696 { 697 return (illg->ig_ipmp_ill); 698 } 699 700 /* 701 * Return a pointer to the next available underlying ill in `illg', or NULL if 702 * one doesn't exist. Caller must be inside the IPSQ. 
 */
ill_t *
ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
{
	ill_t		*ill;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	/* Round-robin: advance ig_next_ill, wrapping to the list head. */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if ((ill = illg->ig_next_ill) != NULL) {
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (ill);
}

/*
 * Return a held pointer to the next available underlying ill in `illg', or
 * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
{
	ill_t		*ill;
	uint_t		i;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	/*
	 * Try each active ill at most once; an ill that cannot be refheld
	 * (e.g., it is condemned) is skipped.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	for (i = 0; i < illg->ig_nactif; i++) {
		ill = illg->ig_next_ill;
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);

		if (ill_check_and_refhold(ill)) {
			rw_exit(&ipst->ips_ipmp_lock);
			return (ill);
		}
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (NULL);
}

/*
 * Return a held pointer to the nominated multicast ill in `illg', or NULL if
 * one doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
{
	ill_t		*castill;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	castill = illg->ig_cast_ill;
	if (castill != NULL && ill_check_and_refhold(castill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (castill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Callback routine for ncec_walk() that deletes `nce' if it is associated with
 * the `(ill_t *)arg' and it is not one of the local addresses.
Caller must be 774 * inside the IPSQ. 775 */ 776 static void 777 ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg) 778 { 779 if ((ncec != NULL) && !NCE_MYADDR(ncec) && 780 ncec->ncec_ill == (ill_t *)arg) { 781 ncec_delete(ncec); 782 } 783 } 784 785 /* 786 * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, 787 * any existing nomination is removed. Caller must be inside the IPSQ. 788 */ 789 static void 790 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) 791 { 792 ill_t *ocastill = illg->ig_cast_ill; 793 ill_t *ipmp_ill = illg->ig_ipmp_ill; 794 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 795 796 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 797 798 /* 799 * Disable old nominated ill (if any). 800 */ 801 if (ocastill != NULL) { 802 DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, 803 illg, ill_t *, ocastill); 804 ASSERT(ocastill->ill_nom_cast); 805 ocastill->ill_nom_cast = B_FALSE; 806 /* 807 * If the IPMP meta-interface is down, we never did the join, 808 * so we must not try to leave. 809 */ 810 if (ipmp_ill->ill_dl_up) 811 ill_leave_multicast(ipmp_ill); 812 813 /* 814 * Delete any NCEs tied to the old nomination. We must do this 815 * last since ill_leave_multicast() may trigger IREs to be 816 * built using ig_cast_ill. 817 */ 818 ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill, 819 ocastill->ill_ipst); 820 } 821 822 /* 823 * Set new nomination. 824 */ 825 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 826 illg->ig_cast_ill = castill; 827 rw_exit(&ipst->ips_ipmp_lock); 828 829 /* 830 * Enable new nominated ill (if any). 831 */ 832 if (castill != NULL) { 833 DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, 834 illg, ill_t *, castill); 835 ASSERT(!castill->ill_nom_cast); 836 castill->ill_nom_cast = B_TRUE; 837 /* 838 * If the IPMP meta-interface is down, the attempt to recover 839 * will silently fail but ill_need_recover_multicast will be 840 * erroneously cleared -- so check first. 
841 */ 842 if (ipmp_ill->ill_dl_up) 843 ill_recover_multicast(ipmp_ill); 844 } 845 } 846 847 /* 848 * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an 849 * entry for the same IP address already exists, destroy it first. Return the 850 * created IPMP ARP entry, or NULL on failure. 851 */ 852 ipmp_arpent_t * 853 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp, 854 ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags) 855 { 856 ipmp_arpent_t *entp, *oentp; 857 858 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 859 860 if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len, 861 KM_NOSLEEP)) == NULL) 862 return (NULL); 863 864 /* 865 * Delete any existing ARP entry for this address. 866 */ 867 if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) 868 ipmp_illgrp_destroy_arpent(illg, oentp); 869 870 /* 871 * Prepend the new entry. 872 */ 873 entp->ia_ipaddr = ipaddr; 874 entp->ia_flags = flags; 875 entp->ia_lladdr_len = lladdr_len; 876 entp->ia_lladdr = (uchar_t *)&entp[1]; 877 bcopy(lladdr, entp->ia_lladdr, lladdr_len); 878 entp->ia_proxyarp = proxyarp; 879 entp->ia_notified = B_TRUE; 880 list_insert_head(&illg->ig_arpent, entp); 881 return (entp); 882 } 883 884 /* 885 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. 886 */ 887 void 888 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) 889 { 890 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 891 892 list_remove(&illg->ig_arpent, entp); 893 kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len); 894 } 895 896 /* 897 * Mark that ARP has been notified about the IP address on `entp'; `illg' is 898 * taken as a debugging aid for DTrace FBT probes. 
 */
/* ARGSUSED */
void
ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
	entp->ia_notified = B_TRUE;
}

/*
 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
 * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
 */
ipmp_arpent_t *
ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
{
	ipmp_arpent_t	*entp = list_head(&illg->ig_arpent);

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	if (addrp == NULL)
		return (entp);

	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
		if (entp->ia_ipaddr == *addrp)
			break;
	return (entp);
}

/*
 * Refresh ARP entries on `illg' to be distributed across its active
 * interfaces.  Entries that cannot be refreshed (e.g., because there are no
 * active interfaces) are marked so that subsequent calls can try again.
 */
void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
	ill_t		*ill, *ipmp_ill = illg->ig_ipmp_ill;
	uint_t		paddrlen = ipmp_ill->ill_phys_addr_length;
	ipmp_arpent_t	*entp;
	ncec_t		*ncec;
	nce_t		*nce;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));
	ASSERT(!ipmp_ill->ill_isv6);

	/* Round-robin the entries across the active ills. */
	ill = list_head(&illg->ig_actif);
	entp = list_head(&illg->ig_arpent);
	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
			/* Couldn't refresh; retry on a later call. */
			entp->ia_notified = B_FALSE;
			continue;
		}

		ASSERT(paddrlen == ill->ill_phys_addr_length);

		/*
		 * If this is a proxy ARP entry, we can skip notifying ARP if
		 * the entry is already up-to-date.  If it has changed, we
		 * update the entry's hardware address before notifying ARP.
		 */
		if (entp->ia_proxyarp) {
			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
			    paddrlen) == 0 && entp->ia_notified)
				continue;
			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
		}

		/*
		 * NOTE(review): the code below assumes nce_lookup_then_add_v4()
		 * sets `nce' to NULL on failure -- confirm against its
		 * implementation, since `nce' is not initialized here.
		 */
		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
		    &nce);
		if (nce == NULL || !entp->ia_proxyarp) {
			if (nce != NULL)
				nce_refrele(nce);
			continue;
		}
		ncec = nce->nce_common;
		mutex_enter(&ncec->ncec_lock);
		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
		mutex_exit(&ncec->ncec_lock);
		nce_refrele(nce);
		ipmp_illgrp_mark_arpent(illg, entp);

		/* Advance to the next active ill, wrapping if needed. */
		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
			ill = list_head(&illg->ig_actif);
	}
}

/*
 * Return an interface in `illg' with the specified `physaddr', or NULL if one
 * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
 */
ill_t *
ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
{
	ill_t		*ill;
	ill_t		*ipmp_ill = illg->ig_ipmp_ill;
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));

	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		if (ill->ill_phys_addr_length == paddrlen &&
		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
			return (ill);
	}
	return (NULL);
}

/*
 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
 * Caller must be inside the IPSQ unless this is initialization.
 */
static void
ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
{
	ill_t		*ill = illg->ig_ipmp_ill;
	mblk_t		*mp;

	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));

	/*
	 * If allocation fails, we have bigger problems than MTU.
	 */
	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
		illg->ig_mtu = mtu;
		put(ill->ill_rq, mp);
	}
}

/*
 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
 * ill MTU if necessary.
 */
void
ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
{
	ill_t		*ill;
	ill_t		*ipmp_ill = illg->ig_ipmp_ill;
	uint_t		mtu = 0;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	/*
	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
	 * for each ill as we iterate through the list.  Any changes to the
	 * ill_mtu will also trigger an update, so even if we missed it
	 * this time around, the update will catch it.
	 */
	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		mutex_enter(&ill->ill_lock);
		/* The group MTU is the minimum of its members' MTUs. */
		if (mtu == 0 || ill->ill_mtu < mtu)
			mtu = ill->ill_mtu;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * MTU must be at least the minimum MTU.
	 */
	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);

	if (illg->ig_mtu != mtu)
		ipmp_illgrp_set_mtu(illg, mtu);
}

/*
 * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
 * allow the same link to be established more than once.
 */
void
ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
{
	ip_stack_t	*ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (illg->ig_ipmp_ill->ill_isv6) {
		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
		grp->gr_v6 = illg;
	} else {
		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
		grp->gr_v4 = illg;
	}
}

/*
 * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
 * cannot be unlinked (e.g., because there are still interfaces using it).
 */
int
ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
{
	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * Refuse if any ills of this family are still in the group, counting
	 * both members (gr_nv*) and, presumably, in-progress joins
	 * (gr_pendv*) -- verify against the field definitions.
	 */
	if (illg->ig_ipmp_ill->ill_isv6) {
		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
			return (EBUSY);
		grp->gr_v6 = NULL;
	} else {
		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
			return (EBUSY);
		grp->gr_v4 = NULL;
	}
	return (0);
}

/*
 * Place `ill' into `illg', and rebalance the data addresses on `illg'
 * to be spread evenly across the ills now in it.  Also, adjust the IPMP
 * ill as necessary to account for `ill' (e.g., MTU).
 */
void
ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_grp == NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Account for `ill' joining the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6++;
	else
		ill->ill_phyint->phyint_grp->gr_nv4++;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Ensure the ILLF_ROUTER flag remains consistent across the group:
	 * the IPMP meta-interface's setting wins.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipmp_ill->ill_flags & ILLF_ROUTER)
		ill->ill_flags |= ILLF_ROUTER;
	else
		ill->ill_flags &= ~ILLF_ROUTER;
	mutex_exit(&ill->ill_lock);

	/*
	 * Blow away all multicast memberships that currently exist on `ill'.
	 * This may seem odd, but it's consistent with the application view
	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
	 * The ill_grp_pending bit prevents multicast group joins after
	 * update_conn_ill() and before ill_grp assignment.
	 */
	mutex_enter(&ill->ill_mcast_serializer);
	ill->ill_grp_pending = 1;
	mutex_exit(&ill->ill_mcast_serializer);
	update_conn_ill(ill, ill->ill_ipst);
	if (ill->ill_isv6) {
		reset_mrt_ill(ill);
	} else {
		ipif = ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next) {
			reset_mrt_vif_ipif(ipif);
		}
	}
	ip_purge_allmulti(ill);

	/*
	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
	 * physical address length.  All other ills must have the same value,
	 * since they are required to all be the same mactype.  Also update
	 * the IPMP ill's MTU and CoS marking, if necessary.
	 */
	if (list_is_empty(&illg->ig_if)) {
		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
		/*
		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
		 * doesn't have a physical address.  This means that code must
		 * not assume that ill_phys_addr is non-NULL just because
		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
		 */
		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
		ipmp_ill->ill_type = ill->ill_type;

		if (ill->ill_flags & ILLF_COS_ENABLED) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	} else {
		ASSERT(ipmp_ill->ill_phys_addr_length ==
		    ill->ill_phys_addr_length);
		ASSERT(ipmp_ill->ill_type == ill->ill_type);

		/*
		 * CoS stays enabled only if every member supports it; a new
		 * member without CoS disables it group-wide.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		if (illg->ig_mtu > ill->ill_mtu)
			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	}

	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_insert_tail(&illg->ig_if, ill);
	ill->ill_grp = illg;
	rw_exit(&ipst->ips_ill_g_lock);

	/* ill_grp is now set; multicast joins may proceed again. */
	mutex_enter(&ill->ill_mcast_serializer);
	ill->ill_grp_pending = 0;
	mutex_exit(&ill->ill_mcast_serializer);

	/*
	 * Hide the IREs on `ill' so that we don't accidentally find them when
	 * sending data traffic.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

	ipmp_ill_refresh_active(ill);
}

/*
 * Remove `ill' from its illgrp, and rebalance the data addresses in that
 * illgrp to be spread evenly across the remaining ills.  Also, adjust the
 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ipmp_arpent_t *entp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(illg != NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Cancel IPMP-specific ill timeouts.
	 */
	(void) untimeout(ill->ill_refresh_tid);

	/*
	 * Expose any previously-hidden IREs on `ill'.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

	/*
	 * Ensure the multicast state for each ipif on `ill' is down so that
	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
	 * all eligible groups.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_down(ipif);

	/*
	 * Account for `ill' leaving the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6--;
	else
		ill->ill_phyint->phyint_grp->gr_nv4--;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Pull `ill' out of the interface lists (deactivating it first if it
	 * was active, which also rebinds its data addresses elsewhere).
	 */
	if (list_link_active(&ill->ill_actnode))
		ipmp_ill_deactivate(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_remove(&illg->ig_if, ill);
	ill->ill_grp = NULL;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Re-establish multicast memberships that were previously being
	 * handled by the IPMP meta-interface.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_up(ipif);

	/*
	 * Refresh the group MTU based on the new interface list.
	 */
	ipmp_illgrp_refresh_mtu(illg);

	if (list_is_empty(&illg->ig_if)) {
		/*
		 * No ills left in the illgrp; we no longer have a physical
		 * address length, nor can we support ARP, CoS, or anything
		 * else that depends on knowing the link layer type.
		 */
		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);

		ipmp_ill->ill_phys_addr_length = 0;
		ipmp_ill->ill_nd_lla_len = 0;
		ipmp_ill->ill_type = IFT_OTHER;
		mutex_enter(&ipmp_ill->ill_lock);
		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
		mutex_exit(&ipmp_ill->ill_lock);
	} else {
		/*
		 * If `ill' didn't support CoS, see if it can now be enabled.
		 * (NOTE: `ill' is reused below to walk the remaining members;
		 * the departing ill is no longer referenced after this.)
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

			ill = list_head(&illg->ig_if);
			do {
				if (!(ill->ill_flags & ILLF_COS_ENABLED))
					break;
			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);

			/* NULL means every remaining member supports CoS. */
			if (ill == NULL) {
				mutex_enter(&ipmp_ill->ill_lock);
				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
				mutex_exit(&ipmp_ill->ill_lock);
			}
		}
	}
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * Return B_FALSE if a refresh was necessary but could not be performed.
 */
static boolean_t
ipmp_ill_try_refresh_active(ill_t *ill)
{
	boolean_t refreshed = B_TRUE;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Only activation can fail (it needs allocations); deactivation
	 * always succeeds by design.
	 */
	if (ipmp_ill_is_active(ill)) {
		if (!list_link_active(&ill->ill_actnode))
			refreshed = ipmp_ill_activate(ill);
	} else {
		if (list_link_active(&ill->ill_actnode))
			ipmp_ill_deactivate(ill);
	}

	return (refreshed);
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * If the refresh fails, schedule a timer to try again later.
 */
void
ipmp_ill_refresh_active(ill_t *ill)
{
	if (!ipmp_ill_try_refresh_active(ill))
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
 * Runs in timeout(9F) context, so the IPSQ must be entered explicitly.
 */
static void
ipmp_ill_refresh_active_timer(void *ill_arg)
{
	ill_t *ill = ill_arg;
	boolean_t refreshed = B_FALSE;

	/*
	 * Clear ill_refresh_tid to indicate that no timeout is pending
	 * (another thread could schedule a new timeout while we're still
	 * running, but that's harmless).  If the ill is going away, bail.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refresh_tid = 0;
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);

	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
		refreshed = ipmp_ill_try_refresh_active(ill);
		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	}

	/*
	 * If the refresh failed (or we couldn't enter the IPSQ), schedule
	 * another attempt.
	 */
	if (!refreshed)
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Schedule a retry of ipmp_ill_try_refresh_active() on the ill named by
 * `ill', unless one is already pending or the ill is being torn down.
 */
static void
ipmp_ill_refresh_active_timer_start(ill_t *ill)
{
	mutex_enter(&ill->ill_lock);

	/*
	 * If the ill is going away or a refresh is already scheduled, bail.
	 */
	if (ill->ill_refresh_tid != 0 ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));

	mutex_exit(&ill->ill_lock);
}

/*
 * Activate `ill' so it will be used to send and receive data traffic.  Return
 * B_FALSE if `ill' cannot be activated.
 * Note that we allocate any messages
 * needed to deactivate `ill' here as well so that deactivation cannot fail.
 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
	ipif_t *ipif;
	mblk_t *linkupmp = NULL, *linkdownmp = NULL;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ill_t *maxill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If this will be the first active interface in the group, allocate
	 * the link-up and link-down messages.
	 */
	if (grp->gr_nactif == 0) {
		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
		if (linkupmp == NULL || linkdownmp == NULL)
			goto fail;
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * Now that we have an active ill, nominate it for multicast
		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
		 * since that may need to send multicast packets (e.g., IPv6
		 * neighbor discovery probes).
		 */
		ipmp_illgrp_set_cast(illg, ill);

		/*
		 * This is the first active ill in the illgrp -- add 'em all.
		 * We can access/walk ig_ipmp_ill's ipif list since we're
		 * writer on its IPSQ as well.
		 */
		ipif = illg->ig_ipmp_ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next)
			if (ipmp_ipif_is_up_dataaddr(ipif))
				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
	} else {
		/*
		 * Redistribute the addresses by moving them from the ill with
		 * the most addresses until the ill being activated is at the
		 * same level as the rest of the ills.
		 */
		for (;;) {
			maxill = ipmp_illgrp_max_ill(illg);
			ASSERT(maxill != NULL);
			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
				break;
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
		}
	}

	/*
	 * Put the interface in the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_insert_tail(&illg->ig_actif, ill);
	illg->ig_nactif++;
	illg->ig_next_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Refresh static/proxy ARP entries to use `ill', if need be.
	 */
	if (!ill->ill_isv6)
		ipmp_illgrp_refresh_arpent(illg);

	/*
	 * Finally, mark the group link up, if necessary.  The link-down
	 * message is stashed for the eventual ipmp_ill_deactivate().
	 */
	if (grp->gr_nactif++ == 0) {
		ASSERT(grp->gr_linkdownmp == NULL);
		grp->gr_linkdownmp = linkdownmp;
		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
	}
	return (B_TRUE);
fail:
	/* freemsg() is a no-op on NULL, so no need to test here. */
	freemsg(linkupmp);
	freemsg(linkdownmp);
	return (B_FALSE);
}

/*
 * Deactivate `ill' so it will not be used to send or receive data traffic.
 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
	ill_t *minill;
	ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
	mblk_t *mp;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Pull the interface out of the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_remove(&illg->ig_actif, ill);
	illg->ig_nactif--;
	illg->ig_next_ill = list_head(&illg->ig_actif);
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If the ill that's being deactivated had been nominated for
	 * multicast/broadcast, nominate a new one.
	 */
	if (ill == illg->ig_cast_ill)
		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

	/*
	 * Delete all nce_t entries using this ill, so that the next attempt
	 * to send data traffic will revalidate cached nce's.
	 */
	nce_flush(ill, B_TRUE);

	/*
	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
	 * we'll rebind them after we tell the resolver the ill is no longer
	 * active.  We must do things in this order or the resolver could
	 * accidentally rebind to the ill we're trying to remove if multiple
	 * ills in the group have the same hardware address (which is
	 * unsupported, but shouldn't lead to a wedged machine).
	 */
	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
		ipif->ipif_bound_next = ubheadipif;
		ubheadipif = ipif;
	}
	if (!ill->ill_isv6) {

		/*
		 * Refresh static/proxy ARP entries that had been using `ill'.
		 */
		ipmp_illgrp_refresh_arpent(illg);
	}

	/*
	 * Rebind each ipif from the deactivated ill to the active ill with
	 * the fewest ipifs.  If there are no active ills, the ipifs will
	 * remain unbound.
	 */
	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
		ubnextipif = ipif->ipif_bound_next;
		ipif->ipif_bound_next = NULL;

		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
	}

	if (list_is_empty(&illg->ig_actif)) {
		/* No actives left: flush the IPMP ill's own ncec's too. */
		ill_t *ipmp_ill = illg->ig_ipmp_ill;

		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
		    (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
	}

	/*
	 * Remove any IRE_IF_CLONE for this ill since they might have
	 * an ire_nce_cache/nce_common which refers to another ill in the group.
	 */
	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
	    ill, ill);

	/*
	 * Finally, mark the group link down, if necessary, using the message
	 * pre-allocated by ipmp_ill_activate() so this cannot fail.
	 */
	if (--grp->gr_nactif == 0) {
		mp = grp->gr_linkdownmp;
		grp->gr_linkdownmp = NULL;
		ASSERT(mp != NULL);
		put(illg->ig_ipmp_ill->ill_rq, mp);
	}
}

/*
 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
 */
static void
ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);

	/*
	 * If `ill' is truly down, there are no messages to generate since:
	 *
	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
	 *    and its addresses by bringing them down.  But that's already
	 *    true, so there's nothing to hide.
	 *
	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
	 *    indicating that any previously-hidden up addresses are again
	 *    back up (along with the interface).  But they aren't, so
	 *    there's nothing to expose.
	 */
	if (ill->ill_ipif_up_count == 0)
		return;

	if (cmd == RTM_ADD)
		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);

	if (cmd == RTM_DELETE)
		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
}

/*
 * Bind the address named by `ipif' to the underlying ill named by `ill'.
 * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
 * will indicate to the resolver whether this is an initial bringup of
 * `ipif', or just a rebind to another ill.
 */
static void
ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
{
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
	ASSERT(ipif->ipif_bound_ill == NULL);
	ASSERT(ipif->ipif_bound_next == NULL);

	/* Push `ipif' onto `ill's singly-linked list of bound ipifs. */
	ipif->ipif_bound_next = ill->ill_bound_ipif;
	ill->ill_bound_ipif = ipif;
	ill->ill_bound_cnt++;
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If necessary, tell ARP/NDP about the new mapping.  Note that
	 * ipif_resolver_up() cannot fail for IPv6 ills.
	 */
	if (act != Res_act_none) {
		if (ill->ill_isv6) {
			VERIFY(ipif_resolver_up(ipif, act) == 0);
			err = ipif_ndp_up(ipif, act == Res_act_initial);
		} else {
			err = ipif_resolver_up(ipif, act);
		}

		/*
		 * Since ipif_ndp_up() never returns EINPROGRESS and
		 * ipif_resolver_up() only returns EINPROGRESS when the
		 * associated ill is not up, we should never be here with
		 * EINPROGRESS.  We rely on this to simplify the design.
		 */
		ASSERT(err != EINPROGRESS);
	}
	/* TODO: retry binding on failure? when? */
	ipif->ipif_bound = (err == 0);
}

/*
 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
 * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
 * B_TRUE, notify the resolver about the change.
 */
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
	ipif_t *previpif;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If necessary, find an ipif to unbind.
	 */
	if (ipif == NULL) {
		if ((ipif = ill->ill_bound_ipif) == NULL) {
			ASSERT(ill->ill_bound_cnt == 0);
			return (NULL);
		}
	}

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(IS_IPMP(ipif->ipif_ill));
	ASSERT(ipif->ipif_bound_ill == ill);
	ASSERT(ill->ill_bound_cnt > 0);

	/*
	 * Unbind it: clear the cross-reference under ipmp_lock, then unlink
	 * `ipif' from `ill's singly-linked bound list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = NULL;
	rw_exit(&ipst->ips_ipmp_lock);
	ill->ill_bound_cnt--;

	if (ill->ill_bound_ipif == ipif) {
		ill->ill_bound_ipif = ipif->ipif_bound_next;
	} else {
		previpif = ill->ill_bound_ipif;
		while (previpif->ipif_bound_next != ipif)
			previpif = previpif->ipif_bound_next;

		previpif->ipif_bound_next = ipif->ipif_bound_next;
	}
	ipif->ipif_bound_next = NULL;

	/*
	 * If requested, notify the resolvers (provided we're bound).
	 */
	if (notifyres && ipif->ipif_bound) {
		if (ill->ill_isv6)
			ipif_ndp_down(ipif);
		else
			(void) ipif_arp_down(ipif);
	}
	ipif->ipif_bound = B_FALSE;

	return (ipif);
}

/*
 * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
 * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
 * to determine whether an ill should be considered active, other consumers
 * may race and learn about an ill that should be deactivated/activated before
 * IPMP has performed the activation/deactivation.  This should be safe though
 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
 * would've been cleaned up by ipmp_ill_deactivate().
 */
boolean_t
ipmp_ill_is_active(ill_t *ill)
{
	phyint_t *phyi = ill->ill_phyint;

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill) ||
	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));

	/*
	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
	 * link flapping logic to be just in in.mpathd and allows us to ignore
	 * changes to PHYI_RUNNING.
	 */
	return (!(ill->ill_ipif_up_count == 0 ||
	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
}

/*
 * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
 * with `ill_arg'.
 */
static void
ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ill != ill)
		return;

	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
		ire->ire_testhidden = B_TRUE;
	}
}

/*
 * IRE walker callback: clear ire_testhidden if the IRE has a source address
 * on `ill_arg'.
 */
static void
ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ill == ill) {
		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
		ire->ire_testhidden = B_FALSE;
	}
}

/*
 * Return a held pointer to the IPMP ill for underlying interface `ill', or
 * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
 * ill_grp pointer may become stale when not inside an IPSQ and not holding
 * ipmp_lock.)
 * Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_hold_ipmp_ill(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;
	ipmp_illgrp_t *illg;

	ASSERT(!IS_IPMP(ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	illg = ill->ill_grp;
	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (illg->ig_ipmp_ill);
	}
	/*
	 * Assume `ill' was removed from the illgrp in the meantime.
	 */
	rw_exit(&ill->ill_ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Return the interface index for the IPMP ill tied to underlying interface
 * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
 */
uint_t
ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
{
	uint_t ifindex = 0;
	ip_stack_t *ipst = ill->ill_ipst;
	ipmp_grp_t *grp;

	ASSERT(!IS_IPMP(ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
		ifindex = grp->gr_phyint->phyint_ifindex;
	rw_exit(&ipst->ips_ipmp_lock);
	return (ifindex);
}

/*
 * Place phyint `phyi' into IPMP group `grp'.
 */
void
ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
{
	ill_t *ill;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs vanished.  (`ill' is left pointing at whichever ill was
	 * handled last; it is used below for the mactype check.)
	 */
	if (phyi->phyint_illv4 != NULL) {
		ill = phyi->phyint_illv4;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	if (phyi->phyint_illv6 != NULL) {
		ill = phyi->phyint_illv6;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	/*
	 * Snapshot the phyint's initial kstats as a baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	/* The first member defines the group's mactype; others must match. */
	phyi->phyint_grp = grp;
	if (++grp->gr_nif == 1)
		grp->gr_mactype = ill->ill_mactype;
	else
		ASSERT(grp->gr_mactype == ill->ill_mactype);

	/*
	 * Now that we're in the group, request a switch to the group's xop
	 * when we ipsq_exit().  All future operations will be exclusive on
	 * the group xop until ipmp_phyint_leave_grp() is called.
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Remove phyint `phyi' from its current IPMP group.
 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
	uint_t i;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
	 */
	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
		ipmp_ill_leave_illgrp(phyi->phyint_illv6);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs have reappeared.
	 */
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);

	/*
	 * Calculate the phyint's cumulative kstats while it was in the group,
	 * and add that to the group's baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi_kstats);
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		phyi_kstats[i] -= phyi->phyint_kstats0[i];
		atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
	}

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp->gr_nif--;
	phyi->phyint_grp = NULL;

	/*
	 * As our final act in leaving the group, request a switch back to our
	 * IPSQ's own xop when we ipsq_exit().
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.  On any
 * failure (e.g., no matching kstat), the array is left zeroed.
 */
static void
ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
{
	uint_t i, j;
	const char *name;
	kstat_t *ksp;
	kstat_named_t *kn;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	zoneid_t zoneid;

	bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
	zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
	ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
	if (ksp == NULL)
		return;

	KSTAT_ENTER(ksp);

	if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
		/*
		 * Bring kstats up-to-date before recording.
		 */
		(void) KSTAT_UPDATE(ksp, KSTAT_READ);

		/*
		 * For each IPMP kstat name, find the matching named kstat
		 * and widen its value to uint64_t regardless of its type.
		 */
		kn = KSTAT_NAMED_PTR(ksp);
		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			name = ipmp_kstats[i].name;
			kstats[i] = 0;
			for (j = 0; j < ksp->ks_ndata; j++) {
				if (strcmp(kn[j].name, name) != 0)
					continue;

				switch (kn[j].data_type) {
				case KSTAT_DATA_INT32:
				case KSTAT_DATA_UINT32:
					kstats[i] = kn[j].value.ui32;
					break;
#ifdef	_LP64
				case KSTAT_DATA_LONG:
				case KSTAT_DATA_ULONG:
					kstats[i] = kn[j].value.ul;
					break;
#endif
				case KSTAT_DATA_INT64:
				case KSTAT_DATA_UINT64:
					kstats[i] = kn[j].value.ui64;
					break;
				}
				break;
			}
		}
	}

	KSTAT_EXIT(ksp);
	kstat_rele(ksp);
}

/*
 * Refresh the active state of all ills on `phyi'.
 */
void
ipmp_phyint_refresh_active(phyint_t *phyi)
{
	if (phyi->phyint_illv4 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL)
		ipmp_ill_refresh_active(phyi->phyint_illv6);
}

/*
 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
{
	ill_t *boundill;
	ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

	ASSERT(IS_IPMP(ipif->ipif_ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	boundill = ipif->ipif_bound_ill;
	if (boundill != NULL && ill_check_and_refhold(boundill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (boundill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller must be inside the IPSQ.
 */
ill_t *
ipmp_ipif_bound_ill(const ipif_t *ipif)
{
	ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
	ASSERT(IS_IPMP(ipif->ipif_ill));

	return (ipif->ipif_bound_ill);
}

/*
 * Check if `ipif' is a "stub" (placeholder address not being used):
 * i.e., it is down and has the unspecified address.
 */
boolean_t
ipmp_ipif_is_stubaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_UP)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr == INADDR_ANY);
}

/*
 * Check if `ipif' is an IPMP data address: a non-IPIF_NOFAILOVER address
 * that is not the unspecified address.
 */
boolean_t
ipmp_ipif_is_dataaddr(const ipif_t *ipif)
{
	if (ipif->ipif_flags & IPIF_NOFAILOVER)
		return (B_FALSE);
	if (ipif->ipif_ill->ill_isv6)
		return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
	else
		return (ipif->ipif_lcl_addr != INADDR_ANY);
}

/*
 * Check if `ipif' is an IPIF_UP IPMP data address.
 */
static boolean_t
ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
{
	return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
}

/*
 * Check if `mp' contains a probe packet by verifying if the IP source address
 * is a test address on an underlying interface `ill'.  Caller need not be
 * inside the IPSQ.
 */
boolean_t
ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
{
	/* Both overlays point at the same IP header; only one is valid. */
	ip6_t *ip6h = (ip6_t *)mp->b_rptr;
	ipha_t *ipha = (ipha_t *)mp->b_rptr;

	ASSERT(DB_TYPE(mp) != M_CTL);

	/* Probes are only sent over interfaces under an IPMP group. */
	if (!IS_UNDER_IPMP(ill))
		return (B_FALSE);

	/*
	 * A probe has a specified source address that matches a test
	 * address configured on `ill'.
	 */
	if (ill->ill_isv6) {
		if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
		    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
			return (B_TRUE);
	} else {
		if ((ipha->ipha_src != INADDR_ANY) &&
		    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
			return (B_TRUE);
	}
	return (B_FALSE);
}

/*
 * Pick out an appropriate underlying interface for packet transmit.  This
 * function may be called from the data path, so we need to verify that the
 * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
 * Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
{
	ill_t *xmit_ill;
	ip_stack_t *ipst = ill->ill_ipst;

	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	if (ill->ill_grp == NULL) {
		/*
		 * The interface was taken out of the group.  Return ill
		 * itself, but take a ref so that callers will always be able
		 * to do ill_refrele(ill);
		 */
		rw_exit(&ipst->ips_ill_g_lock);
		ill_refhold(ill);
		return (ill);
	}
	/* Multicast/broadcast uses the group's cast ill; unicast rotates. */
	if (!is_unicast)
		xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
	else
		xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
	rw_exit(&ipst->ips_ill_g_lock);
	return (xmit_ill);
}

/*
 * Flush out any nce that points at `ncec' from an underlying interface.
 */
void
ipmp_ncec_flush_nce(ncec_t *ncec)
{
	ill_t		*ncec_ill = ncec->ncec_ill;
	ill_t		*ill;
	ipmp_illgrp_t	*illg;
	ip_stack_t	*ipst = ncec_ill->ill_ipst;
	list_t		dead;
	nce_t		*nce;

	/* Only IPMP meta-interface ncec's have per-underlying-ill nce's. */
	if (!IS_IPMP(ncec_ill))
		return;

	illg = ncec_ill->ill_grp;
	list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));

	/*
	 * Collect the doomed nce's onto `dead' while ill_g_lock keeps the
	 * group membership from changing.
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		nce_fastpath_list_delete(ill, ncec, &dead);
	}
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * we may now nce_refrele() all dead entries since all locks have been
	 * dropped.
	 */
	while ((nce = list_head(&dead)) != NULL) {
		list_remove(&dead, nce);
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&dead));
	list_destroy(&dead);
}

/*
 * For each interface in the IPMP group, if there are nce_t entries for the IP
 * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
 * information must be updated to match the link-layer address information in
 * `ncec'.
 */
void
ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
{
	ill_t		*ill;
	ipmp_illgrp_t	*illg = ipmp_ill->ill_grp;
	ip_stack_t	*ipst = ipmp_ill->ill_ipst;
	nce_t		*nce, *nce_next;
	list_t		replace;

	ASSERT(IS_IPMP(ipmp_ill));

	/*
	 * if ncec itself is not reachable, there is no use in creating nce_t
	 * entries on the underlying interfaces in the group.
	 */
	if (!NCE_ISREACHABLE(ncec))
		return;

	list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	ill = list_head(&illg->ig_actif);
	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
		/*
		 * For each underlying interface, we first check if there is an
		 * nce_t for the address in ncec->ncec_addr.  If one exists,
		 * we should trigger nce_fastpath for that nce_t.  However, the
		 * catch is that we are holding the ips_ipmp_lock to prevent
		 * changes to the IPMP group membership, so that we cannot
		 * putnext() to the driver.  So we nce_delete the
		 * list nce_t entries that need to be updated into the
		 * `replace' list, and then process the `replace' list
		 * after dropping the ips_ipmp_lock.
		 */
		mutex_enter(&ill->ill_lock);
		for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
			nce_next = list_next(&ill->ill_nce, nce);
			if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
			    &ncec->ncec_addr)) {
				nce = nce_next;
				continue;
			}
			/*
			 * The hold taken here keeps the nce alive on the
			 * `replace' list after nce_delete() unlinks it; it is
			 * released once the nce has been re-added below.
			 */
			nce_refhold(nce);
			nce_delete(nce);
			list_insert_tail(&replace, nce);
			nce = nce_next;
		}
		mutex_exit(&ill->ill_lock);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	/*
	 * `replace' now has the list of nce's on which we should be triggering
	 * nce_fastpath().  We now retrigger fastpath by setting up the nce
	 * again.  The code in nce_lookup_then_add_v* ensures that nce->nce_ill
	 * is still in the group for ncec->ncec_ill
	 */
	while ((nce = list_head(&replace)) != NULL) {
		list_remove(&replace, nce);
		if (ncec->ncec_ill->ill_isv6) {
			(void) nce_lookup_then_add_v6(nce->nce_ill,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
			    &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
			    NULL);
		} else {
			ipaddr_t ipaddr;

			/* v4 addresses are stored v4-mapped in the ncec */
			IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
			(void) nce_lookup_then_add_v4(nce->nce_ill,
			    ncec->ncec_lladdr, ncec->ncec_lladdr_length,
			    &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
		}
		nce_refrele(nce);
	}
	ASSERT(list_is_empty(&replace));
	list_destroy(&replace);
}