/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_multi.h>
#include <inet/ip_ndp.h>
#include <inet/ip_rts.h>
#include <inet/mi.h>
#include <net/if_types.h>
#include <sys/dlpi.h>
#include <sys/kmem.h>
#include <sys/modhash.h>
#include <sys/sdt.h>
#include <sys/strsun.h>
#include <sys/sunddi.h>
#include <sys/types.h>

/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */

/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};

static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);

/*
 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
 */
void
ipmp_init(ip_stack_t *ipst)
{
	ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash",
	    IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
	    mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP);
	rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0);
}

/*
 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini().
 */
void
ipmp_destroy(ip_stack_t *ipst)
{
	mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash);
	rw_destroy(&ipst->ips_ipmp_lock);
}

/*
 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi',
 * and add it to the hash.  On success, return a pointer to the created group.
 * Caller must ensure `grname' is not yet in the hash.  Assumes that the IPMP
 * meta-interface associated with the group also has the same name (but they
 * may differ later via ipmp_grp_rename()).
 */
ipmp_grp_t *
ipmp_grp_create(const char *grname, phyint_t *phyi)
{
	ipmp_grp_t *grp;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	mod_hash_hndl_t mh;

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	(void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname));

	/*
	 * Cache the group's phyint.  This is safe since a phyint_t will
	 * outlive its ipmp_grp_t.
	 */
	grp->gr_phyint = phyi;

	/*
	 * Create IPMP group kstats.
	 */
	if (ipmp_grp_create_kstats(grp) != 0) {
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}

	/*
	 * Insert the group into the hash.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) {
		ipmp_grp_destroy_kstats(grp);
		kmem_free(grp, sizeof (ipmp_grp_t));
		return (NULL);
	}
	ipmp_grp_insert(grp, mh);

	return (grp);
}

/*
 * Create IPMP kstat structures for `grp'.  Return an errno upon failure.
 */
static int
ipmp_grp_create_kstats(ipmp_grp_t *grp)
{
	kstat_t *ksp;
	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

	ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net",
	    KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id);
	if (ksp == NULL)
		return (ENOMEM);

	ksp->ks_update = ipmp_grp_update_kstats;
	ksp->ks_private = grp;
	bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats));

	kstat_install(ksp);
	grp->gr_ksp = ksp;
	return (0);
}

/*
 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework.
 */
static int
ipmp_grp_update_kstats(kstat_t *ksp, int rw)
{
	uint_t i;
	kstat_named_t *kn = KSTAT_NAMED_PTR(ksp);
	ipmp_grp_t *grp = ksp->ks_private;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);
	ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	phyint_t *phyi;
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	if (rw == KSTAT_WRITE)
		return (EACCES);

	/*
	 * Start with the group's baseline values.
	 */
	for (i = 0; i < IPMP_KSTAT_MAX; i++) {
		if (kn[i].data_type == KSTAT_DATA_UINT32) {
			kn[i].value.ui32 = grp->gr_kstats0[i];
		} else {
			ASSERT(kn[i].data_type == KSTAT_DATA_UINT64);
			kn[i].value.ui64 = grp->gr_kstats0[i];
		}
	}

	/*
	 * Add in the stats of each phyint currently in the group.  Since we
	 * don't directly track the phyints in a group, we cheat by walking
	 * the IPSQ set under ill_g_lock.  (The IPSQ list cannot change while
	 * ill_g_lock is held.)
	 */
	rw_enter(&ipst->ips_ill_g_lock, RW_READER);
	ipsq = grp_ipsq->ipsq_next;
	for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) {
		phyi = ipsq->ipsq_phyint;

		/*
		 * If a phyint in a group is being unplumbed, it's possible
		 * that ill_glist_delete() -> phyint_free() already freed the
		 * phyint (and set ipsq_phyint to NULL), but the unplumb
		 * operation has yet to complete (and thus ipsq_dq() has yet
		 * to remove the phyint's IPSQ from the group IPSQ's phyint
		 * list).  We skip those phyints here (note that their kstats
		 * have already been added to gr_kstats0[]).
		 */
		if (phyi == NULL)
			continue;

		ipmp_phyint_get_kstats(phyi, phyi_kstats);

		for (i = 0; i < IPMP_KSTAT_MAX; i++) {
			phyi_kstats[i] -= phyi->phyint_kstats0[i];
			if (kn[i].data_type == KSTAT_DATA_UINT32)
				kn[i].value.ui32 += phyi_kstats[i];
			else
				kn[i].value.ui64 += phyi_kstats[i];
		}
	}

	kn[IPMP_KSTAT_LINK_UP].value.ui32 =
	    (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0;

	rw_exit(&ipst->ips_ill_g_lock);
	return (0);
}

/*
 * Destroy IPMP kstat structures for `grp'.
 */
static void
ipmp_grp_destroy_kstats(ipmp_grp_t *grp)
{
	netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid;

	kstat_delete_netstack(grp->gr_ksp, id);
	bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0));
	grp->gr_ksp = NULL;
}

/*
 * Look up an IPMP group named `grname' on IP stack `ipst'.  Return NULL if it
 * does not exist.
 */
ipmp_grp_t *
ipmp_grp_lookup(const char *grname, ip_stack_t *ipst)
{
	ipmp_grp_t *grp;

	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
	    (mod_hash_val_t *)&grp) == 0)
		return (grp);

	return (NULL);
}

/*
 * Place information about group `grp' into `lifgr'.
 */
void
ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr)
{
	ill_t *ill;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	lifgr->gi_v4 = (grp->gr_v4 != NULL);
	lifgr->gi_v6 = (grp->gr_v6 != NULL);
	lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4;
	lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6;
	lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP;
	(void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ);
	lifgr->gi_m4ifname[0] = '\0';
	lifgr->gi_m6ifname[0] = '\0';
	lifgr->gi_bcifname[0] = '\0';

	if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) {
		(void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ);
		(void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ);
	}

	if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL)
		(void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ);
}

/*
 * Insert `grp' into the hash using the reserved hash entry `mh'.
 * Caller must ensure `grp' is not yet in the hash.
 */
static void
ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh)
{
	int err;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * Since grp->gr_name will exist at least as long as `grp' is in the
	 * hash, we use it directly as the key.
	 */
	err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash,
	    (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh);
	if (err != 0) {
		/*
		 * This should never happen since `mh' was preallocated.
		 */
		panic("cannot insert IPMP group \"%s\" (err %d)",
		    grp->gr_name, err);
	}
}

/*
 * Remove `grp' from the hash.  Caller must ensure `grp' is in it.
 */
static void
ipmp_grp_remove(ipmp_grp_t *grp)
{
	int err;
	mod_hash_val_t val;
	mod_hash_key_t key = (mod_hash_key_t)grp->gr_name;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val);
	if (err != 0 || val != grp) {
		panic("cannot remove IPMP group \"%s\" (err %d)",
		    grp->gr_name, err);
	}
}

/*
 * Attempt to rename `grp' to new name `grname'.  Return an errno if the new
 * group name already exists or is invalid, or if there isn't enough memory.
 */
int
ipmp_grp_rename(ipmp_grp_t *grp, const char *grname)
{
	mod_hash_hndl_t mh;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (grname[0] == '\0')
		return (EINVAL);

	if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname,
	    (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND)
		return (EEXIST);

	/*
	 * Before we remove the group from the hash, ensure we'll be able to
	 * re-insert it by reserving space.
	 */
	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
		return (ENOMEM);

	ipmp_grp_remove(grp);
	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
	ipmp_grp_insert(grp, mh);

	return (0);
}

/*
 * Destroy `grp' and remove it from the hash.  Caller must ensure `grp' is in
 * the hash, and that there are no interfaces on it.
 */
void
ipmp_grp_destroy(ipmp_grp_t *grp)
{
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	/*
	 * If there are still interfaces using this group, panic before things
	 * go really off the rails.
	 */
	if (grp->gr_nif != 0)
		panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name);

	ipmp_grp_remove(grp);
	ipmp_grp_destroy_kstats(grp);

	ASSERT(grp->gr_v4 == NULL);
	ASSERT(grp->gr_v6 == NULL);
	ASSERT(grp->gr_nv4 == 0);
	ASSERT(grp->gr_nv6 == 0);
	ASSERT(grp->gr_nactif == 0);
	ASSERT(grp->gr_linkdownmp == NULL);
	grp->gr_phyint = NULL;

	kmem_free(grp, sizeof (ipmp_grp_t));
}

/*
 * Check whether `ill' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  NOTE: many of these errno values
 * are interpreted by ifconfig, which will take corrective action and retry
 * the SIOCSLIFGROUPNAME, so please exercise care when changing them.
 */
static int
ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill)
{
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * To sidestep complicated address migration logic in the kernel and
	 * to force the kernel's all-hosts multicast memberships to be blown
	 * away, all addresses that had been brought up must be brought back
	 * down prior to adding an interface to a group.  (This includes
	 * addresses currently down due to DAD.)  Once the interface has been
	 * added to the group, its addresses can then be brought back up, at
	 * which point they will be moved to the IPMP meta-interface.
	 * NOTE: we do this before ill_appaddr_cnt() since bringing down the
	 * link-local causes in.ndpd to remove its ADDRCONF'd addresses.
	 */
	if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0)
		return (EADDRINUSE);

	/*
	 * To avoid confusing applications by changing addresses that are
	 * under their control, all such control must be removed prior to
	 * adding an interface into a group.
	 */
	if (ill_appaddr_cnt(ill) != 0)
		return (EADDRNOTAVAIL);

	/*
	 * Since PTP addresses do not share the same broadcast domain, they
	 * are not allowed to be in an IPMP group.
	 */
	if (ill_ptpaddr_cnt(ill) != 0)
		return (EINVAL);

	/*
	 * An ill must support multicast to be allowed into a group.
	 */
	if (!(ill->ill_flags & ILLF_MULTICAST))
		return (ENOTSUP);

	/*
	 * An ill must strictly be using ARP and/or ND for address
	 * resolution for it to be allowed into a group.
	 */
	if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP))
		return (ENOTSUP);

	/*
	 * An ill cannot also be using usesrc groups.  (Although usesrc uses
	 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does
	 * all its modifications as writer.)
	 */
	if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill))
		return (ENOTSUP);

	/*
	 * All ills in a group must be the same mactype.
	 */
	if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype)
		return (EINVAL);

	return (0);
}

/*
 * Check whether `phyi' is suitable for inclusion into `grp', and return an
 * errno describing the problem (if any).  See comment above
 * ipmp_grp_vet_ill() regarding errno values.
 */
int
ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi)
{
	int err = 0;
	ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp);

	ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq));
	ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock));

	/*
	 * An interface cannot have address families plumbed that are not
	 * configured in the group.
	 */
	if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL ||
	    phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL)
		return (EAFNOSUPPORT);

	if (phyi->phyint_illv4 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4);
	if (err == 0 && phyi->phyint_illv6 != NULL)
		err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6);

	return (err);
}

/*
 * Create a new illgrp on IPMP meta-interface `ill'.
 */
ipmp_illgrp_t *
ipmp_illgrp_create(ill_t *ill)
{
	uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU;
	ipmp_illgrp_t *illg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_IPMP(ill));
	ASSERT(ill->ill_grp == NULL);

	if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL)
		return (NULL);

	list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode));
	list_create(&illg->ig_actif, sizeof (ill_t),
	    offsetof(ill_t, ill_actnode));
	list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t),
	    offsetof(ipmp_arpent_t, ia_node));

	illg->ig_ipmp_ill = ill;
	ill->ill_grp = illg;
	ipmp_illgrp_set_mtu(illg, mtu);

	return (illg);
}

/*
 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface.
 */
void
ipmp_illgrp_destroy(ipmp_illgrp_t *illg)
{
	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));
	ASSERT(IS_IPMP(illg->ig_ipmp_ill));

	/*
	 * Verify `illg' is empty.
	 */
	ASSERT(illg->ig_next_ill == NULL);
	ASSERT(illg->ig_cast_ill == NULL);
	ASSERT(list_is_empty(&illg->ig_arpent));
	ASSERT(list_is_empty(&illg->ig_if));
	ASSERT(list_is_empty(&illg->ig_actif));
	ASSERT(illg->ig_nactif == 0);

	/*
	 * Destroy `illg'.
	 */
	illg->ig_ipmp_ill->ill_grp = NULL;
	illg->ig_ipmp_ill = NULL;
	list_destroy(&illg->ig_if);
	list_destroy(&illg->ig_actif);
	list_destroy(&illg->ig_arpent);
	kmem_free(illg, sizeof (ipmp_illgrp_t));
}

/*
 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to
 * bind it to an underlying ill, while keeping an even address distribution.
 * If the bind is successful, return a pointer to the bound ill.
 */
ill_t *
ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t *minill;
	ipmp_arpent_t *entp;

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(ipmp_ipif_is_dataaddr(ipif));

	/*
	 * IPMP data address mappings are internally managed by IP itself, so
	 * delete any existing ARP entries associated with the address.
	 */
	if (!ipif->ipif_isv6) {
		entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr);
		if (entp != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);
	}

	if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
		ipmp_ill_bind_ipif(minill, ipif, Res_act_none);

	return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL);
}

/*
 * Delete `ipif' from the pool of usable data addresses on `illg'.  If it's
 * bound, unbind it from the underlying ill while keeping an even address
 * distribution.
 */
void
ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif)
{
	ill_t *maxill, *boundill = ipif->ipif_bound_ill;

	ASSERT(IAM_WRITER_IPIF(ipif));

	if (boundill != NULL) {
		(void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE);

		maxill = ipmp_illgrp_max_ill(illg);
		if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) {
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind);
		}
	}
}

/*
 * Return the active ill with the greatest number of data addresses in `illg'.
 */
static ill_t *
ipmp_illgrp_max_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill, *bestill = NULL;

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	ill = list_head(&illg->ig_actif);
	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
		if (bestill == NULL ||
		    ill->ill_bound_cnt > bestill->ill_bound_cnt) {
			bestill = ill;
		}
	}
	return (bestill);
}

/*
 * Return the active ill with the fewest number of data addresses in `illg'.
 */
static ill_t *
ipmp_illgrp_min_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill, *bestill = NULL;

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	ill = list_head(&illg->ig_actif);
	for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
		if (bestill == NULL ||
		    ill->ill_bound_cnt < bestill->ill_bound_cnt) {
			if (ill->ill_bound_cnt == 0)
				return (ill);	/* can't get better */
			bestill = ill;
		}
	}
	return (bestill);
}

/*
 * Return a pointer to IPMP meta-interface for `illg' (which must exist).
 * Since ig_ipmp_ill never changes for a given illg, no locks are needed.
 */
ill_t *
ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg)
{
	return (illg->ig_ipmp_ill);
}

/*
 * Return a pointer to the next available underlying ill in `illg', or NULL if
 * one doesn't exist.  Caller must be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_next_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if ((ill = illg->ig_next_ill) != NULL) {
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (ill);
}

/*
 * Return a held pointer to the next available underlying ill in `illg', or
 * NULL if one doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	uint_t i;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	for (i = 0; i < illg->ig_nactif; i++) {
		ill = illg->ig_next_ill;
		illg->ig_next_ill = list_next(&illg->ig_actif, ill);
		if (illg->ig_next_ill == NULL)
			illg->ig_next_ill = list_head(&illg->ig_actif);

		if (ill_check_and_refhold(ill)) {
			rw_exit(&ipst->ips_ipmp_lock);
			return (ill);
		}
	}
	rw_exit(&ipst->ips_ipmp_lock);

	return (NULL);
}

/*
 * Return a held pointer to the nominated multicast ill in `illg', or NULL if
 * one doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg)
{
	ill_t *castill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	castill = illg->ig_cast_ill;
	if (castill != NULL && ill_check_and_refhold(castill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (castill);
	}
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Callback routine for ncec_walk() that deletes `ncec' if it is associated
 * with `(ill_t *)arg' and is not one of the local addresses.  Caller must be
 * inside the IPSQ.
 */
static void
ipmp_ncec_delete_nonlocal(ncec_t *ncec, uchar_t *arg)
{
	if ((ncec != NULL) && !NCE_MYADDR(ncec) &&
	    ncec->ncec_ill == (ill_t *)arg) {
		ncec_delete(ncec);
	}
}

/*
 * Set the nominated cast ill on `illg' to `castill'.  If `castill' is NULL,
 * any existing nomination is removed.  Caller must be inside the IPSQ.
 */
static void
ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill)
{
	ill_t *ocastill = illg->ig_cast_ill;
	ill_t *ipmp_ill = illg->ig_ipmp_ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	/*
	 * Disable old nominated ill (if any).
	 */
	if (ocastill != NULL) {
		DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *,
		    illg, ill_t *, ocastill);
		ASSERT(ocastill->ill_nom_cast);
		ocastill->ill_nom_cast = B_FALSE;
		/*
		 * If the IPMP meta-interface is down, we never did the join,
		 * so we must not try to leave.
		 */
		if (ipmp_ill->ill_dl_up)
			ill_leave_multicast(ipmp_ill);

		/*
		 * Delete any NCEs tied to the old nomination.  We must do
		 * this last since ill_leave_multicast() may trigger IREs to
		 * be built using ig_cast_ill.
		 */
		ncec_walk(ocastill, (pfi_t)ipmp_ncec_delete_nonlocal, ocastill,
		    ocastill->ill_ipst);
	}

	/*
	 * Set new nomination.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	illg->ig_cast_ill = castill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Enable new nominated ill (if any).
	 */
	if (castill != NULL) {
		DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *,
		    illg, ill_t *, castill);
		ASSERT(!castill->ill_nom_cast);
		castill->ill_nom_cast = B_TRUE;
		/*
		 * If the IPMP meta-interface is down, the attempt to recover
		 * will silently fail but ill_need_recover_multicast will be
		 * erroneously cleared -- so check first.
		 */
		if (ipmp_ill->ill_dl_up)
			ill_recover_multicast(ipmp_ill);
	}
}

/*
 * Create an IPMP ARP entry and add it to the set tracked on `illg'.  If an
 * entry for the same IP address already exists, destroy it first.  Return the
 * created IPMP ARP entry, or NULL on failure.
 */
ipmp_arpent_t *
ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, boolean_t proxyarp,
    ipaddr_t ipaddr, uchar_t *lladdr, size_t lladdr_len, uint16_t flags)
{
	ipmp_arpent_t *entp, *oentp;

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	if ((entp = kmem_alloc(sizeof (ipmp_arpent_t) + lladdr_len,
	    KM_NOSLEEP)) == NULL)
		return (NULL);

	/*
	 * Delete any existing ARP entry for this address.
	 */
	if ((oentp = ipmp_illgrp_lookup_arpent(illg, &ipaddr)) != NULL)
		ipmp_illgrp_destroy_arpent(illg, oentp);

	/*
	 * Prepend the new entry.
	 */
	entp->ia_ipaddr = ipaddr;
	entp->ia_flags = flags;
	entp->ia_lladdr_len = lladdr_len;
	entp->ia_lladdr = (uchar_t *)&entp[1];
	bcopy(lladdr, entp->ia_lladdr, lladdr_len);
	entp->ia_proxyarp = proxyarp;
	entp->ia_notified = B_TRUE;
	list_insert_head(&illg->ig_arpent, entp);
	return (entp);
}

/*
 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it.
 */
void
ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	list_remove(&illg->ig_arpent, entp);
	kmem_free(entp, sizeof (ipmp_arpent_t) + entp->ia_lladdr_len);
}

/*
 * Mark that ARP has been notified about the IP address on `entp'; `illg' is
 * taken as a debugging aid for DTrace FBT probes.
 */
/* ARGSUSED */
void
ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp)
{
	entp->ia_notified = B_TRUE;
}

/*
 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is
 * NULL, any IPMP ARP entry is requested.  Return NULL if it does not exist.
 */
ipmp_arpent_t *
ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp)
{
	ipmp_arpent_t *entp = list_head(&illg->ig_arpent);

	ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill));

	if (addrp == NULL)
		return (entp);

	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp))
		if (entp->ia_ipaddr == *addrp)
			break;
	return (entp);
}

/*
 * Refresh ARP entries on `illg' to be distributed across its active
 * interfaces.  Entries that cannot be refreshed (e.g., because there are no
 * active interfaces) are marked so that subsequent calls can try again.
 */
void
ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg)
{
	ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill;
	uint_t paddrlen = ipmp_ill->ill_phys_addr_length;
	ipmp_arpent_t *entp;
	ncec_t *ncec;
	nce_t *nce;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));
	ASSERT(!ipmp_ill->ill_isv6);

	ill = list_head(&illg->ig_actif);
	entp = list_head(&illg->ig_arpent);
	for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) {
		if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) {
			entp->ia_notified = B_FALSE;
			continue;
		}

		ASSERT(paddrlen == ill->ill_phys_addr_length);

		/*
		 * If this is a proxy ARP entry, we can skip notifying ARP if
		 * the entry is already up-to-date.  If it has changed, we
		 * update the entry's hardware address before notifying ARP.
		 */
		if (entp->ia_proxyarp) {
			if (bcmp(ill->ill_phys_addr, entp->ia_lladdr,
			    paddrlen) == 0 && entp->ia_notified)
				continue;
			bcopy(ill->ill_phys_addr, entp->ia_lladdr, paddrlen);
		}

		(void) nce_lookup_then_add_v4(ipmp_ill, entp->ia_lladdr,
		    paddrlen, &entp->ia_ipaddr, entp->ia_flags, ND_UNCHANGED,
		    &nce);
		if (nce == NULL || !entp->ia_proxyarp) {
			if (nce != NULL)
				nce_refrele(nce);
			continue;
		}
		ncec = nce->nce_common;
		mutex_enter(&ncec->ncec_lock);
		nce_update(ncec, ND_UNCHANGED, ill->ill_phys_addr);
		mutex_exit(&ncec->ncec_lock);
		nce_refrele(nce);
		ipmp_illgrp_mark_arpent(illg, entp);

		if ((ill = list_next(&illg->ig_actif, ill)) == NULL)
			ill = list_head(&illg->ig_actif);
	}
}

/*
 * Return an interface in `illg' with the specified `physaddr', or NULL if one
 * doesn't exist.  Caller must hold ill_g_lock if it's not inside the IPSQ.
 */
ill_t *
ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen)
{
	ill_t *ill;
	ill_t *ipmp_ill = illg->ig_ipmp_ill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock));

	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		if (ill->ill_phys_addr_length == paddrlen &&
		    bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0)
			return (ill);
	}
	return (NULL);
}

/*
 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND.
 * Caller must be inside the IPSQ unless this is initialization.
 */
static void
ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu)
{
	ill_t *ill = illg->ig_ipmp_ill;
	mblk_t *mp;

	ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill));

	/*
	 * If allocation fails, we have bigger problems than MTU.
	 */
	if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) {
		illg->ig_mtu = mtu;
		put(ill->ill_rq, mp);
	}
}

/*
 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP
 * ill MTU if necessary.
 */
void
ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg)
{
	ill_t *ill;
	ill_t *ipmp_ill = illg->ig_ipmp_ill;
	uint_t mtu = 0;

	ASSERT(IAM_WRITER_ILL(ipmp_ill));

	/*
	 * Since ill_mtu can only change under ill_lock, we hold ill_lock
	 * for each ill as we iterate through the list.  Any changes to the
	 * ill_mtu will also trigger an update, so even if we missed it
	 * this time around, the update will catch it.
	 */
	ill = list_head(&illg->ig_if);
	for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
		mutex_enter(&ill->ill_lock);
		if (mtu == 0 || ill->ill_mtu < mtu)
			mtu = ill->ill_mtu;
		mutex_exit(&ill->ill_lock);
	}

	/*
	 * MTU must be at least the minimum MTU.
	 */
	mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU);

	if (illg->ig_mtu != mtu)
		ipmp_illgrp_set_mtu(illg, mtu);
}

/*
 * Link illgrp `illg' to IPMP group `grp'.  To simplify the caller, silently
 * allow the same link to be established more than once.
 */
void
ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp)
{
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (illg->ig_ipmp_ill->ill_isv6) {
		ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg);
		grp->gr_v6 = illg;
	} else {
		ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg);
		grp->gr_v4 = illg;
	}
}

/*
 * Unlink illgrp `illg' from its IPMP group.  Return an errno if the illgrp
 * cannot be unlinked (e.g., because there are still interfaces using it).
 */
int
ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg)
{
	ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock));

	if (illg->ig_ipmp_ill->ill_isv6) {
		if (grp->gr_nv6 + grp->gr_pendv6 != 0)
			return (EBUSY);
		grp->gr_v6 = NULL;
	} else {
		if (grp->gr_nv4 + grp->gr_pendv4 != 0)
			return (EBUSY);
		grp->gr_v4 = NULL;
	}
	return (0);
}

/*
 * Place `ill' into `illg', and rebalance the data addresses on `illg'
 * to be spread evenly across the ills now in it.  Also, adjust the IPMP
 * ill as necessary to account for `ill' (e.g., MTU).
 */
void
ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ip_stack_t *ipst = ill->ill_ipst;

	/* IS_UNDER_IPMP() requires ill_grp to be non-NULL */
	ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL);
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(ill->ill_grp == NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Account for `ill' joining the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6++;
	else
		ill->ill_phyint->phyint_grp->gr_nv4++;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Ensure the ILLF_ROUTER flag remains consistent across the group.
	 */
	mutex_enter(&ill->ill_lock);
	if (ipmp_ill->ill_flags & ILLF_ROUTER)
		ill->ill_flags |= ILLF_ROUTER;
	else
		ill->ill_flags &= ~ILLF_ROUTER;
	mutex_exit(&ill->ill_lock);

	/*
	 * Blow away all multicast memberships that currently exist on `ill'.
	 * This may seem odd, but it's consistent with the application view
	 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()).
	 */
	update_conn_ill(ill, ill->ill_ipst);
	if (ill->ill_isv6) {
		reset_mrt_ill(ill);
	} else {
		ipif = ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next) {
			reset_mrt_vif_ipif(ipif);
		}
	}
	ip_purge_allmulti(ill);

	/*
	 * Borrow the first ill's ill_phys_addr_length value for the illgrp's
	 * physical address length.  All other ills must have the same value,
	 * since they are required to all be the same mactype.  Also update
	 * the IPMP ill's MTU and CoS marking, if necessary.
	 */
	if (list_is_empty(&illg->ig_if)) {
		ASSERT(ipmp_ill->ill_phys_addr_length == 0);
		/*
		 * NOTE: we leave ill_phys_addr NULL since the IPMP group
		 * doesn't have a physical address.  This means that code must
		 * not assume that ill_phys_addr is non-NULL just because
		 * ill_phys_addr_length is non-zero.  Likewise for ill_nd_lla.
		 */
		ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length;
		ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length;
		ipmp_ill->ill_type = ill->ill_type;

		if (ill->ill_flags & ILLF_COS_ENABLED) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	} else {
		ASSERT(ipmp_ill->ill_phys_addr_length ==
		    ill->ill_phys_addr_length);
		ASSERT(ipmp_ill->ill_type == ill->ill_type);

		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			mutex_enter(&ipmp_ill->ill_lock);
			ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
			mutex_exit(&ipmp_ill->ill_lock);
		}
		if (illg->ig_mtu > ill->ill_mtu)
			ipmp_illgrp_set_mtu(illg, ill->ill_mtu);
	}

	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_insert_tail(&illg->ig_if, ill);
	ill->ill_grp = illg;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Hide the IREs on `ill' so that we don't accidentally find them when
	 * sending data traffic.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill);

	ipmp_ill_refresh_active(ill);
}

/*
 * Remove `ill' from its illgrp, and rebalance the data addresses in that
 * illgrp to be spread evenly across the remaining ills.  Also, adjust the
 * IPMP ill as necessary now that `ill' is removed (e.g., MTU).
 */
void
ipmp_ill_leave_illgrp(ill_t *ill)
{
	ill_t *ipmp_ill;
	ipif_t *ipif;
	ipmp_arpent_t *entp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(illg != NULL);

	ipmp_ill = illg->ig_ipmp_ill;

	/*
	 * Cancel IPMP-specific ill timeouts.
	 */
	(void) untimeout(ill->ill_refresh_tid);

	/*
	 * Expose any previously-hidden IREs on `ill'.
	 */
	ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill);

	/*
	 * Ensure the multicast state for each ipif on `ill' is down so that
	 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin
	 * all eligible groups.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_down(ipif);

	/*
	 * Account for `ill' leaving the illgrp.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	if (ill->ill_isv6)
		ill->ill_phyint->phyint_grp->gr_nv6--;
	else
		ill->ill_phyint->phyint_grp->gr_nv4--;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Pull `ill' out of the interface lists.
	 */
	if (list_link_active(&ill->ill_actnode))
		ipmp_ill_deactivate(ill);
	rw_enter(&ipst->ips_ill_g_lock, RW_WRITER);
	list_remove(&illg->ig_if, ill);
	ill->ill_grp = NULL;
	rw_exit(&ipst->ips_ill_g_lock);

	/*
	 * Re-establish multicast memberships that were previously being
	 * handled by the IPMP meta-interface.
	 */
	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ipif_multicast_up(ipif);

	/*
	 * Refresh the group MTU based on the new interface list.
	 */
	ipmp_illgrp_refresh_mtu(illg);

	if (list_is_empty(&illg->ig_if)) {
		/*
		 * No ills left in the illgrp; we no longer have a physical
		 * address length, nor can we support ARP, CoS, or anything
		 * else that depends on knowing the link layer type.
		 */
		while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL)
			ipmp_illgrp_destroy_arpent(illg, entp);

		ipmp_ill->ill_phys_addr_length = 0;
		ipmp_ill->ill_nd_lla_len = 0;
		ipmp_ill->ill_type = IFT_OTHER;
		mutex_enter(&ipmp_ill->ill_lock);
		ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED;
		mutex_exit(&ipmp_ill->ill_lock);
	} else {
		/*
		 * If `ill' didn't support CoS, see if it can now be enabled.
		 */
		if (!(ill->ill_flags & ILLF_COS_ENABLED)) {
			ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED));

			ill = list_head(&illg->ig_if);
			do {
				if (!(ill->ill_flags & ILLF_COS_ENABLED))
					break;
			} while ((ill = list_next(&illg->ig_if, ill)) != NULL);

			if (ill == NULL) {
				mutex_enter(&ipmp_ill->ill_lock);
				ipmp_ill->ill_flags |= ILLF_COS_ENABLED;
				mutex_exit(&ipmp_ill->ill_lock);
			}
		}
	}
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * Return B_FALSE if a refresh was necessary but could not be performed.
 */
static boolean_t
ipmp_ill_try_refresh_active(ill_t *ill)
{
	boolean_t refreshed = B_TRUE;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	if (ipmp_ill_is_active(ill)) {
		if (!list_link_active(&ill->ill_actnode))
			refreshed = ipmp_ill_activate(ill);
	} else {
		if (list_link_active(&ill->ill_actnode))
			ipmp_ill_deactivate(ill);
	}

	return (refreshed);
}

/*
 * Check if `ill' should be active, and activate or deactivate if need be.
 * If the refresh fails, schedule a timer to try again later.
 */
void
ipmp_ill_refresh_active(ill_t *ill)
{
	if (!ipmp_ill_try_refresh_active(ill))
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'.
 */
static void
ipmp_ill_refresh_active_timer(void *ill_arg)
{
	ill_t *ill = ill_arg;
	boolean_t refreshed = B_FALSE;

	/*
	 * Clear ill_refresh_tid to indicate that no timeout is pending
	 * (another thread could schedule a new timeout while we're still
	 * running, but that's harmless).  If the ill is going away, bail.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_refresh_tid = 0;
	if (ill->ill_state_flags & ILL_CONDEMNED) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	mutex_exit(&ill->ill_lock);

	if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) {
		refreshed = ipmp_ill_try_refresh_active(ill);
		ipsq_exit(ill->ill_phyint->phyint_ipsq);
	}

	/*
	 * If the refresh failed, schedule another attempt.
	 */
	if (!refreshed)
		ipmp_ill_refresh_active_timer_start(ill);
}

/*
 * Schedule a retry of ipmp_ill_try_refresh_active() on the ill named by
 * `ill', unless one is already pending.
 */
static void
ipmp_ill_refresh_active_timer_start(ill_t *ill)
{
	mutex_enter(&ill->ill_lock);

	/*
	 * If the ill is going away or a refresh is already scheduled, bail.
	 */
	if (ill->ill_refresh_tid != 0 ||
	    (ill->ill_state_flags & ILL_CONDEMNED)) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill,
	    SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT));

	mutex_exit(&ill->ill_lock);
}

/*
 * Activate `ill' so it will be used to send and receive data traffic.  Return
 * B_FALSE if `ill' cannot be activated.  Note that we allocate any messages
 * needed to deactivate `ill' here as well so that deactivation cannot fail.
 */
static boolean_t
ipmp_ill_activate(ill_t *ill)
{
	ipif_t *ipif;
	mblk_t *linkupmp = NULL, *linkdownmp = NULL;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ill_t *maxill;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If this will be the first active interface in the group, allocate
	 * the link-up and link-down messages.
	 */
	if (grp->gr_nactif == 0) {
		linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0);
		linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0);
		if (linkupmp == NULL || linkdownmp == NULL)
			goto fail;
	}

	if (list_is_empty(&illg->ig_actif)) {
		/*
		 * Now that we have an active ill, nominate it for multicast
		 * and broadcast duties.  Do this before ipmp_ill_bind_ipif()
		 * since that may need to send multicast packets (e.g., IPv6
		 * neighbor discovery probes).
		 */
		ipmp_illgrp_set_cast(illg, ill);

		/*
		 * This is the first active ill in the illgrp -- add 'em all.
		 * We can access/walk ig_ipmp_ill's ipif list since we're
		 * writer on its IPSQ as well.
		 */
		ipif = illg->ig_ipmp_ill->ill_ipif;
		for (; ipif != NULL; ipif = ipif->ipif_next)
			if (ipmp_ipif_is_up_dataaddr(ipif))
				ipmp_ill_bind_ipif(ill, ipif, Res_act_initial);
	} else {
		/*
		 * Redistribute the addresses by moving them from the ill with
		 * the most addresses until the ill being activated is at the
		 * same level as the rest of the ills.
		 */
		for (;;) {
			maxill = ipmp_illgrp_max_ill(illg);
			ASSERT(maxill != NULL);
			if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt)
				break;
			ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE);
			ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind);
		}
	}

	/*
	 * Put the interface in the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_insert_tail(&illg->ig_actif, ill);
	illg->ig_nactif++;
	illg->ig_next_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * Refresh static/proxy ARP entries to use `ill', if need be.
	 */
	if (!ill->ill_isv6)
		ipmp_illgrp_refresh_arpent(illg);

	/*
	 * Finally, mark the group link up, if necessary.
	 */
	if (grp->gr_nactif++ == 0) {
		ASSERT(grp->gr_linkdownmp == NULL);
		grp->gr_linkdownmp = linkdownmp;
		put(illg->ig_ipmp_ill->ill_rq, linkupmp);
	}
	return (B_TRUE);
fail:
	freemsg(linkupmp);
	freemsg(linkdownmp);
	return (B_FALSE);
}

/*
 * Deactivate `ill' so it will not be used to send or receive data traffic.
 */
static void
ipmp_ill_deactivate(ill_t *ill)
{
	ill_t *minill;
	ipif_t *ipif, *ubnextipif, *ubheadipif = NULL;
	mblk_t *mp;
	ipmp_grp_t *grp = ill->ill_phyint->phyint_grp;
	ipmp_illgrp_t *illg = ill->ill_grp;
	ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg);

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * Pull the interface out of the active list.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	list_remove(&illg->ig_actif, ill);
	illg->ig_nactif--;
	illg->ig_next_ill = list_head(&illg->ig_actif);
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If the ill that's being deactivated had been nominated for
	 * multicast/broadcast, nominate a new one.
	 */
	if (ill == illg->ig_cast_ill)
		ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif));

	/*
	 * Delete all nce_t entries using this ill, so that the next attempt
	 * to send data traffic will revalidate cached nce's.
	 */
	nce_flush(ill, B_TRUE);

	/*
	 * Unbind all of the ipifs bound to this ill, and save 'em in a list;
	 * we'll rebind them after we tell the resolver the ill is no longer
	 * active.  We must do things in this order or the resolver could
	 * accidentally rebind to the ill we're trying to remove if multiple
	 * ills in the group have the same hardware address (which is
	 * unsupported, but shouldn't lead to a wedged machine).
	 */
	while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) {
		ipif->ipif_bound_next = ubheadipif;
		ubheadipif = ipif;
	}

	if (!ill->ill_isv6) {
		/*
		 * Refresh static/proxy ARP entries that had been using `ill'.
		 */
		ipmp_illgrp_refresh_arpent(illg);
	}

	/*
	 * Rebind each ipif from the deactivated ill to the active ill with
	 * the fewest ipifs.  If there are no active ills, the ipifs will
	 * remain unbound.
	 */
	for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) {
		ubnextipif = ipif->ipif_bound_next;
		ipif->ipif_bound_next = NULL;

		if ((minill = ipmp_illgrp_min_ill(illg)) != NULL)
			ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind);
	}

	if (list_is_empty(&illg->ig_actif)) {
		ill_t *ipmp_ill = illg->ig_ipmp_ill;

		ncec_walk(ipmp_ill, (pfi_t)ncec_delete_per_ill,
		    (uchar_t *)ipmp_ill, ipmp_ill->ill_ipst);
	}

	/*
	 * Remove any IRE_IF_CLONE for this ill since they might have
	 * an ire_nce_cache/nce_common which refers to another ill in the
	 * group.
	 */
	ire_walk_ill(MATCH_IRE_TYPE, IRE_IF_CLONE, ill_downi_if_clone,
	    ill, ill);

	/*
	 * Finally, mark the group link down, if necessary.
	 */
	if (--grp->gr_nactif == 0) {
		mp = grp->gr_linkdownmp;
		grp->gr_linkdownmp = NULL;
		ASSERT(mp != NULL);
		put(illg->ig_ipmp_ill->ill_rq, mp);
	}
}

/*
 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD)
 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners.
 */
static void
ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd)
{
	ipif_t *ipif;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE);

	/*
	 * If `ill' is truly down, there are no messages to generate since:
	 *
	 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface
	 *    and its addresses by bringing them down.  But that's already
	 *    true, so there's nothing to hide.
	 *
	 * 2. If cmd == RTM_ADD, then we're supposed to generate messages
	 *    indicating that any previously-hidden up addresses are again
	 *    back up (along with the interface).  But they aren't, so
	 *    there's nothing to expose.
	 */
	if (ill->ill_ipif_up_count == 0)
		return;

	if (cmd == RTM_ADD)
		ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL);

	for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next)
		if (ipif->ipif_flags & IPIF_UP)
			ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL);

	if (cmd == RTM_DELETE)
		ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL);
}

/*
 * Bind the address named by `ipif' to the underlying ill named by `ill'.
 * If `act' is Res_act_none, don't notify the resolver.  Otherwise, `act'
 * will indicate to the resolver whether this is an initial bringup of
 * `ipif', or just a rebind to another ill.
 */
static void
ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act)
{
	int err = 0;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif));
	ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill));
	ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif));
	ASSERT(ipif->ipif_bound_ill == NULL);
	ASSERT(ipif->ipif_bound_next == NULL);

	ipif->ipif_bound_next = ill->ill_bound_ipif;
	ill->ill_bound_ipif = ipif;
	ill->ill_bound_cnt++;
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = ill;
	rw_exit(&ipst->ips_ipmp_lock);

	/*
	 * If necessary, tell ARP/NDP about the new mapping.  Note that
	 * ipif_resolver_up() cannot fail for IPv6 ills.
	 */
	if (act != Res_act_none) {
		if (ill->ill_isv6) {
			VERIFY(ipif_resolver_up(ipif, act) == 0);
			err = ipif_ndp_up(ipif, act == Res_act_initial);
		} else {
			err = ipif_resolver_up(ipif, act);
		}

		/*
		 * Since ipif_ndp_up() never returns EINPROGRESS and
		 * ipif_resolver_up() only returns EINPROGRESS when the
		 * associated ill is not up, we should never be here with
		 * EINPROGRESS.  We rely on this to simplify the design.
		 */
		ASSERT(err != EINPROGRESS);
	}
	/* TODO: retry binding on failure? when? */
	ipif->ipif_bound = (err == 0);
}

/*
 * Unbind the address named by `ipif' from the underlying ill named by `ill'.
 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned.
 * If no ipifs are bound to `ill', NULL is returned.  If `notifyres' is
 * B_TRUE, notify the resolver about the change.
 */
static ipif_t *
ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres)
{
	ipif_t *previpif;
	ip_stack_t *ipst = ill->ill_ipst;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(IS_UNDER_IPMP(ill));

	/*
	 * If necessary, find an ipif to unbind.
	 */
	if (ipif == NULL) {
		if ((ipif = ill->ill_bound_ipif) == NULL) {
			ASSERT(ill->ill_bound_cnt == 0);
			return (NULL);
		}
	}

	ASSERT(IAM_WRITER_IPIF(ipif));
	ASSERT(IS_IPMP(ipif->ipif_ill));
	ASSERT(ipif->ipif_bound_ill == ill);
	ASSERT(ill->ill_bound_cnt > 0);

	/*
	 * Unbind it.
	 */
	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);
	ipif->ipif_bound_ill = NULL;
	rw_exit(&ipst->ips_ipmp_lock);
	ill->ill_bound_cnt--;

	if (ill->ill_bound_ipif == ipif) {
		ill->ill_bound_ipif = ipif->ipif_bound_next;
	} else {
		previpif = ill->ill_bound_ipif;
		while (previpif->ipif_bound_next != ipif)
			previpif = previpif->ipif_bound_next;

		previpif->ipif_bound_next = ipif->ipif_bound_next;
	}
	ipif->ipif_bound_next = NULL;

	/*
	 * If requested, notify the resolvers (provided we're bound).
	 */
	if (notifyres && ipif->ipif_bound) {
		if (ill->ill_isv6)
			ipif_ndp_down(ipif);
		else
			(void) ipif_arp_down(ipif);
	}
	ipif->ipif_bound = B_FALSE;

	return (ipif);
}

/*
 * Check if `ill' is active.  Caller must hold ill_lock and phyint_lock if
 * it's not inside the IPSQ.  Since ipmp_ill_try_refresh_active() calls this
 * to determine whether an ill should be considered active, other consumers
 * may race and learn about an ill that should be deactivated/activated before
 * IPMP has performed the activation/deactivation.  This should be safe though
 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that
 * would've been cleaned up by ipmp_ill_deactivate().
 */
boolean_t
ipmp_ill_is_active(ill_t *ill)
{
	phyint_t *phyi = ill->ill_phyint;

	ASSERT(IS_UNDER_IPMP(ill));
	ASSERT(IAM_WRITER_ILL(ill) ||
	    (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock)));

	/*
	 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to
	 * set PHYI_FAILED whenever PHYI_RUNNING is cleared.  This allows the
	 * link flapping logic to be just in in.mpathd and allows us to ignore
	 * changes to PHYI_RUNNING.
	 */
	return (!(ill->ill_ipif_up_count == 0 ||
	    (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED))));
}

/*
 * IRE walker callback: set ire_testhidden on IRE_HIDDEN_TYPE IREs associated
 * with `ill_arg'.
 */
static void
ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ill != ill)
		return;

	if (IRE_HIDDEN_TYPE(ire->ire_type)) {
		DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire);
		ire->ire_testhidden = B_TRUE;
	}
}

/*
 * IRE walker callback: clear ire_testhidden if the IRE has a source address
 * on `ill_arg'.
 */
static void
ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg)
{
	ill_t *ill = (ill_t *)ill_arg;

	ASSERT(IAM_WRITER_ILL(ill));
	ASSERT(!IS_IPMP(ill));

	if (ire->ire_ill == ill) {
		DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire);
		ire->ire_testhidden = B_FALSE;
	}
}

/*
 * Return a held pointer to the IPMP ill for underlying interface `ill', or
 * NULL if one doesn't exist.  (Unfortunately, this function needs to take an
 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's
 * ill_grp pointer may become stale when not inside an IPSQ and not holding
 * ipmp_lock.)  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_hold_ipmp_ill(ill_t *ill)
{
	ip_stack_t *ipst = ill->ill_ipst;
	ipmp_illgrp_t *illg;

	ASSERT(!IS_IPMP(ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	illg = ill->ill_grp;
	if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill)) {
		rw_exit(&ipst->ips_ipmp_lock);
		return (illg->ig_ipmp_ill);
	}
	/*
	 * Assume `ill' was removed from the illgrp in the meantime.
	 */
	rw_exit(&ipst->ips_ipmp_lock);
	return (NULL);
}

/*
 * Return the interface index for the IPMP ill tied to underlying interface
 * `ill', or zero if one doesn't exist.  Caller need not be inside the IPSQ.
 */
uint_t
ipmp_ill_get_ipmp_ifindex(const ill_t *ill)
{
	uint_t ifindex = 0;
	ip_stack_t *ipst = ill->ill_ipst;
	ipmp_grp_t *grp;

	ASSERT(!IS_IPMP(ill));

	rw_enter(&ipst->ips_ipmp_lock, RW_READER);
	if ((grp = ill->ill_phyint->phyint_grp) != NULL)
		ifindex = grp->gr_phyint->phyint_ifindex;
	rw_exit(&ipst->ips_ipmp_lock);
	return (ifindex);
}

/*
 * Place phyint `phyi' into IPMP group `grp'.
 */
void
ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp)
{
	ill_t *ill;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);

	ASSERT(IAM_WRITER_IPSQ(ipsq));
	ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs vanished.
	 */
	if (phyi->phyint_illv4 != NULL) {
		ill = phyi->phyint_illv4;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	if (phyi->phyint_illv6 != NULL) {
		ill = phyi->phyint_illv6;
		ipmp_ill_rtsaddrmsg(ill, RTM_DELETE);
	}

	/*
	 * Snapshot the phyint's initial kstats as a baseline.
	 */
	ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0);

	rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

	phyi->phyint_grp = grp;
	if (++grp->gr_nif == 1)
		grp->gr_mactype = ill->ill_mactype;
	else
		ASSERT(grp->gr_mactype == ill->ill_mactype);

	/*
	 * Now that we're in the group, request a switch to the group's xop
	 * when we ipsq_exit().  All future operations will be exclusive on
	 * the group xop until ipmp_phyint_leave_grp() is called.
	 */
	ASSERT(ipsq->ipsq_swxop == NULL);
	ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop);
	ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop;

	rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Remove phyint `phyi' from its current IPMP group.
 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
	uint_t i;
	ipsq_t *ipsq = phyi->phyint_ipsq;
	ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
	uint64_t phyi_kstats[IPMP_KSTAT_MAX];

	ASSERT(IAM_WRITER_IPSQ(ipsq));

	/*
	 * If any of the phyint's ills are still in an illgrp, kick 'em out.
	 */
	if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
		ipmp_ill_leave_illgrp(phyi->phyint_illv4);
	if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
		ipmp_ill_leave_illgrp(phyi->phyint_illv6);

	/*
	 * Send routing socket messages indicating that the phyint's ills
	 * and ipifs have reappeared.
/*
 * Remove phyint `phyi' from its current IPMP group.
 */
void
ipmp_phyint_leave_grp(phyint_t *phyi)
{
        uint_t i;
        ipsq_t *ipsq = phyi->phyint_ipsq;
        ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
        uint64_t phyi_kstats[IPMP_KSTAT_MAX];

        ASSERT(IAM_WRITER_IPSQ(ipsq));

        /*
         * If any of the phyint's ills are still in an illgrp, kick 'em out.
         */
        if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4))
                ipmp_ill_leave_illgrp(phyi->phyint_illv4);
        if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6))
                ipmp_ill_leave_illgrp(phyi->phyint_illv6);

        /*
         * Send routing socket messages indicating that the phyint's ills
         * and ipifs have reappeared.
         */
        if (phyi->phyint_illv4 != NULL)
                ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD);
        if (phyi->phyint_illv6 != NULL)
                ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD);

        /*
         * Calculate the phyint's cumulative kstats while it was in the group,
         * and add that to the group's baseline.
         */
        ipmp_phyint_get_kstats(phyi, phyi_kstats);
        for (i = 0; i < IPMP_KSTAT_MAX; i++) {
                phyi_kstats[i] -= phyi->phyint_kstats0[i];
                atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]);
        }

        rw_enter(&ipst->ips_ipmp_lock, RW_WRITER);

        phyi->phyint_grp->gr_nif--;
        phyi->phyint_grp = NULL;

        /*
         * As our final act in leaving the group, request a switch back to our
         * IPSQ's own xop when we ipsq_exit().
         */
        ASSERT(ipsq->ipsq_swxop == NULL);
        ipsq->ipsq_swxop = &ipsq->ipsq_ownxop;

        rw_exit(&ipst->ips_ipmp_lock);
}

/*
 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'.
 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements.
 */
static void
ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[])
{
        uint_t i, j;
        const char *name;
        kstat_t *ksp;
        kstat_named_t *kn;
        ip_stack_t *ipst = PHYINT_TO_IPST(phyi);
        zoneid_t zoneid;

        bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX);
        zoneid = netstackid_to_zoneid(ipst->ips_netstack->netstack_stackid);
        ksp = kstat_hold_byname("link", 0, phyi->phyint_name, zoneid);
        if (ksp == NULL)
                return;

        KSTAT_ENTER(ksp);

        if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) {
                /*
                 * Bring kstats up-to-date before recording.
                 */
                (void) KSTAT_UPDATE(ksp, KSTAT_READ);

                kn = KSTAT_NAMED_PTR(ksp);
                for (i = 0; i < IPMP_KSTAT_MAX; i++) {
                        name = ipmp_kstats[i].name;
                        kstats[i] = 0;
                        for (j = 0; j < ksp->ks_ndata; j++) {
                                if (strcmp(kn[j].name, name) != 0)
                                        continue;

                                switch (kn[j].data_type) {
                                case KSTAT_DATA_INT32:
                                case KSTAT_DATA_UINT32:
                                        kstats[i] = kn[j].value.ui32;
                                        break;
#ifdef  _LP64
                                case KSTAT_DATA_LONG:
                                case KSTAT_DATA_ULONG:
                                        kstats[i] = kn[j].value.ul;
                                        break;
#endif
                                case KSTAT_DATA_INT64:
                                case KSTAT_DATA_UINT64:
                                        kstats[i] = kn[j].value.ui64;
                                        break;
                                }
                                break;
                        }
                }
        }

        KSTAT_EXIT(ksp);
        kstat_rele(ksp);
}

/*
 * Refresh the active state of all ills on `phyi'.
 */
void
ipmp_phyint_refresh_active(phyint_t *phyi)
{
        if (phyi->phyint_illv4 != NULL)
                ipmp_ill_refresh_active(phyi->phyint_illv4);
        if (phyi->phyint_illv6 != NULL)
                ipmp_ill_refresh_active(phyi->phyint_illv6);
}

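/*
 * Note that ipmp_phyint_refresh_active() is intended to run after anything
 * that may change the state computed by ipmp_ill_is_active() above (e.g.
 * updates to PHYI_FAILED, PHYI_INACTIVE or PHYI_OFFLINE), so that both the
 * v4 and v6 ills can be activated or deactivated accordingly.
 */
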
/*
 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ipif_hold_bound_ill(const ipif_t *ipif)
{
        ill_t *boundill;
        ip_stack_t *ipst = ipif->ipif_ill->ill_ipst;

        ASSERT(IS_IPMP(ipif->ipif_ill));

        rw_enter(&ipst->ips_ipmp_lock, RW_READER);
        boundill = ipif->ipif_bound_ill;
        if (boundill != NULL && ill_check_and_refhold(boundill)) {
                rw_exit(&ipst->ips_ipmp_lock);
                return (boundill);
        }
        rw_exit(&ipst->ips_ipmp_lock);
        return (NULL);
}

/*
 * Return a pointer to the underlying ill bound to `ipif', or NULL if one
 * doesn't exist.  Caller must be inside the IPSQ.
 */
ill_t *
ipmp_ipif_bound_ill(const ipif_t *ipif)
{
        ASSERT(IAM_WRITER_ILL(ipif->ipif_ill));
        ASSERT(IS_IPMP(ipif->ipif_ill));

        return (ipif->ipif_bound_ill);
}

/*
 * Check if `ipif' is a "stub" (placeholder address not being used).
 */
boolean_t
ipmp_ipif_is_stubaddr(const ipif_t *ipif)
{
        if (ipif->ipif_flags & IPIF_UP)
                return (B_FALSE);
        if (ipif->ipif_ill->ill_isv6)
                return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
        else
                return (ipif->ipif_lcl_addr == INADDR_ANY);
}

/*
 * Check if `ipif' is an IPMP data address.
 */
boolean_t
ipmp_ipif_is_dataaddr(const ipif_t *ipif)
{
        if (ipif->ipif_flags & IPIF_NOFAILOVER)
                return (B_FALSE);
        if (ipif->ipif_ill->ill_isv6)
                return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr));
        else
                return (ipif->ipif_lcl_addr != INADDR_ANY);
}

/*
 * Check if `ipif' is an IPIF_UP IPMP data address.
 */
static boolean_t
ipmp_ipif_is_up_dataaddr(const ipif_t *ipif)
{
        return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP));
}

/*
 * Check if `mp' contains a probe packet by checking whether its IP source
 * address is a test address on underlying interface `ill'.  Caller need not
 * be inside the IPSQ.
 */
boolean_t
ipmp_packet_is_probe(mblk_t *mp, ill_t *ill)
{
        ip6_t *ip6h = (ip6_t *)mp->b_rptr;
        ipha_t *ipha = (ipha_t *)mp->b_rptr;

        ASSERT(DB_TYPE(mp) != M_CTL);

        if (!IS_UNDER_IPMP(ill))
                return (B_FALSE);

        if (ill->ill_isv6) {
                if (!IN6_IS_ADDR_UNSPECIFIED(&ip6h->ip6_src) &&
                    ipif_lookup_testaddr_v6(ill, &ip6h->ip6_src, NULL))
                        return (B_TRUE);
        } else {
                if ((ipha->ipha_src != INADDR_ANY) &&
                    ipif_lookup_testaddr_v4(ill, &ipha->ipha_src, NULL))
                        return (B_TRUE);
        }
        return (B_FALSE);
}

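/*
 * Taxonomy example for the predicates above: an ipif that is down with the
 * unspecified/INADDR_ANY address is a stub (a placeholder that is not in
 * use); an ipif without IPIF_NOFAILOVER and with a real address is a data
 * address; an IPIF_NOFAILOVER address is a test address, which is what
 * ipmp_packet_is_probe() looks for in the IP source address of `mp'.
 */
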
/*
 * Pick out an appropriate underlying interface for packet transmit.  This
 * function may be called from the data path, so we need to verify that the
 * IPMP group associated with `ill' is non-null after holding the ill_g_lock.
 * Caller need not be inside the IPSQ.
 */
ill_t *
ipmp_ill_get_xmit_ill(ill_t *ill, boolean_t is_unicast)
{
        ill_t *xmit_ill;
        ip_stack_t *ipst = ill->ill_ipst;

        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        if (ill->ill_grp == NULL) {
                /*
                 * The interface was taken out of the group.  Return ill
                 * itself, but take a ref so that callers will always be able
                 * to do ill_refrele(ill).
                 */
                rw_exit(&ipst->ips_ill_g_lock);
                ill_refhold(ill);
                return (ill);
        }
        if (!is_unicast)
                xmit_ill = ipmp_illgrp_hold_cast_ill(ill->ill_grp);
        else
                xmit_ill = ipmp_illgrp_hold_next_ill(ill->ill_grp);
        rw_exit(&ipst->ips_ill_g_lock);
        return (xmit_ill);
}

/*
 * Flush out any nce that points at `ncec' from an underlying interface.
 */
void
ipmp_ncec_flush_nce(ncec_t *ncec)
{
        ill_t *ncec_ill = ncec->ncec_ill;
        ill_t *ill;
        ipmp_illgrp_t *illg;
        ip_stack_t *ipst = ncec_ill->ill_ipst;
        list_t dead;
        nce_t *nce;

        if (!IS_IPMP(ncec_ill))
                return;

        illg = ncec_ill->ill_grp;
        list_create(&dead, sizeof (nce_t), offsetof(nce_t, nce_node));

        rw_enter(&ipst->ips_ill_g_lock, RW_READER);
        ill = list_head(&illg->ig_if);
        for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) {
                nce_fastpath_list_delete(ill, ncec, &dead);
        }
        rw_exit(&ipst->ips_ill_g_lock);

        /*
         * We may now nce_refrele() all dead entries since all locks have been
         * dropped.
         */
        while ((nce = list_head(&dead)) != NULL) {
                list_remove(&dead, nce);
                nce_refrele(nce);
        }
        ASSERT(list_is_empty(&dead));
        list_destroy(&dead);
}

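/*
 * Note: ipmp_ncec_flush_nce() above only tears down per-ill nce_t entries
 * that reference `ncec'; the ncec_t itself is left alone.  The routine below
 * instead walks the group's active (ig_actif) interfaces and rebuilds any
 * matching nce_t entries so that their fastpath state agrees with `ncec'.
 */
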
/*
 * For each interface in the IPMP group, if there are nce_t entries for the IP
 * address corresponding to `ncec', then their dl_unitdata_req_t and fastpath
 * information must be updated to match the link-layer address information in
 * `ncec'.
 */
void
ipmp_ncec_fastpath(ncec_t *ncec, ill_t *ipmp_ill)
{
        ill_t *ill;
        ipmp_illgrp_t *illg = ipmp_ill->ill_grp;
        ip_stack_t *ipst = ipmp_ill->ill_ipst;
        nce_t *nce, *nce_next;
        list_t replace;

        ASSERT(IS_IPMP(ipmp_ill));

        /*
         * If `ncec' itself is not reachable, there is no use in creating
         * nce_t entries on the underlying interfaces in the group.
         */
        if (!NCE_ISREACHABLE(ncec))
                return;

        list_create(&replace, sizeof (nce_t), offsetof(nce_t, nce_node));
        rw_enter(&ipst->ips_ipmp_lock, RW_READER);
        ill = list_head(&illg->ig_actif);
        for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) {
                /*
                 * For each underlying interface, we first check if there is
                 * an nce_t for the address in ncec->ncec_addr.  If one
                 * exists, we should trigger nce_fastpath for that nce_t.
                 * However, the catch is that we are holding ips_ipmp_lock to
                 * prevent changes to the IPMP group membership, so we cannot
                 * putnext() to the driver here.  Instead, we nce_delete() the
                 * nce_t entries that need to be updated and collect them on
                 * the `replace' list, and then process `replace' after
                 * dropping ips_ipmp_lock.
                 */
                mutex_enter(&ill->ill_lock);
                for (nce = list_head(&ill->ill_nce); nce != NULL; ) {
                        nce_next = list_next(&ill->ill_nce, nce);
                        if (!IN6_ARE_ADDR_EQUAL(&nce->nce_addr,
                            &ncec->ncec_addr)) {
                                nce = nce_next;
                                continue;
                        }
                        nce_refhold(nce);
                        nce_delete(nce);
                        list_insert_tail(&replace, nce);
                        nce = nce_next;
                }
                mutex_exit(&ill->ill_lock);
        }
        rw_exit(&ipst->ips_ipmp_lock);
        /*
         * `replace' now has the list of nce's on which we should be
         * triggering nce_fastpath().  We retrigger fastpath by setting up
         * each nce again; the code in nce_lookup_then_add_v* ensures that
         * nce->nce_ill is still in the group for ncec->ncec_ill.
         */
        while ((nce = list_head(&replace)) != NULL) {
                list_remove(&replace, nce);
                if (ncec->ncec_ill->ill_isv6) {
                        (void) nce_lookup_then_add_v6(nce->nce_ill,
                            ncec->ncec_lladdr, ncec->ncec_lladdr_length,
                            &nce->nce_addr, ncec->ncec_flags, ND_UNCHANGED,
                            NULL);
                } else {
                        ipaddr_t ipaddr;

                        IN6_V4MAPPED_TO_IPADDR(&ncec->ncec_addr, ipaddr);
                        (void) nce_lookup_then_add_v4(nce->nce_ill,
                            ncec->ncec_lladdr, ncec->ncec_lladdr_length,
                            &ipaddr, ncec->ncec_flags, ND_UNCHANGED, NULL);
                }
                nce_refrele(nce);
        }
        ASSERT(list_is_empty(&replace));
        list_destroy(&replace);
}

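/*
 * Transmit-path usage sketch for ipmp_ill_get_xmit_ill() (illustrative only;
 * the surrounding caller logic is hypothetical):
 *
 *      ill_t *out_ill;
 *
 *      if ((out_ill = ipmp_ill_get_xmit_ill(ill, B_TRUE)) != NULL) {
 *              ... transmit via out_ill ...
 *              ill_refrele(out_ill);
 *      }
 *
 * Multicast and broadcast senders would pass B_FALSE instead so that the
 * group's nominated cast ill is used.
 */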