1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #include <inet/arp.h> 26 #include <inet/ip.h> 27 #include <inet/ip6.h> 28 #include <inet/ip_if.h> 29 #include <inet/ip_ire.h> 30 #include <inet/ip_multi.h> 31 #include <inet/ip_rts.h> 32 #include <inet/mi.h> 33 #include <net/if_types.h> 34 #include <sys/dlpi.h> 35 #include <sys/kmem.h> 36 #include <sys/modhash.h> 37 #include <sys/sdt.h> 38 #include <sys/strsun.h> 39 #include <sys/sunddi.h> 40 #include <sys/types.h> 41 42 /* 43 * Convenience macros for getting the ip_stack_t associated with an 44 * ipmp_illgrp_t or ipmp_grp_t. 45 */ 46 #define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) 47 #define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) 48 49 /* 50 * Assorted constants that aren't important enough to be tunable. 51 */ 52 #define IPMP_GRP_HASH_SIZE 64 53 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ 54 55 /* 56 * Templates for IPMP ARP messages. 57 */ 58 static const arie_t ipmp_aract_template = { 59 AR_IPMP_ACTIVATE, 60 sizeof (arie_t), /* Name offset */ 61 sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ 62 }; 63 64 static const arie_t ipmp_ardeact_template = { 65 AR_IPMP_DEACTIVATE, 66 sizeof (arie_t), /* Name offset */ 67 sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ 68 }; 69 70 /* 71 * IPMP meta-interface kstats (based on those in PSARC/1997/198). 72 */ 73 static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { 74 { "obytes", KSTAT_DATA_UINT32 }, 75 { "obytes64", KSTAT_DATA_UINT64 }, 76 { "rbytes", KSTAT_DATA_UINT32 }, 77 { "rbytes64", KSTAT_DATA_UINT64 }, 78 { "opackets", KSTAT_DATA_UINT32 }, 79 { "opackets64", KSTAT_DATA_UINT64 }, 80 { "oerrors", KSTAT_DATA_UINT32 }, 81 { "ipackets", KSTAT_DATA_UINT32 }, 82 { "ipackets64", KSTAT_DATA_UINT64 }, 83 { "ierrors", KSTAT_DATA_UINT32 }, 84 { "multircv", KSTAT_DATA_UINT32 }, 85 { "multixmt", KSTAT_DATA_UINT32 }, 86 { "brdcstrcv", KSTAT_DATA_UINT32 }, 87 { "brdcstxmt", KSTAT_DATA_UINT32 }, 88 { "link_up", KSTAT_DATA_UINT32 } 89 }; 90 91 static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); 92 static int ipmp_grp_create_kstats(ipmp_grp_t *); 93 static int ipmp_grp_update_kstats(kstat_t *, int); 94 static void ipmp_grp_destroy_kstats(ipmp_grp_t *); 95 static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); 96 static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); 97 static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); 98 static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); 99 static boolean_t ipmp_ill_activate(ill_t *); 100 static void ipmp_ill_deactivate(ill_t *); 101 static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); 102 static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); 103 static void ipmp_ill_refresh_active_timer_start(ill_t *); 104 static void ipmp_ill_rtsaddrmsg(ill_t *, int); 105 static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); 106 static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); 107 static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); 108 static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); 109 110 /* 111 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). 112 */ 113 void 114 ipmp_init(ip_stack_t *ipst) 115 { 116 ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", 117 IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 118 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); 119 rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); 120 } 121 122 /* 123 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). 124 */ 125 void 126 ipmp_destroy(ip_stack_t *ipst) 127 { 128 mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); 129 rw_destroy(&ipst->ips_ipmp_lock); 130 } 131 132 /* 133 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', 134 * and add it to the hash. On success, return a pointer to the created group. 135 * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP 136 * meta-interface associated with the group also has the same name (but they 137 * may differ later via ipmp_grp_rename()). 138 */ 139 ipmp_grp_t * 140 ipmp_grp_create(const char *grname, phyint_t *phyi) 141 { 142 ipmp_grp_t *grp; 143 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 144 mod_hash_hndl_t mh; 145 146 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 147 148 if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) 149 return (NULL); 150 151 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); 152 (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); 153 154 /* 155 * Cache the group's phyint. This is safe since a phyint_t will 156 * outlive its ipmp_grp_t. 157 */ 158 grp->gr_phyint = phyi; 159 160 /* 161 * Create IPMP group kstats. 162 */ 163 if (ipmp_grp_create_kstats(grp) != 0) { 164 kmem_free(grp, sizeof (ipmp_grp_t)); 165 return (NULL); 166 } 167 168 /* 169 * Insert the group into the hash. 170 */ 171 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { 172 ipmp_grp_destroy_kstats(grp); 173 kmem_free(grp, sizeof (ipmp_grp_t)); 174 return (NULL); 175 } 176 ipmp_grp_insert(grp, mh); 177 178 return (grp); 179 } 180 181 /* 182 * Create IPMP kstat structures for `grp'. Return an errno upon failure. 183 */ 184 static int 185 ipmp_grp_create_kstats(ipmp_grp_t *grp) 186 { 187 kstat_t *ksp; 188 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; 189 190 ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", 191 KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); 192 if (ksp == NULL) 193 return (ENOMEM); 194 195 ksp->ks_update = ipmp_grp_update_kstats; 196 ksp->ks_private = grp; 197 bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); 198 199 kstat_install(ksp); 200 grp->gr_ksp = ksp; 201 return (0); 202 } 203 204 /* 205 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. 206 */ 207 static int 208 ipmp_grp_update_kstats(kstat_t *ksp, int rw) 209 { 210 uint_t i; 211 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 212 ipmp_grp_t *grp = ksp->ks_private; 213 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 214 ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; 215 phyint_t *phyi; 216 uint64_t phyi_kstats[IPMP_KSTAT_MAX]; 217 218 if (rw == KSTAT_WRITE) 219 return (EACCES); 220 221 /* 222 * Start with the group's baseline values. 223 */ 224 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 225 if (kn[i].data_type == KSTAT_DATA_UINT32) { 226 kn[i].value.ui32 = grp->gr_kstats0[i]; 227 } else { 228 ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); 229 kn[i].value.ui64 = grp->gr_kstats0[i]; 230 } 231 } 232 233 /* 234 * Add in the stats of each phyint currently in the group. Since we 235 * don't directly track the phyints in a group, we cheat by walking 236 * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while 237 * ill_g_lock is held.) 238 */ 239 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 240 ipsq = grp_ipsq->ipsq_next; 241 for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { 242 phyi = ipsq->ipsq_phyint; 243 244 /* 245 * If a phyint in a group is being unplumbed, it's possible 246 * that ill_glist_delete() -> phyint_free() already freed the 247 * phyint (and set ipsq_phyint to NULL), but the unplumb 248 * operation has yet to complete (and thus ipsq_dq() has yet 249 * to remove the phyint's IPSQ from the group IPSQ's phyint 250 * list). We skip those phyints here (note that their kstats 251 * have already been added to gr_kstats0[]). 252 */ 253 if (phyi == NULL) 254 continue; 255 256 ipmp_phyint_get_kstats(phyi, phyi_kstats); 257 258 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 259 phyi_kstats[i] -= phyi->phyint_kstats0[i]; 260 if (kn[i].data_type == KSTAT_DATA_UINT32) 261 kn[i].value.ui32 += phyi_kstats[i]; 262 else 263 kn[i].value.ui64 += phyi_kstats[i]; 264 } 265 } 266 267 kn[IPMP_KSTAT_LINK_UP].value.ui32 = 268 (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; 269 270 rw_exit(&ipst->ips_ill_g_lock); 271 return (0); 272 } 273 274 /* 275 * Destroy IPMP kstat structures for `grp'. 276 */ 277 static void 278 ipmp_grp_destroy_kstats(ipmp_grp_t *grp) 279 { 280 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; 281 282 kstat_delete_netstack(grp->gr_ksp, id); 283 bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); 284 grp->gr_ksp = NULL; 285 } 286 287 /* 288 * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it 289 * does not exist. 290 */ 291 ipmp_grp_t * 292 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) 293 { 294 ipmp_grp_t *grp; 295 296 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 297 298 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, 299 (mod_hash_val_t *)&grp) == 0) 300 return (grp); 301 302 return (NULL); 303 } 304 305 /* 306 * Place information about group `grp' into `lifgr'. 307 */ 308 void 309 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) 310 { 311 ill_t *ill; 312 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 313 314 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 315 316 lifgr->gi_v4 = (grp->gr_v4 != NULL); 317 lifgr->gi_v6 = (grp->gr_v6 != NULL); 318 lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; 319 lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; 320 lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; 321 (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); 322 lifgr->gi_m4ifname[0] = '\0'; 323 lifgr->gi_m6ifname[0] = '\0'; 324 lifgr->gi_bcifname[0] = '\0'; 325 326 if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { 327 (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); 328 (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); 329 } 330 331 if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) 332 (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); 333 } 334 335 /* 336 * Insert `grp' into the hash using the reserved hash entry `mh'. 337 * Caller must ensure `grp' is not yet in the hash. 338 */ 339 static void 340 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) 341 { 342 int err; 343 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 344 345 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 346 347 /* 348 * Since grp->gr_name will exist at least as long as `grp' is in the 349 * hash, we use it directly as the key. 350 */ 351 err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, 352 (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); 353 if (err != 0) { 354 /* 355 * This should never happen since `mh' was preallocated. 356 */ 357 panic("cannot insert IPMP group \"%s\" (err %d)", 358 grp->gr_name, err); 359 } 360 } 361 362 /* 363 * Remove `grp' from the hash. Caller must ensure `grp' is in it. 364 */ 365 static void 366 ipmp_grp_remove(ipmp_grp_t *grp) 367 { 368 int err; 369 mod_hash_val_t val; 370 mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; 371 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 372 373 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 374 375 err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); 376 if (err != 0 || val != grp) { 377 panic("cannot remove IPMP group \"%s\" (err %d)", 378 grp->gr_name, err); 379 } 380 } 381 382 /* 383 * Attempt to rename `grp' to new name `grname'. Return an errno if the new 384 * group name already exists or is invalid, or if there isn't enough memory. 385 */ 386 int 387 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) 388 { 389 mod_hash_hndl_t mh; 390 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 391 392 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 393 394 if (grname[0] == '\0') 395 return (EINVAL); 396 397 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, 398 (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) 399 return (EEXIST); 400 401 /* 402 * Before we remove the group from the hash, ensure we'll be able to 403 * re-insert it by reserving space. 404 */ 405 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) 406 return (ENOMEM); 407 408 ipmp_grp_remove(grp); 409 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); 410 ipmp_grp_insert(grp, mh); 411 412 return (0); 413 } 414 415 /* 416 * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in 417 * the hash, and that there are no interfaces on it. 418 */ 419 void 420 ipmp_grp_destroy(ipmp_grp_t *grp) 421 { 422 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 423 424 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 425 426 /* 427 * If there are still interfaces using this group, panic before things 428 * go really off the rails. 429 */ 430 if (grp->gr_nif != 0) 431 panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); 432 433 ipmp_grp_remove(grp); 434 ipmp_grp_destroy_kstats(grp); 435 436 ASSERT(grp->gr_v4 == NULL); 437 ASSERT(grp->gr_v6 == NULL); 438 ASSERT(grp->gr_nv4 == 0); 439 ASSERT(grp->gr_nv6 == 0); 440 ASSERT(grp->gr_nactif == 0); 441 ASSERT(grp->gr_linkdownmp == NULL); 442 grp->gr_phyint = NULL; 443 444 kmem_free(grp, sizeof (ipmp_grp_t)); 445 } 446 447 /* 448 * Check whether `ill' is suitable for inclusion into `grp', and return an 449 * errno describing the problem (if any). NOTE: many of these errno values 450 * are interpreted by ifconfig, which will take corrective action and retry 451 * the SIOCSLIFGROUPNAME, so please exercise care when changing them. 452 */ 453 static int 454 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) 455 { 456 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 457 458 ASSERT(IAM_WRITER_ILL(ill)); 459 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 460 461 /* 462 * To sidestep complicated address migration logic in the kernel and 463 * to force the kernel's all-hosts multicast memberships to be blown 464 * away, all addresses that had been brought up must be brought back 465 * down prior to adding an interface to a group. (This includes 466 * addresses currently down due to DAD.) Once the interface has been 467 * added to the group, its addresses can then be brought back up, at 468 * which point they will be moved to the IPMP meta-interface. 469 * NOTE: we do this before ill_appaddr_cnt() since bringing down the 470 * link-local causes in.ndpd to remove its ADDRCONF'd addresses. 471 */ 472 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 473 return (EADDRINUSE); 474 475 /* 476 * To avoid confusing applications by changing addresses that are 477 * under their control, all such control must be removed prior to 478 * adding an interface into a group. 479 */ 480 if (ill_appaddr_cnt(ill) != 0) 481 return (EADDRNOTAVAIL); 482 483 /* 484 * Since PTP addresses do not share the same broadcast domain, they 485 * are not allowed to be in an IPMP group. 486 */ 487 if (ill_ptpaddr_cnt(ill) != 0) 488 return (EINVAL); 489 490 /* 491 * An ill must support multicast to be allowed into a group. 492 */ 493 if (!(ill->ill_flags & ILLF_MULTICAST)) 494 return (ENOTSUP); 495 496 /* 497 * An ill must strictly be using ARP and/or ND for address 498 * resolution for it to be allowed into a group. 499 */ 500 if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) 501 return (ENOTSUP); 502 503 /* 504 * An ill cannot also be using usesrc groups. (Although usesrc uses 505 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does 506 * all its modifications as writer.) 507 */ 508 if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) 509 return (ENOTSUP); 510 511 /* 512 * All ills in a group must be the same mactype. 513 */ 514 if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) 515 return (EINVAL); 516 517 return (0); 518 } 519 520 /* 521 * Check whether `phyi' is suitable for inclusion into `grp', and return an 522 * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() 523 * regarding errno values. 524 */ 525 int 526 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) 527 { 528 int err = 0; 529 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 530 531 ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); 532 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 533 534 /* 535 * An interface cannot have address families plumbed that are not 536 * configured in the group. 537 */ 538 if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || 539 phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) 540 return (EAFNOSUPPORT); 541 542 if (phyi->phyint_illv4 != NULL) 543 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); 544 if (err == 0 && phyi->phyint_illv6 != NULL) 545 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); 546 547 return (err); 548 } 549 550 /* 551 * Create a new illgrp on IPMP meta-interface `ill'. 552 */ 553 ipmp_illgrp_t * 554 ipmp_illgrp_create(ill_t *ill) 555 { 556 uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 557 ipmp_illgrp_t *illg; 558 559 ASSERT(IAM_WRITER_ILL(ill)); 560 ASSERT(IS_IPMP(ill)); 561 ASSERT(ill->ill_grp == NULL); 562 563 if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) 564 return (NULL); 565 566 list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); 567 list_create(&illg->ig_actif, sizeof (ill_t), 568 offsetof(ill_t, ill_actnode)); 569 list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), 570 offsetof(ipmp_arpent_t, ia_node)); 571 572 illg->ig_ipmp_ill = ill; 573 ill->ill_grp = illg; 574 ipmp_illgrp_set_mtu(illg, mtu); 575 576 return (illg); 577 } 578 579 /* 580 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. 581 */ 582 void 583 ipmp_illgrp_destroy(ipmp_illgrp_t *illg) 584 { 585 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 586 ASSERT(IS_IPMP(illg->ig_ipmp_ill)); 587 588 /* 589 * Verify `illg' is empty. 590 */ 591 ASSERT(illg->ig_next_ill == NULL); 592 ASSERT(illg->ig_cast_ill == NULL); 593 ASSERT(list_is_empty(&illg->ig_arpent)); 594 ASSERT(list_is_empty(&illg->ig_if)); 595 ASSERT(list_is_empty(&illg->ig_actif)); 596 ASSERT(illg->ig_nactif == 0); 597 598 /* 599 * Destroy `illg'. 600 */ 601 illg->ig_ipmp_ill->ill_grp = NULL; 602 illg->ig_ipmp_ill = NULL; 603 list_destroy(&illg->ig_if); 604 list_destroy(&illg->ig_actif); 605 list_destroy(&illg->ig_arpent); 606 kmem_free(illg, sizeof (ipmp_illgrp_t)); 607 } 608 609 /* 610 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to 611 * bind it to an underlying ill, while keeping an even address distribution. 612 * If the bind is successful, return a pointer to the bound ill. 613 */ 614 ill_t * 615 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) 616 { 617 ill_t *minill; 618 ipmp_arpent_t *entp; 619 620 ASSERT(IAM_WRITER_IPIF(ipif)); 621 ASSERT(ipmp_ipif_is_dataaddr(ipif)); 622 623 /* 624 * IPMP data address mappings are internally managed by IP itself, so 625 * delete any existing ARP entries associated with the address. 626 */ 627 if (!ipif->ipif_isv6) { 628 entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); 629 if (entp != NULL) 630 ipmp_illgrp_destroy_arpent(illg, entp); 631 } 632 633 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) 634 ipmp_ill_bind_ipif(minill, ipif, Res_act_none); 635 636 return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); 637 } 638 639 /* 640 * Delete `ipif' from the pool of usable data addresses on `illg'. If it's 641 * bound, unbind it from the underlying ill while keeping an even address 642 * distribution. 643 */ 644 void 645 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) 646 { 647 ill_t *maxill, *boundill = ipif->ipif_bound_ill; 648 649 ASSERT(IAM_WRITER_IPIF(ipif)); 650 651 if (boundill != NULL) { 652 (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); 653 654 maxill = ipmp_illgrp_max_ill(illg); 655 if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { 656 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); 657 ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); 658 } 659 } 660 } 661 662 /* 663 * Return the active ill with the greatest number of data addresses in `illg'. 664 */ 665 static ill_t * 666 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) 667 { 668 ill_t *ill, *bestill = NULL; 669 670 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 671 672 ill = list_head(&illg->ig_actif); 673 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 674 if (bestill == NULL || 675 ill->ill_bound_cnt > bestill->ill_bound_cnt) { 676 bestill = ill; 677 } 678 } 679 return (bestill); 680 } 681 682 /* 683 * Return the active ill with the fewest number of data addresses in `illg'. 684 */ 685 static ill_t * 686 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) 687 { 688 ill_t *ill, *bestill = NULL; 689 690 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 691 692 ill = list_head(&illg->ig_actif); 693 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 694 if (bestill == NULL || 695 ill->ill_bound_cnt < bestill->ill_bound_cnt) { 696 if (ill->ill_bound_cnt == 0) 697 return (ill); /* can't get better */ 698 bestill = ill; 699 } 700 } 701 return (bestill); 702 } 703 704 /* 705 * Return a pointer to IPMP meta-interface for `illg' (which must exist). 706 * Since ig_ipmp_ill never changes for a given illg, no locks are needed. 707 */ 708 ill_t * 709 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) 710 { 711 return (illg->ig_ipmp_ill); 712 } 713 714 /* 715 * Return a pointer to the next available underlying ill in `illg', or NULL if 716 * one doesn't exist. Caller must be inside the IPSQ. 717 */ 718 ill_t * 719 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) 720 { 721 ill_t *ill; 722 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 723 724 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 725 726 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 727 if ((ill = illg->ig_next_ill) != NULL) { 728 illg->ig_next_ill = list_next(&illg->ig_actif, ill); 729 if (illg->ig_next_ill == NULL) 730 illg->ig_next_ill = list_head(&illg->ig_actif); 731 } 732 rw_exit(&ipst->ips_ipmp_lock); 733 734 return (ill); 735 } 736 737 /* 738 * Return a held pointer to the next available underlying ill in `illg', or 739 * NULL if one doesn't exist. Caller need not be inside the IPSQ. 740 */ 741 ill_t * 742 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) 743 { 744 ill_t *ill; 745 uint_t i; 746 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 747 748 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 749 for (i = 0; i < illg->ig_nactif; i++) { 750 ill = illg->ig_next_ill; 751 illg->ig_next_ill = list_next(&illg->ig_actif, ill); 752 if (illg->ig_next_ill == NULL) 753 illg->ig_next_ill = list_head(&illg->ig_actif); 754 755 if (ill_check_and_refhold(ill) == 0) { 756 rw_exit(&ipst->ips_ipmp_lock); 757 return (ill); 758 } 759 } 760 rw_exit(&ipst->ips_ipmp_lock); 761 762 return (NULL); 763 } 764 765 /* 766 * Return a pointer to the nominated multicast ill in `illg', or NULL if one 767 * doesn't exist. Caller must be inside the IPSQ. 768 */ 769 ill_t * 770 ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) 771 { 772 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 773 return (illg->ig_cast_ill); 774 } 775 776 /* 777 * Return a held pointer to the nominated multicast ill in `illg', or NULL if 778 * one doesn't exist. Caller need not be inside the IPSQ. 779 */ 780 ill_t * 781 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) 782 { 783 ill_t *castill; 784 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 785 786 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 787 castill = illg->ig_cast_ill; 788 if (castill != NULL && ill_check_and_refhold(castill) == 0) { 789 rw_exit(&ipst->ips_ipmp_lock); 790 return (castill); 791 } 792 rw_exit(&ipst->ips_ipmp_lock); 793 return (NULL); 794 } 795 796 /* 797 * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, 798 * any existing nomination is removed. Caller must be inside the IPSQ. 799 */ 800 static void 801 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) 802 { 803 ill_t *ocastill = illg->ig_cast_ill; 804 ill_t *ipmp_ill = illg->ig_ipmp_ill; 805 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 806 807 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 808 809 /* 810 * Disable old nominated ill (if any). 811 */ 812 if (ocastill != NULL) { 813 DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, 814 illg, ill_t *, ocastill); 815 ASSERT(ocastill->ill_nom_cast); 816 ocastill->ill_nom_cast = B_FALSE; 817 /* 818 * If the IPMP meta-interface is down, we never did the join, 819 * so we must not try to leave. 820 */ 821 if (ipmp_ill->ill_dl_up) 822 ill_leave_multicast(ipmp_ill); 823 } 824 825 /* 826 * Set new nomination. 827 */ 828 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 829 illg->ig_cast_ill = castill; 830 rw_exit(&ipst->ips_ipmp_lock); 831 832 if (ocastill != NULL) { 833 /* 834 * Delete any IREs tied to the old nomination. We must do 835 * this after the new castill is set and has reached global 836 * visibility since the datapath has not been quiesced. 837 */ 838 ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, 839 ill_stq_cache_delete, ocastill, ocastill); 840 } 841 842 /* 843 * Enable new nominated ill (if any). 844 */ 845 if (castill != NULL) { 846 DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, 847 illg, ill_t *, castill); 848 ASSERT(!castill->ill_nom_cast); 849 castill->ill_nom_cast = B_TRUE; 850 /* 851 * If the IPMP meta-interface is down, the attempt to recover 852 * will silently fail but ill_need_recover_multicast will be 853 * erroneously cleared -- so check first. 854 */ 855 if (ipmp_ill->ill_dl_up) 856 ill_recover_multicast(ipmp_ill); 857 } 858 859 /* 860 * For IPv4, refresh our broadcast IREs. This needs to be done even 861 * if there's no new nomination since ill_refresh_bcast() still must 862 * update the IPMP meta-interface's broadcast IREs to point back at 863 * the IPMP meta-interface itself. 864 */ 865 if (!ipmp_ill->ill_isv6) 866 ill_refresh_bcast(ipmp_ill); 867 } 868 869 /* 870 * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an 871 * entry for the same IP address already exists, destroy it first. Return the 872 * created IPMP ARP entry, or NULL on failure. 873 */ 874 ipmp_arpent_t * 875 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) 876 { 877 uchar_t *addrp; 878 area_t *area = (area_t *)mp->b_rptr; 879 ipmp_arpent_t *entp, *oentp; 880 881 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 882 ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); 883 884 if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) 885 return (NULL); 886 887 if ((mp = copyb(mp)) == NULL) { 888 kmem_free(entp, sizeof (ipmp_arpent_t)); 889 return (NULL); 890 } 891 892 DB_TYPE(mp) = M_PROTO; 893 entp->ia_area_mp = mp; 894 entp->ia_proxyarp = proxyarp; 895 addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, 896 sizeof (ipaddr_t)); 897 bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); 898 899 if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) 900 ipmp_illgrp_destroy_arpent(illg, oentp); 901 902 list_insert_head(&illg->ig_arpent, entp); 903 return (entp); 904 } 905 906 /* 907 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. 908 */ 909 void 910 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) 911 { 912 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 913 914 list_remove(&illg->ig_arpent, entp); 915 freeb(entp->ia_area_mp); 916 kmem_free(entp, sizeof (ipmp_arpent_t)); 917 } 918 919 /* 920 * Mark that ARP has been notified about the IP address on `entp'; `illg' is 921 * taken as a debugging aid for DTrace FBT probes. 922 */ 923 /* ARGSUSED */ 924 void 925 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) 926 { 927 entp->ia_notified = B_TRUE; 928 } 929 930 /* 931 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is 932 * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. 933 */ 934 ipmp_arpent_t * 935 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) 936 { 937 ipmp_arpent_t *entp = list_head(&illg->ig_arpent); 938 939 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 940 941 if (addrp == NULL) 942 return (entp); 943 944 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) 945 if (entp->ia_ipaddr == *addrp) 946 break; 947 return (entp); 948 } 949 950 /* 951 * Refresh ARP entries on `illg' to be distributed across its active 952 * interfaces. Entries that cannot be refreshed (e.g., because there are no 953 * active interfaces) are marked so that subsequent calls can try again. 954 */ 955 void 956 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) 957 { 958 ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; 959 uint_t paddrlen = ipmp_ill->ill_phys_addr_length; 960 area_t *area; 961 mblk_t *area_mp; 962 uchar_t *physaddr; 963 ipmp_arpent_t *entp; 964 965 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 966 ASSERT(!ipmp_ill->ill_isv6); 967 968 ill = list_head(&illg->ig_actif); 969 entp = list_head(&illg->ig_arpent); 970 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { 971 if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { 972 entp->ia_notified = B_FALSE; 973 continue; 974 } 975 976 area = (area_t *)entp->ia_area_mp->b_rptr; 977 ASSERT(paddrlen == ill->ill_phys_addr_length); 978 ASSERT(paddrlen == area->area_hw_addr_length); 979 physaddr = mi_offset_paramc(entp->ia_area_mp, 980 area->area_hw_addr_offset, paddrlen); 981 982 /* 983 * If this is a proxy ARP entry, we can skip notifying ARP if 984 * the entry is already up-to-date. If it has changed, we 985 * update the entry's hardware address before notifying ARP. 986 */ 987 if (entp->ia_proxyarp) { 988 if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && 989 entp->ia_notified) 990 continue; 991 bcopy(ill->ill_phys_addr, physaddr, paddrlen); 992 } 993 994 if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { 995 entp->ia_notified = B_FALSE; 996 continue; 997 } 998 999 putnext(ipmp_ill->ill_rq, area_mp); 1000 ipmp_illgrp_mark_arpent(illg, entp); 1001 1002 if ((ill = list_next(&illg->ig_actif, ill)) == NULL) 1003 ill = list_head(&illg->ig_actif); 1004 } 1005 } 1006 1007 /* 1008 * Return an interface in `illg' with the specified `physaddr', or NULL if one 1009 * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. 1010 */ 1011 ill_t * 1012 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) 1013 { 1014 ill_t *ill; 1015 ill_t *ipmp_ill = illg->ig_ipmp_ill; 1016 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1017 1018 ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 1019 1020 ill = list_head(&illg->ig_if); 1021 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 1022 if (ill->ill_phys_addr_length == paddrlen && 1023 bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) 1024 return (ill); 1025 } 1026 return (NULL); 1027 } 1028 1029 /* 1030 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. 1031 * Caller must be inside the IPSQ unless this is initialization. 1032 */ 1033 static void 1034 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) 1035 { 1036 ill_t *ill = illg->ig_ipmp_ill; 1037 mblk_t *mp; 1038 1039 ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); 1040 1041 /* 1042 * If allocation fails, we have bigger problems than MTU. 1043 */ 1044 if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { 1045 illg->ig_mtu = mtu; 1046 put(ill->ill_rq, mp); 1047 } 1048 } 1049 1050 /* 1051 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP 1052 * ill MTU if necessary. 1053 */ 1054 void 1055 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) 1056 { 1057 ill_t *ill; 1058 ill_t *ipmp_ill = illg->ig_ipmp_ill; 1059 uint_t mtu = 0; 1060 1061 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 1062 1063 /* 1064 * Since ill_max_mtu can only change under ill_lock, we hold ill_lock 1065 * for each ill as we iterate through the list. Any changes to the 1066 * ill_max_mtu will also trigger an update, so even if we missed it 1067 * this time around, the update will catch it. 1068 */ 1069 ill = list_head(&illg->ig_if); 1070 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 1071 mutex_enter(&ill->ill_lock); 1072 if (mtu == 0 || ill->ill_max_mtu < mtu) 1073 mtu = ill->ill_max_mtu; 1074 mutex_exit(&ill->ill_lock); 1075 } 1076 1077 /* 1078 * MTU must be at least the minimum MTU. 1079 */ 1080 mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); 1081 1082 if (illg->ig_mtu != mtu) 1083 ipmp_illgrp_set_mtu(illg, mtu); 1084 } 1085 1086 /* 1087 * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently 1088 * allow the same link to be established more than once. 1089 */ 1090 void 1091 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) 1092 { 1093 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1094 1095 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 1096 1097 if (illg->ig_ipmp_ill->ill_isv6) { 1098 ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); 1099 grp->gr_v6 = illg; 1100 } else { 1101 ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); 1102 grp->gr_v4 = illg; 1103 } 1104 } 1105 1106 /* 1107 * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp 1108 * cannot be unlinked (e.g., because there are still interfaces using it). 1109 */ 1110 int 1111 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) 1112 { 1113 ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; 1114 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1115 1116 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 1117 1118 if (illg->ig_ipmp_ill->ill_isv6) { 1119 if (grp->gr_nv6 + grp->gr_pendv6 != 0) 1120 return (EBUSY); 1121 grp->gr_v6 = NULL; 1122 } else { 1123 if (grp->gr_nv4 + grp->gr_pendv4 != 0) 1124 return (EBUSY); 1125 grp->gr_v4 = NULL; 1126 } 1127 return (0); 1128 } 1129 1130 /* 1131 * Place `ill' into `illg', and rebalance the data addresses on `illg' 1132 * to be spread evenly across the ills now in it. Also, adjust the IPMP 1133 * ill as necessary to account for `ill' (e.g., MTU). 1134 */ 1135 void 1136 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) 1137 { 1138 ill_t *ipmp_ill; 1139 ipif_t *ipif; 1140 ip_stack_t *ipst = ill->ill_ipst; 1141 1142 /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ 1143 ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); 1144 ASSERT(IAM_WRITER_ILL(ill)); 1145 ASSERT(ill->ill_grp == NULL); 1146 1147 ipmp_ill = illg->ig_ipmp_ill; 1148 1149 /* 1150 * Account for `ill' joining the illgrp. 1151 */ 1152 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1153 if (ill->ill_isv6) 1154 ill->ill_phyint->phyint_grp->gr_nv6++; 1155 else 1156 ill->ill_phyint->phyint_grp->gr_nv4++; 1157 rw_exit(&ipst->ips_ipmp_lock); 1158 1159 /* 1160 * Ensure the ILLF_ROUTER flag remains consistent across the group. 1161 */ 1162 mutex_enter(&ill->ill_lock); 1163 if (ipmp_ill->ill_flags & ILLF_ROUTER) 1164 ill->ill_flags |= ILLF_ROUTER; 1165 else 1166 ill->ill_flags &= ~ILLF_ROUTER; 1167 mutex_exit(&ill->ill_lock); 1168 1169 /* 1170 * Blow away all multicast memberships that currently exist on `ill'. 1171 * This may seem odd, but it's consistent with the application view 1172 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). 1173 */ 1174 if (ill->ill_isv6) { 1175 reset_conn_ill(ill); 1176 reset_mrt_ill(ill); 1177 } else { 1178 ipif = ill->ill_ipif; 1179 for (; ipif != NULL; ipif = ipif->ipif_next) { 1180 reset_conn_ipif(ipif); 1181 reset_mrt_vif_ipif(ipif); 1182 } 1183 } 1184 ip_purge_allmulti(ill); 1185 1186 /* 1187 * Borrow the first ill's ill_phys_addr_length value for the illgrp's 1188 * physical address length. All other ills must have the same value, 1189 * since they are required to all be the same mactype. Also update 1190 * the IPMP ill's MTU and CoS marking, if necessary. 1191 */ 1192 if (list_is_empty(&illg->ig_if)) { 1193 ASSERT(ipmp_ill->ill_phys_addr_length == 0); 1194 /* 1195 * NOTE: we leave ill_phys_addr NULL since the IPMP group 1196 * doesn't have a physical address. This means that code must 1197 * not assume that ill_phys_addr is non-NULL just because 1198 * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. 1199 */ 1200 ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; 1201 ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; 1202 ipmp_ill->ill_type = ill->ill_type; 1203 1204 if (ill->ill_flags & ILLF_COS_ENABLED) { 1205 mutex_enter(&ipmp_ill->ill_lock); 1206 ipmp_ill->ill_flags |= ILLF_COS_ENABLED; 1207 mutex_exit(&ipmp_ill->ill_lock); 1208 } 1209 ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); 1210 } else { 1211 ASSERT(ipmp_ill->ill_phys_addr_length == 1212 ill->ill_phys_addr_length); 1213 ASSERT(ipmp_ill->ill_type == ill->ill_type); 1214 1215 if (!(ill->ill_flags & ILLF_COS_ENABLED)) { 1216 mutex_enter(&ipmp_ill->ill_lock); 1217 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; 1218 mutex_exit(&ipmp_ill->ill_lock); 1219 } 1220 if (illg->ig_mtu > ill->ill_max_mtu) 1221 ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); 1222 } 1223 1224 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 1225 list_insert_tail(&illg->ig_if, ill); 1226 ill->ill_grp = illg; 1227 rw_exit(&ipst->ips_ill_g_lock); 1228 1229 /* 1230 * Hide the IREs on `ill' so that we don't accidentally find them when 1231 * sending data traffic. 1232 */ 1233 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); 1234 1235 /* 1236 * Merge any broadcast IREs, if need be. 1237 */ 1238 if (!ill->ill_isv6) 1239 ill_refresh_bcast(ill); 1240 1241 ipmp_ill_refresh_active(ill); 1242 } 1243 1244 /* 1245 * Remove `ill' from its illgrp, and rebalance the data addresses in that 1246 * illgrp to be spread evenly across the remaining ills. Also, adjust the 1247 * IPMP ill as necessary now that `ill' is removed (e.g., MTU). 1248 */ 1249 void 1250 ipmp_ill_leave_illgrp(ill_t *ill) 1251 { 1252 ill_t *ipmp_ill; 1253 ipif_t *ipif; 1254 ipmp_arpent_t *entp; 1255 ipmp_illgrp_t *illg = ill->ill_grp; 1256 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1257 1258 ASSERT(IS_UNDER_IPMP(ill)); 1259 ASSERT(IAM_WRITER_ILL(ill)); 1260 ASSERT(illg != NULL); 1261 1262 ipmp_ill = illg->ig_ipmp_ill; 1263 1264 /* 1265 * Cancel IPMP-specific ill timeouts. 1266 */ 1267 (void) untimeout(ill->ill_refresh_tid); 1268 1269 /* 1270 * Expose any previously-hidden IREs on `ill'. 1271 */ 1272 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); 1273 1274 /* 1275 * Ensure the multicast state for each ipif on `ill' is down so that 1276 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin 1277 * all eligible groups. 1278 */ 1279 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1280 if (ipif->ipif_flags & IPIF_UP) 1281 ipif_multicast_down(ipif); 1282 1283 /* 1284 * Account for `ill' leaving the illgrp. 1285 */ 1286 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1287 if (ill->ill_isv6) 1288 ill->ill_phyint->phyint_grp->gr_nv6--; 1289 else 1290 ill->ill_phyint->phyint_grp->gr_nv4--; 1291 rw_exit(&ipst->ips_ipmp_lock); 1292 1293 /* 1294 * Pull `ill' out of the interface lists. 1295 */ 1296 if (list_link_active(&ill->ill_actnode)) 1297 ipmp_ill_deactivate(ill); 1298 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 1299 list_remove(&illg->ig_if, ill); 1300 ill->ill_grp = NULL; 1301 rw_exit(&ipst->ips_ill_g_lock); 1302 1303 /* 1304 * Recreate any broadcast IREs that had been shared, if need be. 1305 */ 1306 if (!ill->ill_isv6) 1307 ill_refresh_bcast(ill); 1308 1309 /* 1310 * Re-establish multicast memberships that were previously being 1311 * handled by the IPMP meta-interface. 1312 */ 1313 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1314 if (ipif->ipif_flags & IPIF_UP) 1315 ipif_multicast_up(ipif); 1316 1317 /* 1318 * Refresh the group MTU based on the new interface list. 1319 */ 1320 ipmp_illgrp_refresh_mtu(illg); 1321 1322 if (list_is_empty(&illg->ig_if)) { 1323 /* 1324 * No ills left in the illgrp; we no longer have a physical 1325 * address length, nor can we support ARP, CoS, or anything 1326 * else that depends on knowing the link layer type. 1327 */ 1328 while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) 1329 ipmp_illgrp_destroy_arpent(illg, entp); 1330 1331 ipmp_ill->ill_phys_addr_length = 0; 1332 ipmp_ill->ill_nd_lla_len = 0; 1333 ipmp_ill->ill_type = IFT_OTHER; 1334 mutex_enter(&ipmp_ill->ill_lock); 1335 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; 1336 mutex_exit(&ipmp_ill->ill_lock); 1337 } else { 1338 /* 1339 * If `ill' didn't support CoS, see if it can now be enabled. 1340 */ 1341 if (!(ill->ill_flags & ILLF_COS_ENABLED)) { 1342 ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); 1343 1344 ill = list_head(&illg->ig_if); 1345 do { 1346 if (!(ill->ill_flags & ILLF_COS_ENABLED)) 1347 break; 1348 } while ((ill = list_next(&illg->ig_if, ill)) != NULL); 1349 1350 if (ill == NULL) { 1351 mutex_enter(&ipmp_ill->ill_lock); 1352 ipmp_ill->ill_flags |= ILLF_COS_ENABLED; 1353 mutex_exit(&ipmp_ill->ill_lock); 1354 } 1355 } 1356 } 1357 } 1358 1359 /* 1360 * Check if `ill' should be active, and activate or deactivate if need be. 1361 * Return B_FALSE if a refresh was necessary but could not be performed. 1362 */ 1363 static boolean_t 1364 ipmp_ill_try_refresh_active(ill_t *ill) 1365 { 1366 boolean_t refreshed = B_TRUE; 1367 1368 ASSERT(IAM_WRITER_ILL(ill)); 1369 ASSERT(IS_UNDER_IPMP(ill)); 1370 1371 if (ipmp_ill_is_active(ill)) { 1372 if (!list_link_active(&ill->ill_actnode)) 1373 refreshed = ipmp_ill_activate(ill); 1374 } else { 1375 if (list_link_active(&ill->ill_actnode)) 1376 ipmp_ill_deactivate(ill); 1377 } 1378 1379 return (refreshed); 1380 } 1381 1382 /* 1383 * Check if `ill' should be active, and activate or deactivate if need be. 1384 * If the refresh fails, schedule a timer to try again later. 1385 */ 1386 void 1387 ipmp_ill_refresh_active(ill_t *ill) 1388 { 1389 if (!ipmp_ill_try_refresh_active(ill)) 1390 ipmp_ill_refresh_active_timer_start(ill); 1391 } 1392 1393 /* 1394 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. 1395 */ 1396 static void 1397 ipmp_ill_refresh_active_timer(void *ill_arg) 1398 { 1399 ill_t *ill = ill_arg; 1400 boolean_t refreshed = B_FALSE; 1401 1402 /* 1403 * Clear ill_refresh_tid to indicate that no timeout is pending 1404 * (another thread could schedule a new timeout while we're still 1405 * running, but that's harmless). If the ill is going away, bail. 1406 */ 1407 mutex_enter(&ill->ill_lock); 1408 ill->ill_refresh_tid = 0; 1409 if (ill->ill_state_flags & ILL_CONDEMNED) { 1410 mutex_exit(&ill->ill_lock); 1411 return; 1412 } 1413 mutex_exit(&ill->ill_lock); 1414 1415 if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { 1416 refreshed = ipmp_ill_try_refresh_active(ill); 1417 ipsq_exit(ill->ill_phyint->phyint_ipsq); 1418 } 1419 1420 /* 1421 * If the refresh failed, schedule another attempt. 1422 */ 1423 if (!refreshed) 1424 ipmp_ill_refresh_active_timer_start(ill); 1425 } 1426 1427 /* 1428 * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. 1429 */ 1430 static void 1431 ipmp_ill_refresh_active_timer_start(ill_t *ill) 1432 { 1433 mutex_enter(&ill->ill_lock); 1434 1435 /* 1436 * If the ill is going away or a refresh is already scheduled, bail. 1437 */ 1438 if (ill->ill_refresh_tid != 0 || 1439 (ill->ill_state_flags & ILL_CONDEMNED)) { 1440 mutex_exit(&ill->ill_lock); 1441 return; 1442 } 1443 1444 ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, 1445 SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); 1446 1447 mutex_exit(&ill->ill_lock); 1448 } 1449 1450 /* 1451 * Activate `ill' so it will be used to send and receive data traffic. Return 1452 * B_FALSE if `ill' cannot be activated. Note that we allocate any messages 1453 * needed to deactivate `ill' here as well so that deactivation cannot fail. 1454 */ 1455 static boolean_t 1456 ipmp_ill_activate(ill_t *ill) 1457 { 1458 ipif_t *ipif; 1459 mblk_t *actmp = NULL, *deactmp = NULL; 1460 mblk_t *linkupmp = NULL, *linkdownmp = NULL; 1461 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; 1462 const char *grifname = grp->gr_ifname; 1463 ipmp_illgrp_t *illg = ill->ill_grp; 1464 ill_t *maxill; 1465 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1466 1467 ASSERT(IAM_WRITER_ILL(ill)); 1468 ASSERT(IS_UNDER_IPMP(ill)); 1469 1470 /* 1471 * If this will be the first active interface in the group, allocate 1472 * the link-up and link-down messages. 1473 */ 1474 if (grp->gr_nactif == 0) { 1475 linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); 1476 linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); 1477 if (linkupmp == NULL || linkdownmp == NULL) 1478 goto fail; 1479 } 1480 1481 /* 1482 * For IPv4, allocate the activate/deactivate messages, and tell ARP. 1483 */ 1484 if (!ill->ill_isv6) { 1485 actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); 1486 deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); 1487 if (actmp == NULL || deactmp == NULL) 1488 goto fail; 1489 1490 ASSERT(ill->ill_ardeact_mp == NULL); 1491 ill->ill_ardeact_mp = deactmp; 1492 putnext(illg->ig_ipmp_ill->ill_rq, actmp); 1493 } 1494 1495 if (list_is_empty(&illg->ig_actif)) { 1496 /* 1497 * Now that we have an active ill, nominate it for multicast 1498 * and broadcast duties. Do this before ipmp_ill_bind_ipif() 1499 * since that may need to send multicast packets (e.g., IPv6 1500 * neighbor discovery probes). 1501 */ 1502 ipmp_illgrp_set_cast(illg, ill); 1503 1504 /* 1505 * This is the first active ill in the illgrp -- add 'em all. 1506 * We can access/walk ig_ipmp_ill's ipif list since we're 1507 * writer on its IPSQ as well. 1508 */ 1509 ipif = illg->ig_ipmp_ill->ill_ipif; 1510 for (; ipif != NULL; ipif = ipif->ipif_next) 1511 if (ipmp_ipif_is_up_dataaddr(ipif)) 1512 ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); 1513 } else { 1514 /* 1515 * Redistribute the addresses by moving them from the ill with 1516 * the most addresses until the ill being activated is at the 1517 * same level as the rest of the ills. 1518 */ 1519 for (;;) { 1520 maxill = ipmp_illgrp_max_ill(illg); 1521 ASSERT(maxill != NULL); 1522 if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) 1523 break; 1524 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); 1525 ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); 1526 } 1527 1528 /* 1529 * TODO: explore whether it's advantageous to flush IRE_CACHE 1530 * bindings to force existing connections to be redistributed 1531 * to the new ill. 1532 */ 1533 } 1534 1535 /* 1536 * Put the interface in the active list. 1537 */ 1538 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1539 list_insert_tail(&illg->ig_actif, ill); 1540 illg->ig_nactif++; 1541 illg->ig_next_ill = ill; 1542 rw_exit(&ipst->ips_ipmp_lock); 1543 1544 /* 1545 * Refresh ARP entries to use `ill', if need be. 1546 */ 1547 if (!ill->ill_isv6) 1548 ipmp_illgrp_refresh_arpent(illg); 1549 1550 /* 1551 * Finally, mark the group link up, if necessary. 1552 */ 1553 if (grp->gr_nactif++ == 0) { 1554 ASSERT(grp->gr_linkdownmp == NULL); 1555 grp->gr_linkdownmp = linkdownmp; 1556 put(illg->ig_ipmp_ill->ill_rq, linkupmp); 1557 } 1558 return (B_TRUE); 1559 fail: 1560 freemsg(actmp); 1561 freemsg(deactmp); 1562 freemsg(linkupmp); 1563 freemsg(linkdownmp); 1564 return (B_FALSE); 1565 } 1566 1567 /* 1568 * Deactivate `ill' so it will not be used to send or receive data traffic. 1569 */ 1570 static void 1571 ipmp_ill_deactivate(ill_t *ill) 1572 { 1573 ill_t *minill; 1574 ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; 1575 mblk_t *mp; 1576 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; 1577 ipmp_illgrp_t *illg = ill->ill_grp; 1578 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1579 1580 ASSERT(IAM_WRITER_ILL(ill)); 1581 ASSERT(IS_UNDER_IPMP(ill)); 1582 1583 /* 1584 * Delete all IRE_CACHE entries for the group. (We cannot restrict 1585 * ourselves to entries with ire_stq == ill since there may be other 1586 * IREs that are backed by ACEs that are tied to this ill -- and thus 1587 * when those ACEs are deleted, the IREs will be adrift without any 1588 * AR_CN_ANNOUNCE notification from ARP.) 1589 */ 1590 if (ill->ill_isv6) 1591 ire_walk_v6(ill_grp_cache_delete, ill, ALL_ZONES, ipst); 1592 else 1593 ire_walk_v4(ill_grp_cache_delete, ill, ALL_ZONES, ipst); 1594 1595 /* 1596 * Pull the interface out of the active list. 1597 */ 1598 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1599 list_remove(&illg->ig_actif, ill); 1600 illg->ig_nactif--; 1601 illg->ig_next_ill = list_head(&illg->ig_actif); 1602 rw_exit(&ipst->ips_ipmp_lock); 1603 1604 /* 1605 * If the ill that's being deactivated had been nominated for 1606 * multicast/broadcast, nominate a new one. 1607 */ 1608 if (ill == illg->ig_cast_ill) 1609 ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); 1610 1611 /* 1612 * Unbind all of the ipifs bound to this ill, and save 'em in a list; 1613 * we'll rebind them after we tell the resolver the ill is no longer 1614 * active. We must do things in this order or the resolver could 1615 * accidentally rebind to the ill we're trying to remove if multiple 1616 * ills in the group have the same hardware address (which is 1617 * unsupported, but shouldn't lead to a wedged machine). 1618 */ 1619 while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { 1620 ipif->ipif_bound_next = ubheadipif; 1621 ubheadipif = ipif; 1622 } 1623 1624 if (!ill->ill_isv6) { 1625 /* 1626 * Tell ARP `ill' is no longer active in the group. 1627 */ 1628 mp = ill->ill_ardeact_mp; 1629 ill->ill_ardeact_mp = NULL; 1630 ASSERT(mp != NULL); 1631 putnext(illg->ig_ipmp_ill->ill_rq, mp); 1632 1633 /* 1634 * Refresh any ARP entries that had been using `ill'. 1635 */ 1636 ipmp_illgrp_refresh_arpent(illg); 1637 } 1638 1639 /* 1640 * Rebind each ipif from the deactivated ill to the active ill with 1641 * the fewest ipifs. If there are no active ills, the ipifs will 1642 * remain unbound. 1643 */ 1644 for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { 1645 ubnextipif = ipif->ipif_bound_next; 1646 ipif->ipif_bound_next = NULL; 1647 1648 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) 1649 ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); 1650 } 1651 1652 /* 1653 * Finally, mark the group link down, if necessary. 1654 */ 1655 if (--grp->gr_nactif == 0) { 1656 mp = grp->gr_linkdownmp; 1657 grp->gr_linkdownmp = NULL; 1658 ASSERT(mp != NULL); 1659 put(illg->ig_ipmp_ill->ill_rq, mp); 1660 } 1661 } 1662 1663 /* 1664 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) 1665 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. 1666 */ 1667 static void 1668 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) 1669 { 1670 ipif_t *ipif; 1671 1672 ASSERT(IAM_WRITER_ILL(ill)); 1673 ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); 1674 1675 /* 1676 * If `ill' is truly down, there are no messages to generate since: 1677 * 1678 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface 1679 * and its addresses by bringing them down. But that's already 1680 * true, so there's nothing to hide. 1681 * 1682 * 2. If cmd == RTM_ADD, then we're supposed to generate messages 1683 * indicating that any previously-hidden up addresses are again 1684 * back up (along with the interface). But they aren't, so 1685 * there's nothing to expose. 1686 */ 1687 if (ill->ill_ipif_up_count == 0) 1688 return; 1689 1690 if (cmd == RTM_ADD) 1691 ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); 1692 1693 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1694 if (ipif->ipif_flags & IPIF_UP) 1695 ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); 1696 1697 if (cmd == RTM_DELETE) 1698 ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); 1699 } 1700 1701 /* 1702 * Bind the address named by `ipif' to the underlying ill named by `ill'. 1703 * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' 1704 * will indicate to the resolver whether this is an initial bringup of 1705 * `ipif', or just a rebind to another ill. 1706 */ 1707 static void 1708 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) 1709 { 1710 int err = 0; 1711 ip_stack_t *ipst = ill->ill_ipst; 1712 1713 ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); 1714 ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); 1715 ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); 1716 ASSERT(ipif->ipif_bound_ill == NULL); 1717 ASSERT(ipif->ipif_bound_next == NULL); 1718 1719 ipif->ipif_bound_next = ill->ill_bound_ipif; 1720 ill->ill_bound_ipif = ipif; 1721 ill->ill_bound_cnt++; 1722 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1723 ipif->ipif_bound_ill = ill; 1724 rw_exit(&ipst->ips_ipmp_lock); 1725 1726 /* 1727 * If necessary, tell ARP/NDP about the new mapping. Note that 1728 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. 1729 */ 1730 if (act != Res_act_none) { 1731 if (ill->ill_isv6) { 1732 VERIFY(ipif_resolver_up(ipif, act) == 0); 1733 err = ipif_ndp_up(ipif, act == Res_act_initial); 1734 } else { 1735 err = ipif_resolver_up(ipif, act); 1736 } 1737 1738 /* 1739 * Since ipif_ndp_up() never returns EINPROGRESS and 1740 * ipif_resolver_up() only returns EINPROGRESS when the 1741 * associated ill is not up, we should never be here with 1742 * EINPROGRESS. We rely on this to simplify the design. 1743 */ 1744 ASSERT(err != EINPROGRESS); 1745 } 1746 /* TODO: retry binding on failure? when? */ 1747 ipif->ipif_bound = (err == 0); 1748 } 1749 1750 /* 1751 * Unbind the address named by `ipif' from the underlying ill named by `ill'. 1752 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. 1753 * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is 1754 * B_TRUE, notify the resolver about the change. 1755 */ 1756 static ipif_t * 1757 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) 1758 { 1759 ill_t *ipmp_ill; 1760 ipif_t *previpif; 1761 ip_stack_t *ipst = ill->ill_ipst; 1762 1763 ASSERT(IAM_WRITER_ILL(ill)); 1764 ASSERT(IS_UNDER_IPMP(ill)); 1765 1766 ipmp_ill = ill->ill_grp->ig_ipmp_ill; 1767 1768 /* 1769 * If necessary, find an ipif to unbind. 1770 */ 1771 if (ipif == NULL) { 1772 if ((ipif = ill->ill_bound_ipif) == NULL) { 1773 ASSERT(ill->ill_bound_cnt == 0); 1774 return (NULL); 1775 } 1776 } 1777 1778 ASSERT(IAM_WRITER_IPIF(ipif)); 1779 ASSERT(IS_IPMP(ipif->ipif_ill)); 1780 ASSERT(ipif->ipif_bound_ill == ill); 1781 ASSERT(ill->ill_bound_cnt > 0); 1782 1783 /* 1784 * Unbind it. 1785 */ 1786 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1787 ipif->ipif_bound_ill = NULL; 1788 rw_exit(&ipst->ips_ipmp_lock); 1789 ill->ill_bound_cnt--; 1790 1791 if (ill->ill_bound_ipif == ipif) { 1792 ill->ill_bound_ipif = ipif->ipif_bound_next; 1793 } else { 1794 previpif = ill->ill_bound_ipif; 1795 while (previpif->ipif_bound_next != ipif) 1796 previpif = previpif->ipif_bound_next; 1797 1798 previpif->ipif_bound_next = ipif->ipif_bound_next; 1799 } 1800 ipif->ipif_bound_next = NULL; 1801 1802 /* 1803 * If requested, notify the resolvers (provided we're bound). 1804 */ 1805 if (notifyres && ipif->ipif_bound) { 1806 if (ill->ill_isv6) { 1807 ipif_ndp_down(ipif); 1808 } else { 1809 ASSERT(ipif->ipif_arp_del_mp != NULL); 1810 putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); 1811 ipif->ipif_arp_del_mp = NULL; 1812 } 1813 } 1814 ipif->ipif_bound = B_FALSE; 1815 1816 return (ipif); 1817 } 1818 1819 /* 1820 * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if 1821 * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this 1822 * to determine whether an ill should be considered active, other consumers 1823 * may race and learn about an ill that should be deactivated/activated before 1824 * IPMP has performed the activation/deactivation. This should be safe though 1825 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that 1826 * would've been cleaned up by ipmp_ill_deactivate(). 1827 */ 1828 boolean_t 1829 ipmp_ill_is_active(ill_t *ill) 1830 { 1831 phyint_t *phyi = ill->ill_phyint; 1832 1833 ASSERT(IS_UNDER_IPMP(ill)); 1834 ASSERT(IAM_WRITER_ILL(ill) || 1835 (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); 1836 1837 /* 1838 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to 1839 * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the 1840 * link flapping logic to be just in in.mpathd and allows us to ignore 1841 * changes to PHYI_RUNNING. 1842 */ 1843 return (!(ill->ill_ipif_up_count == 0 || 1844 (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); 1845 } 1846 1847 /* 1848 * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet 1849 * IREs with a source address on `ill_arg'. 1850 */ 1851 static void 1852 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) 1853 { 1854 ill_t *ill = (ill_t *)ill_arg; 1855 1856 ASSERT(IAM_WRITER_ILL(ill)); 1857 ASSERT(!IS_IPMP(ill)); 1858 1859 if (ire->ire_ipif->ipif_ill != ill) 1860 return; 1861 1862 switch (ire->ire_type) { 1863 case IRE_HOST: 1864 case IRE_PREFIX: 1865 case IRE_DEFAULT: 1866 case IRE_CACHE: 1867 case IRE_IF_RESOLVER: 1868 case IRE_IF_NORESOLVER: 1869 DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); 1870 ire->ire_marks |= IRE_MARK_TESTHIDDEN; 1871 break; 1872 default: 1873 break; 1874 } 1875 } 1876 1877 /* 1878 * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source 1879 * address on `ill_arg'. 1880 */ 1881 static void 1882 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) 1883 { 1884 ill_t *ill = (ill_t *)ill_arg; 1885 1886 ASSERT(IAM_WRITER_ILL(ill)); 1887 ASSERT(!IS_IPMP(ill)); 1888 1889 if (ire->ire_ipif->ipif_ill == ill) { 1890 DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); 1891 ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; 1892 } 1893 } 1894 1895 /* 1896 * Return a held pointer to the IPMP ill for underlying interface `ill', or 1897 * NULL if one doesn't exist. (Unfortunately, this function needs to take an 1898 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's 1899 * ill_grp pointer may become stale when not inside an IPSQ and not holding 1900 * ipmp_lock.) Caller need not be inside the IPSQ. 1901 */ 1902 ill_t * 1903 ipmp_ill_hold_ipmp_ill(ill_t *ill) 1904 { 1905 ip_stack_t *ipst = ill->ill_ipst; 1906 ipmp_illgrp_t *illg; 1907 1908 ASSERT(!IS_IPMP(ill)); 1909 1910 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 1911 illg = ill->ill_grp; 1912 if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill) == 0) { 1913 rw_exit(&ipst->ips_ipmp_lock); 1914 return (illg->ig_ipmp_ill); 1915 } 1916 /* 1917 * Assume `ill' was removed from the illgrp in the meantime. 1918 */ 1919 rw_exit(&ill->ill_ipst->ips_ipmp_lock); 1920 return (NULL); 1921 } 1922 1923 /* 1924 * Return the interface index for the IPMP ill tied to underlying interface 1925 * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. 1926 */ 1927 uint_t 1928 ipmp_ill_get_ipmp_ifindex(const ill_t *ill) 1929 { 1930 uint_t ifindex = 0; 1931 ip_stack_t *ipst = ill->ill_ipst; 1932 ipmp_grp_t *grp; 1933 1934 ASSERT(!IS_IPMP(ill)); 1935 1936 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 1937 if ((grp = ill->ill_phyint->phyint_grp) != NULL) 1938 ifindex = grp->gr_phyint->phyint_ifindex; 1939 rw_exit(&ipst->ips_ipmp_lock); 1940 return (ifindex); 1941 } 1942 1943 /* 1944 * Place phyint `phyi' into IPMP group `grp'. 1945 */ 1946 void 1947 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) 1948 { 1949 ill_t *ill; 1950 ipsq_t *ipsq = phyi->phyint_ipsq; 1951 ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; 1952 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 1953 1954 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1955 ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); 1956 1957 /* 1958 * Send routing socket messages indicating that the phyint's ills 1959 * and ipifs vanished. 1960 */ 1961 if (phyi->phyint_illv4 != NULL) { 1962 ill = phyi->phyint_illv4; 1963 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); 1964 } 1965 1966 if (phyi->phyint_illv6 != NULL) { 1967 ill = phyi->phyint_illv6; 1968 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); 1969 } 1970 1971 /* 1972 * Snapshot the phyint's initial kstats as a baseline. 1973 */ 1974 ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0); 1975 1976 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1977 1978 phyi->phyint_grp = grp; 1979 if (++grp->gr_nif == 1) 1980 grp->gr_mactype = ill->ill_mactype; 1981 else 1982 ASSERT(grp->gr_mactype == ill->ill_mactype); 1983 1984 /* 1985 * Now that we're in the group, request a switch to the group's xop 1986 * when we ipsq_exit(). All future operations will be exclusive on 1987 * the group xop until ipmp_phyint_leave_grp() is called. 1988 */ 1989 ASSERT(ipsq->ipsq_swxop == NULL); 1990 ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop); 1991 ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop; 1992 1993 rw_exit(&ipst->ips_ipmp_lock); 1994 } 1995 1996 /* 1997 * Remove phyint `phyi' from its current IPMP group. 1998 */ 1999 void 2000 ipmp_phyint_leave_grp(phyint_t *phyi) 2001 { 2002 uint_t i; 2003 ipsq_t *ipsq = phyi->phyint_ipsq; 2004 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 2005 uint64_t phyi_kstats[IPMP_KSTAT_MAX]; 2006 2007 ASSERT(IAM_WRITER_IPSQ(ipsq)); 2008 2009 /* 2010 * If any of the phyint's ills are still in an illgrp, kick 'em out. 2011 */ 2012 if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4)) 2013 ipmp_ill_leave_illgrp(phyi->phyint_illv4); 2014 if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6)) 2015 ipmp_ill_leave_illgrp(phyi->phyint_illv6); 2016 2017 /* 2018 * Send routing socket messages indicating that the phyint's ills 2019 * and ipifs have reappeared. 2020 */ 2021 if (phyi->phyint_illv4 != NULL) 2022 ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD); 2023 if (phyi->phyint_illv6 != NULL) 2024 ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD); 2025 2026 /* 2027 * Calculate the phyint's cumulative kstats while it was in the group, 2028 * and add that to the group's baseline. 2029 */ 2030 ipmp_phyint_get_kstats(phyi, phyi_kstats); 2031 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 2032 phyi_kstats[i] -= phyi->phyint_kstats0[i]; 2033 atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]); 2034 } 2035 2036 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 2037 2038 phyi->phyint_grp->gr_nif--; 2039 phyi->phyint_grp = NULL; 2040 2041 /* 2042 * As our final act in leaving the group, request a switch back to our 2043 * IPSQ's own xop when we ipsq_exit(). 2044 */ 2045 ASSERT(ipsq->ipsq_swxop == NULL); 2046 ipsq->ipsq_swxop = &ipsq->ipsq_ownxop; 2047 2048 rw_exit(&ipst->ips_ipmp_lock); 2049 } 2050 2051 /* 2052 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'. 2053 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements. 2054 */ 2055 static void 2056 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[]) 2057 { 2058 uint_t i, j; 2059 const char *name; 2060 kstat_t *ksp; 2061 kstat_named_t *kn; 2062 2063 bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX); 2064 2065 /* 2066 * NOTE: ALL_ZONES here assumes that there's at most one link 2067 * with a given name on a given system (safe for now). 2068 */ 2069 ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES); 2070 if (ksp == NULL) 2071 return; 2072 2073 KSTAT_ENTER(ksp); 2074 2075 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { 2076 /* 2077 * Bring kstats up-to-date before recording. 2078 */ 2079 (void) KSTAT_UPDATE(ksp, KSTAT_READ); 2080 2081 kn = KSTAT_NAMED_PTR(ksp); 2082 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 2083 name = ipmp_kstats[i].name; 2084 kstats[i] = 0; 2085 for (j = 0; j < ksp->ks_ndata; j++) { 2086 if (strcmp(kn[j].name, name) != 0) 2087 continue; 2088 2089 switch (kn[j].data_type) { 2090 case KSTAT_DATA_INT32: 2091 case KSTAT_DATA_UINT32: 2092 kstats[i] = kn[j].value.ui32; 2093 break; 2094 #ifdef _LP64 2095 case KSTAT_DATA_LONG: 2096 case KSTAT_DATA_ULONG: 2097 kstats[i] = kn[j].value.ul; 2098 break; 2099 #endif 2100 case KSTAT_DATA_INT64: 2101 case KSTAT_DATA_UINT64: 2102 kstats[i] = kn[j].value.ui64; 2103 break; 2104 } 2105 break; 2106 } 2107 } 2108 } 2109 2110 KSTAT_EXIT(ksp); 2111 kstat_rele(ksp); 2112 } 2113 2114 /* 2115 * Refresh the active state of all ills on `phyi'. 2116 */ 2117 void 2118 ipmp_phyint_refresh_active(phyint_t *phyi) 2119 { 2120 if (phyi->phyint_illv4 != NULL) 2121 ipmp_ill_refresh_active(phyi->phyint_illv4); 2122 if (phyi->phyint_illv6 != NULL) 2123 ipmp_ill_refresh_active(phyi->phyint_illv6); 2124 } 2125 2126 /* 2127 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one 2128 * doesn't exist. Caller need not be inside the IPSQ. 2129 */ 2130 ill_t * 2131 ipmp_ipif_hold_bound_ill(const ipif_t *ipif) 2132 { 2133 ill_t *boundill; 2134 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2135 2136 ASSERT(IS_IPMP(ipif->ipif_ill)); 2137 2138 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 2139 boundill = ipif->ipif_bound_ill; 2140 if (boundill != NULL && ill_check_and_refhold(boundill) == 0) { 2141 rw_exit(&ipst->ips_ipmp_lock); 2142 return (boundill); 2143 } 2144 rw_exit(&ipst->ips_ipmp_lock); 2145 return (NULL); 2146 } 2147 2148 /* 2149 * Return a pointer to the underlying ill bound to `ipif', or NULL if one 2150 * doesn't exist. Caller must be inside the IPSQ. 2151 */ 2152 ill_t * 2153 ipmp_ipif_bound_ill(const ipif_t *ipif) 2154 { 2155 ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); 2156 ASSERT(IS_IPMP(ipif->ipif_ill)); 2157 2158 return (ipif->ipif_bound_ill); 2159 } 2160 2161 /* 2162 * Check if `ipif' is a "stub" (placeholder address not being used). 2163 */ 2164 boolean_t 2165 ipmp_ipif_is_stubaddr(const ipif_t *ipif) 2166 { 2167 if (ipif->ipif_flags & IPIF_UP) 2168 return (B_FALSE); 2169 if (ipif->ipif_ill->ill_isv6) 2170 return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); 2171 else 2172 return (ipif->ipif_lcl_addr == INADDR_ANY); 2173 } 2174 2175 /* 2176 * Check if `ipif' is an IPMP data address. 2177 */ 2178 boolean_t 2179 ipmp_ipif_is_dataaddr(const ipif_t *ipif) 2180 { 2181 if (ipif->ipif_flags & IPIF_NOFAILOVER) 2182 return (B_FALSE); 2183 if (ipif->ipif_ill->ill_isv6) 2184 return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); 2185 else 2186 return (ipif->ipif_lcl_addr != INADDR_ANY); 2187 } 2188 2189 /* 2190 * Check if `ipif' is an IPIF_UP IPMP data address. 2191 */ 2192 static boolean_t 2193 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) 2194 { 2195 return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); 2196 } 2197