1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 * 21 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 22 * Use is subject to license terms. 23 */ 24 25 #include <inet/arp.h> 26 #include <inet/ip.h> 27 #include <inet/ip6.h> 28 #include <inet/ip_if.h> 29 #include <inet/ip_ire.h> 30 #include <inet/ip_multi.h> 31 #include <inet/ip_rts.h> 32 #include <inet/mi.h> 33 #include <net/if_types.h> 34 #include <sys/dlpi.h> 35 #include <sys/kmem.h> 36 #include <sys/modhash.h> 37 #include <sys/sdt.h> 38 #include <sys/strsun.h> 39 #include <sys/sunddi.h> 40 #include <sys/types.h> 41 42 /* 43 * Convenience macros for getting the ip_stack_t associated with an 44 * ipmp_illgrp_t or ipmp_grp_t. 45 */ 46 #define IPMP_GRP_TO_IPST(grp) PHYINT_TO_IPST((grp)->gr_phyint) 47 #define IPMP_ILLGRP_TO_IPST(illg) ((illg)->ig_ipmp_ill->ill_ipst) 48 49 /* 50 * Assorted constants that aren't important enough to be tunable. 51 */ 52 #define IPMP_GRP_HASH_SIZE 64 53 #define IPMP_ILL_REFRESH_TIMEOUT 120 /* seconds */ 54 55 /* 56 * Templates for IPMP ARP messages. 57 */ 58 static const arie_t ipmp_aract_template = { 59 AR_IPMP_ACTIVATE, 60 sizeof (arie_t), /* Name offset */ 61 sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ 62 }; 63 64 static const arie_t ipmp_ardeact_template = { 65 AR_IPMP_DEACTIVATE, 66 sizeof (arie_t), /* Name offset */ 67 sizeof (arie_t) /* Name length (set by ill_arp_alloc) */ 68 }; 69 70 /* 71 * IPMP meta-interface kstats (based on those in PSARC/1997/198). 72 */ 73 static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = { 74 { "obytes", KSTAT_DATA_UINT32 }, 75 { "obytes64", KSTAT_DATA_UINT64 }, 76 { "rbytes", KSTAT_DATA_UINT32 }, 77 { "rbytes64", KSTAT_DATA_UINT64 }, 78 { "opackets", KSTAT_DATA_UINT32 }, 79 { "opackets64", KSTAT_DATA_UINT64 }, 80 { "oerrors", KSTAT_DATA_UINT32 }, 81 { "ipackets", KSTAT_DATA_UINT32 }, 82 { "ipackets64", KSTAT_DATA_UINT64 }, 83 { "ierrors", KSTAT_DATA_UINT32 }, 84 { "multircv", KSTAT_DATA_UINT32 }, 85 { "multixmt", KSTAT_DATA_UINT32 }, 86 { "brdcstrcv", KSTAT_DATA_UINT32 }, 87 { "brdcstxmt", KSTAT_DATA_UINT32 }, 88 { "link_up", KSTAT_DATA_UINT32 } 89 }; 90 91 static void ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t); 92 static int ipmp_grp_create_kstats(ipmp_grp_t *); 93 static int ipmp_grp_update_kstats(kstat_t *, int); 94 static void ipmp_grp_destroy_kstats(ipmp_grp_t *); 95 static ill_t *ipmp_illgrp_min_ill(ipmp_illgrp_t *); 96 static ill_t *ipmp_illgrp_max_ill(ipmp_illgrp_t *); 97 static void ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *); 98 static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t); 99 static boolean_t ipmp_ill_activate(ill_t *); 100 static void ipmp_ill_deactivate(ill_t *); 101 static void ipmp_ill_ire_mark_testhidden(ire_t *, char *); 102 static void ipmp_ill_ire_clear_testhidden(ire_t *, char *); 103 static void ipmp_ill_refresh_active_timer_start(ill_t *); 104 static void ipmp_ill_rtsaddrmsg(ill_t *, int); 105 static void ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action); 106 static ipif_t *ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t); 107 static void ipmp_phyint_get_kstats(phyint_t *, uint64_t *); 108 static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *); 109 110 /* 111 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init(). 112 */ 113 void 114 ipmp_init(ip_stack_t *ipst) 115 { 116 ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", 117 IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, 118 mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); 119 rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); 120 } 121 122 /* 123 * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). 124 */ 125 void 126 ipmp_destroy(ip_stack_t *ipst) 127 { 128 mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); 129 rw_destroy(&ipst->ips_ipmp_lock); 130 } 131 132 /* 133 * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', 134 * and add it to the hash. On success, return a pointer to the created group. 135 * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP 136 * meta-interface associated with the group also has the same name (but they 137 * may differ later via ipmp_grp_rename()). 138 */ 139 ipmp_grp_t * 140 ipmp_grp_create(const char *grname, phyint_t *phyi) 141 { 142 ipmp_grp_t *grp; 143 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 144 mod_hash_hndl_t mh; 145 146 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 147 148 if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) 149 return (NULL); 150 151 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); 152 (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); 153 154 /* 155 * Cache the group's phyint. This is safe since a phyint_t will 156 * outlive its ipmp_grp_t. 157 */ 158 grp->gr_phyint = phyi; 159 160 /* 161 * Create IPMP group kstats. 162 */ 163 if (ipmp_grp_create_kstats(grp) != 0) { 164 kmem_free(grp, sizeof (ipmp_grp_t)); 165 return (NULL); 166 } 167 168 /* 169 * Insert the group into the hash. 170 */ 171 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { 172 ipmp_grp_destroy_kstats(grp); 173 kmem_free(grp, sizeof (ipmp_grp_t)); 174 return (NULL); 175 } 176 ipmp_grp_insert(grp, mh); 177 178 return (grp); 179 } 180 181 /* 182 * Create IPMP kstat structures for `grp'. Return an errno upon failure. 183 */ 184 static int 185 ipmp_grp_create_kstats(ipmp_grp_t *grp) 186 { 187 kstat_t *ksp; 188 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; 189 190 ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", 191 KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); 192 if (ksp == NULL) 193 return (ENOMEM); 194 195 ksp->ks_update = ipmp_grp_update_kstats; 196 ksp->ks_private = grp; 197 bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); 198 199 kstat_install(ksp); 200 grp->gr_ksp = ksp; 201 return (0); 202 } 203 204 /* 205 * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. 206 */ 207 static int 208 ipmp_grp_update_kstats(kstat_t *ksp, int rw) 209 { 210 uint_t i; 211 kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); 212 ipmp_grp_t *grp = ksp->ks_private; 213 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 214 ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; 215 phyint_t *phyi; 216 uint64_t phyi_kstats[IPMP_KSTAT_MAX]; 217 218 if (rw == KSTAT_WRITE) 219 return (EACCES); 220 221 /* 222 * Start with the group's baseline values. 223 */ 224 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 225 if (kn[i].data_type == KSTAT_DATA_UINT32) { 226 kn[i].value.ui32 = grp->gr_kstats0[i]; 227 } else { 228 ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); 229 kn[i].value.ui64 = grp->gr_kstats0[i]; 230 } 231 } 232 233 /* 234 * Add in the stats of each phyint currently in the group. Since we 235 * don't directly track the phyints in a group, we cheat by walking 236 * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while 237 * ill_g_lock is held.) 238 */ 239 rw_enter(&ipst->ips_ill_g_lock, RW_READER); 240 ipsq = grp_ipsq->ipsq_next; 241 for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { 242 phyi = ipsq->ipsq_phyint; 243 244 /* 245 * If a phyint in a group is being unplumbed, it's possible 246 * that ill_glist_delete() -> phyint_free() already freed the 247 * phyint (and set ipsq_phyint to NULL), but the unplumb 248 * operation has yet to complete (and thus ipsq_dq() has yet 249 * to remove the phyint's IPSQ from the group IPSQ's phyint 250 * list). We skip those phyints here (note that their kstats 251 * have already been added to gr_kstats0[]). 252 */ 253 if (phyi == NULL) 254 continue; 255 256 ipmp_phyint_get_kstats(phyi, phyi_kstats); 257 258 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 259 phyi_kstats[i] -= phyi->phyint_kstats0[i]; 260 if (kn[i].data_type == KSTAT_DATA_UINT32) 261 kn[i].value.ui32 += phyi_kstats[i]; 262 else 263 kn[i].value.ui64 += phyi_kstats[i]; 264 } 265 } 266 267 kn[IPMP_KSTAT_LINK_UP].value.ui32 = 268 (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; 269 270 rw_exit(&ipst->ips_ill_g_lock); 271 return (0); 272 } 273 274 /* 275 * Destroy IPMP kstat structures for `grp'. 276 */ 277 static void 278 ipmp_grp_destroy_kstats(ipmp_grp_t *grp) 279 { 280 netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; 281 282 kstat_delete_netstack(grp->gr_ksp, id); 283 bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); 284 grp->gr_ksp = NULL; 285 } 286 287 /* 288 * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it 289 * does not exist. 290 */ 291 ipmp_grp_t * 292 ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) 293 { 294 ipmp_grp_t *grp; 295 296 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 297 298 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, 299 (mod_hash_val_t *)&grp) == 0) 300 return (grp); 301 302 return (NULL); 303 } 304 305 /* 306 * Place information about group `grp' into `lifgr'. 307 */ 308 void 309 ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) 310 { 311 ill_t *ill; 312 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 313 314 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 315 316 lifgr->gi_v4 = (grp->gr_v4 != NULL); 317 lifgr->gi_v6 = (grp->gr_v6 != NULL); 318 lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; 319 lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; 320 lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; 321 (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); 322 lifgr->gi_m4ifname[0] = '\0'; 323 lifgr->gi_m6ifname[0] = '\0'; 324 lifgr->gi_bcifname[0] = '\0'; 325 326 if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { 327 (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); 328 (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); 329 } 330 331 if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) 332 (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); 333 } 334 335 /* 336 * Insert `grp' into the hash using the reserved hash entry `mh'. 337 * Caller must ensure `grp' is not yet in the hash. 338 */ 339 static void 340 ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) 341 { 342 int err; 343 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 344 345 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 346 347 /* 348 * Since grp->gr_name will exist at least as long as `grp' is in the 349 * hash, we use it directly as the key. 350 */ 351 err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, 352 (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); 353 if (err != 0) { 354 /* 355 * This should never happen since `mh' was preallocated. 356 */ 357 panic("cannot insert IPMP group \"%s\" (err %d)", 358 grp->gr_name, err); 359 } 360 } 361 362 /* 363 * Remove `grp' from the hash. Caller must ensure `grp' is in it. 364 */ 365 static void 366 ipmp_grp_remove(ipmp_grp_t *grp) 367 { 368 int err; 369 mod_hash_val_t val; 370 mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; 371 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 372 373 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 374 375 err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); 376 if (err != 0 || val != grp) { 377 panic("cannot remove IPMP group \"%s\" (err %d)", 378 grp->gr_name, err); 379 } 380 } 381 382 /* 383 * Attempt to rename `grp' to new name `grname'. Return an errno if the new 384 * group name already exists or is invalid, or if there isn't enough memory. 385 */ 386 int 387 ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) 388 { 389 mod_hash_hndl_t mh; 390 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 391 392 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 393 394 if (grname[0] == '\0') 395 return (EINVAL); 396 397 if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, 398 (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) 399 return (EEXIST); 400 401 /* 402 * Before we remove the group from the hash, ensure we'll be able to 403 * re-insert it by reserving space. 404 */ 405 if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) 406 return (ENOMEM); 407 408 ipmp_grp_remove(grp); 409 (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); 410 ipmp_grp_insert(grp, mh); 411 412 return (0); 413 } 414 415 /* 416 * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in 417 * the hash, and that there are no interfaces on it. 418 */ 419 void 420 ipmp_grp_destroy(ipmp_grp_t *grp) 421 { 422 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 423 424 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 425 426 /* 427 * If there are still interfaces using this group, panic before things 428 * go really off the rails. 429 */ 430 if (grp->gr_nif != 0) 431 panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); 432 433 ipmp_grp_remove(grp); 434 ipmp_grp_destroy_kstats(grp); 435 436 ASSERT(grp->gr_v4 == NULL); 437 ASSERT(grp->gr_v6 == NULL); 438 ASSERT(grp->gr_nv4 == 0); 439 ASSERT(grp->gr_nv6 == 0); 440 ASSERT(grp->gr_nactif == 0); 441 ASSERT(grp->gr_linkdownmp == NULL); 442 grp->gr_phyint = NULL; 443 444 kmem_free(grp, sizeof (ipmp_grp_t)); 445 } 446 447 /* 448 * Check whether `ill' is suitable for inclusion into `grp', and return an 449 * errno describing the problem (if any). NOTE: many of these errno values 450 * are interpreted by ifconfig, which will take corrective action and retry 451 * the SIOCSLIFGROUPNAME, so please exercise care when changing them. 452 */ 453 static int 454 ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) 455 { 456 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 457 458 ASSERT(IAM_WRITER_ILL(ill)); 459 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 460 461 /* 462 * To sidestep complicated address migration logic in the kernel and 463 * to force the kernel's all-hosts multicast memberships to be blown 464 * away, all addresses that had been brought up must be brought back 465 * down prior to adding an interface to a group. (This includes 466 * addresses currently down due to DAD.) Once the interface has been 467 * added to the group, its addresses can then be brought back up, at 468 * which point they will be moved to the IPMP meta-interface. 469 * NOTE: we do this before ill_appaddr_cnt() since bringing down the 470 * link-local causes in.ndpd to remove its ADDRCONF'd addresses. 471 */ 472 if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) 473 return (EADDRINUSE); 474 475 /* 476 * To avoid confusing applications by changing addresses that are 477 * under their control, all such control must be removed prior to 478 * adding an interface into a group. 479 */ 480 if (ill_appaddr_cnt(ill) != 0) 481 return (EADDRNOTAVAIL); 482 483 /* 484 * Since PTP addresses do not share the same broadcast domain, they 485 * are not allowed to be in an IPMP group. 486 */ 487 if (ill_ptpaddr_cnt(ill) != 0) 488 return (EINVAL); 489 490 /* 491 * An ill must support multicast to be allowed into a group. 492 */ 493 if (!(ill->ill_flags & ILLF_MULTICAST)) 494 return (ENOTSUP); 495 496 /* 497 * An ill must strictly be using ARP and/or ND for address 498 * resolution for it to be allowed into a group. 499 */ 500 if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) 501 return (ENOTSUP); 502 503 /* 504 * An ill cannot also be using usesrc groups. (Although usesrc uses 505 * ill_g_usesrc_lock, we don't need to grab it since usesrc also does 506 * all its modifications as writer.) 507 */ 508 if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) 509 return (ENOTSUP); 510 511 /* 512 * All ills in a group must be the same mactype. 513 */ 514 if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) 515 return (EINVAL); 516 517 return (0); 518 } 519 520 /* 521 * Check whether `phyi' is suitable for inclusion into `grp', and return an 522 * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() 523 * regarding errno values. 524 */ 525 int 526 ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) 527 { 528 int err = 0; 529 ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); 530 531 ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); 532 ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); 533 534 /* 535 * An interface cannot have address families plumbed that are not 536 * configured in the group. 537 */ 538 if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || 539 phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) 540 return (EAFNOSUPPORT); 541 542 if (phyi->phyint_illv4 != NULL) 543 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); 544 if (err == 0 && phyi->phyint_illv6 != NULL) 545 err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); 546 547 return (err); 548 } 549 550 /* 551 * Create a new illgrp on IPMP meta-interface `ill'. 552 */ 553 ipmp_illgrp_t * 554 ipmp_illgrp_create(ill_t *ill) 555 { 556 uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; 557 ipmp_illgrp_t *illg; 558 559 ASSERT(IAM_WRITER_ILL(ill)); 560 ASSERT(IS_IPMP(ill)); 561 ASSERT(ill->ill_grp == NULL); 562 563 if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) 564 return (NULL); 565 566 list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); 567 list_create(&illg->ig_actif, sizeof (ill_t), 568 offsetof(ill_t, ill_actnode)); 569 list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), 570 offsetof(ipmp_arpent_t, ia_node)); 571 572 illg->ig_ipmp_ill = ill; 573 ill->ill_grp = illg; 574 ipmp_illgrp_set_mtu(illg, mtu); 575 576 return (illg); 577 } 578 579 /* 580 * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. 581 */ 582 void 583 ipmp_illgrp_destroy(ipmp_illgrp_t *illg) 584 { 585 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 586 ASSERT(IS_IPMP(illg->ig_ipmp_ill)); 587 588 /* 589 * Verify `illg' is empty. 590 */ 591 ASSERT(illg->ig_next_ill == NULL); 592 ASSERT(illg->ig_cast_ill == NULL); 593 ASSERT(list_is_empty(&illg->ig_arpent)); 594 ASSERT(list_is_empty(&illg->ig_if)); 595 ASSERT(list_is_empty(&illg->ig_actif)); 596 ASSERT(illg->ig_nactif == 0); 597 598 /* 599 * Destroy `illg'. 600 */ 601 illg->ig_ipmp_ill->ill_grp = NULL; 602 illg->ig_ipmp_ill = NULL; 603 list_destroy(&illg->ig_if); 604 list_destroy(&illg->ig_actif); 605 list_destroy(&illg->ig_arpent); 606 kmem_free(illg, sizeof (ipmp_illgrp_t)); 607 } 608 609 /* 610 * Add `ipif' to the pool of usable data addresses on `illg' and attempt to 611 * bind it to an underlying ill, while keeping an even address distribution. 612 * If the bind is successful, return a pointer to the bound ill. 613 */ 614 ill_t * 615 ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) 616 { 617 ill_t *minill; 618 ipmp_arpent_t *entp; 619 620 ASSERT(IAM_WRITER_IPIF(ipif)); 621 ASSERT(ipmp_ipif_is_dataaddr(ipif)); 622 623 /* 624 * IPMP data address mappings are internally managed by IP itself, so 625 * delete any existing ARP entries associated with the address. 626 */ 627 if (!ipif->ipif_isv6) { 628 entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); 629 if (entp != NULL) 630 ipmp_illgrp_destroy_arpent(illg, entp); 631 } 632 633 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) 634 ipmp_ill_bind_ipif(minill, ipif, Res_act_none); 635 636 return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); 637 } 638 639 /* 640 * Delete `ipif' from the pool of usable data addresses on `illg'. If it's 641 * bound, unbind it from the underlying ill while keeping an even address 642 * distribution. 643 */ 644 void 645 ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) 646 { 647 ill_t *maxill, *boundill = ipif->ipif_bound_ill; 648 649 ASSERT(IAM_WRITER_IPIF(ipif)); 650 651 if (boundill != NULL) { 652 (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); 653 654 maxill = ipmp_illgrp_max_ill(illg); 655 if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { 656 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); 657 ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); 658 } 659 } 660 } 661 662 /* 663 * Return the active ill with the greatest number of data addresses in `illg'. 664 */ 665 static ill_t * 666 ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) 667 { 668 ill_t *ill, *bestill = NULL; 669 670 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 671 672 ill = list_head(&illg->ig_actif); 673 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 674 if (bestill == NULL || 675 ill->ill_bound_cnt > bestill->ill_bound_cnt) { 676 bestill = ill; 677 } 678 } 679 return (bestill); 680 } 681 682 /* 683 * Return the active ill with the fewest number of data addresses in `illg'. 684 */ 685 static ill_t * 686 ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) 687 { 688 ill_t *ill, *bestill = NULL; 689 690 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 691 692 ill = list_head(&illg->ig_actif); 693 for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { 694 if (bestill == NULL || 695 ill->ill_bound_cnt < bestill->ill_bound_cnt) { 696 if (ill->ill_bound_cnt == 0) 697 return (ill); /* can't get better */ 698 bestill = ill; 699 } 700 } 701 return (bestill); 702 } 703 704 /* 705 * Return a pointer to IPMP meta-interface for `illg' (which must exist). 706 * Since ig_ipmp_ill never changes for a given illg, no locks are needed. 707 */ 708 ill_t * 709 ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) 710 { 711 return (illg->ig_ipmp_ill); 712 } 713 714 /* 715 * Return a pointer to the next available underlying ill in `illg', or NULL if 716 * one doesn't exist. Caller must be inside the IPSQ. 717 */ 718 ill_t * 719 ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) 720 { 721 ill_t *ill; 722 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 723 724 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 725 726 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 727 if ((ill = illg->ig_next_ill) != NULL) { 728 illg->ig_next_ill = list_next(&illg->ig_actif, ill); 729 if (illg->ig_next_ill == NULL) 730 illg->ig_next_ill = list_head(&illg->ig_actif); 731 } 732 rw_exit(&ipst->ips_ipmp_lock); 733 734 return (ill); 735 } 736 737 /* 738 * Return a held pointer to the next available underlying ill in `illg', or 739 * NULL if one doesn't exist. Caller need not be inside the IPSQ. 740 */ 741 ill_t * 742 ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) 743 { 744 ill_t *ill; 745 uint_t i; 746 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 747 748 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 749 for (i = 0; i < illg->ig_nactif; i++) { 750 ill = illg->ig_next_ill; 751 illg->ig_next_ill = list_next(&illg->ig_actif, ill); 752 if (illg->ig_next_ill == NULL) 753 illg->ig_next_ill = list_head(&illg->ig_actif); 754 755 if (ILL_CAN_LOOKUP(ill)) { 756 ill_refhold(ill); 757 rw_exit(&ipst->ips_ipmp_lock); 758 return (ill); 759 } 760 } 761 rw_exit(&ipst->ips_ipmp_lock); 762 763 return (NULL); 764 } 765 766 /* 767 * Return a pointer to the nominated multicast ill in `illg', or NULL if one 768 * doesn't exist. Caller must be inside the IPSQ. 769 */ 770 ill_t * 771 ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) 772 { 773 /* 774 * Since an IPMP ill's ill_grp gets cleared during I_PUNLINK but 775 * this function can get called after that point, handle NULL. 776 */ 777 if (illg == NULL) 778 return (NULL); 779 780 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 781 return (illg->ig_cast_ill); 782 } 783 784 /* 785 * Return a held pointer to the nominated multicast ill in `illg', or NULL if 786 * one doesn't exist. Caller need not be inside the IPSQ. 787 */ 788 ill_t * 789 ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) 790 { 791 ill_t *castill; 792 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 793 794 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 795 castill = illg->ig_cast_ill; 796 if (castill != NULL && ILL_CAN_LOOKUP(castill)) { 797 ill_refhold(castill); 798 rw_exit(&ipst->ips_ipmp_lock); 799 return (castill); 800 } 801 rw_exit(&ipst->ips_ipmp_lock); 802 return (NULL); 803 } 804 805 /* 806 * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, 807 * any existing nomination is removed. Caller must be inside the IPSQ. 808 */ 809 static void 810 ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) 811 { 812 ill_t *ocastill = illg->ig_cast_ill; 813 ill_t *ipmp_ill = illg->ig_ipmp_ill; 814 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 815 816 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 817 818 /* 819 * Disable old nominated ill (if any). 820 */ 821 if (ocastill != NULL) { 822 DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, 823 illg, ill_t *, ocastill); 824 ASSERT(ocastill->ill_nom_cast); 825 ocastill->ill_nom_cast = B_FALSE; 826 /* 827 * If the IPMP meta-interface is down, we never did the join, 828 * so we must not try to leave. 829 */ 830 if (ipmp_ill->ill_dl_up) 831 ill_leave_multicast(ipmp_ill); 832 } 833 834 /* 835 * Set new nomination. 836 */ 837 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 838 illg->ig_cast_ill = castill; 839 rw_exit(&ipst->ips_ipmp_lock); 840 841 if (ocastill != NULL) { 842 /* 843 * Delete any IREs tied to the old nomination. We must do 844 * this after the new castill is set and has reached global 845 * visibility since the datapath has not been quiesced. 846 */ 847 ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, 848 ill_stq_cache_delete, ocastill, ocastill); 849 } 850 851 /* 852 * Enable new nominated ill (if any). 853 */ 854 if (castill != NULL) { 855 DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, 856 illg, ill_t *, castill); 857 ASSERT(!castill->ill_nom_cast); 858 castill->ill_nom_cast = B_TRUE; 859 /* 860 * If the IPMP meta-interface is down, the attempt to recover 861 * will silently fail but ill_need_recover_multicast will be 862 * erroneously cleared -- so check first. 863 */ 864 if (ipmp_ill->ill_dl_up) 865 ill_recover_multicast(ipmp_ill); 866 } 867 868 /* 869 * For IPv4, refresh our broadcast IREs. This needs to be done even 870 * if there's no new nomination since ill_refresh_bcast() still must 871 * update the IPMP meta-interface's broadcast IREs to point back at 872 * the IPMP meta-interface itself. 873 */ 874 if (!ipmp_ill->ill_isv6) 875 ill_refresh_bcast(ipmp_ill); 876 } 877 878 /* 879 * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an 880 * entry for the same IP address already exists, destroy it first. Return the 881 * created IPMP ARP entry, or NULL on failure. 882 */ 883 ipmp_arpent_t * 884 ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) 885 { 886 uchar_t *addrp; 887 area_t *area = (area_t *)mp->b_rptr; 888 ipmp_arpent_t *entp, *oentp; 889 890 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 891 ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); 892 893 if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) 894 return (NULL); 895 896 if ((mp = copyb(mp)) == NULL) { 897 kmem_free(entp, sizeof (ipmp_arpent_t)); 898 return (NULL); 899 } 900 901 DB_TYPE(mp) = M_PROTO; 902 entp->ia_area_mp = mp; 903 entp->ia_proxyarp = proxyarp; 904 addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, 905 sizeof (ipaddr_t)); 906 bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); 907 908 if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) 909 ipmp_illgrp_destroy_arpent(illg, oentp); 910 911 list_insert_head(&illg->ig_arpent, entp); 912 return (entp); 913 } 914 915 /* 916 * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. 917 */ 918 void 919 ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) 920 { 921 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 922 923 list_remove(&illg->ig_arpent, entp); 924 freeb(entp->ia_area_mp); 925 kmem_free(entp, sizeof (ipmp_arpent_t)); 926 } 927 928 /* 929 * Mark that ARP has been notified about the IP address on `entp'; `illg' is 930 * taken as a debugging aid for DTrace FBT probes. 931 */ 932 /* ARGSUSED */ 933 void 934 ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) 935 { 936 entp->ia_notified = B_TRUE; 937 } 938 939 /* 940 * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is 941 * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. 942 */ 943 ipmp_arpent_t * 944 ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) 945 { 946 ipmp_arpent_t *entp = list_head(&illg->ig_arpent); 947 948 ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); 949 950 if (addrp == NULL) 951 return (entp); 952 953 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) 954 if (entp->ia_ipaddr == *addrp) 955 break; 956 return (entp); 957 } 958 959 /* 960 * Refresh ARP entries on `illg' to be distributed across its active 961 * interfaces. Entries that cannot be refreshed (e.g., because there are no 962 * active interfaces) are marked so that subsequent calls can try again. 963 */ 964 void 965 ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) 966 { 967 ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; 968 uint_t paddrlen = ipmp_ill->ill_phys_addr_length; 969 area_t *area; 970 mblk_t *area_mp; 971 uchar_t *physaddr; 972 ipmp_arpent_t *entp; 973 974 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 975 ASSERT(!ipmp_ill->ill_isv6); 976 977 ill = list_head(&illg->ig_actif); 978 entp = list_head(&illg->ig_arpent); 979 for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { 980 if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { 981 entp->ia_notified = B_FALSE; 982 continue; 983 } 984 985 area = (area_t *)entp->ia_area_mp->b_rptr; 986 ASSERT(paddrlen == ill->ill_phys_addr_length); 987 ASSERT(paddrlen == area->area_hw_addr_length); 988 physaddr = mi_offset_paramc(entp->ia_area_mp, 989 area->area_hw_addr_offset, paddrlen); 990 991 /* 992 * If this is a proxy ARP entry, we can skip notifying ARP if 993 * the entry is already up-to-date. If it has changed, we 994 * update the entry's hardware address before notifying ARP. 995 */ 996 if (entp->ia_proxyarp) { 997 if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && 998 entp->ia_notified) 999 continue; 1000 bcopy(ill->ill_phys_addr, physaddr, paddrlen); 1001 } 1002 1003 if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { 1004 entp->ia_notified = B_FALSE; 1005 continue; 1006 } 1007 1008 putnext(ipmp_ill->ill_rq, area_mp); 1009 ipmp_illgrp_mark_arpent(illg, entp); 1010 1011 if ((ill = list_next(&illg->ig_actif, ill)) == NULL) 1012 ill = list_head(&illg->ig_actif); 1013 } 1014 } 1015 1016 /* 1017 * Return an interface in `illg' with the specified `physaddr', or NULL if one 1018 * doesn't exist. Caller must hold ill_g_lock if it's not inside the IPSQ. 1019 */ 1020 ill_t * 1021 ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) 1022 { 1023 ill_t *ill; 1024 ill_t *ipmp_ill = illg->ig_ipmp_ill; 1025 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1026 1027 ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); 1028 1029 ill = list_head(&illg->ig_if); 1030 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 1031 if (ill->ill_phys_addr_length == paddrlen && 1032 bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) 1033 return (ill); 1034 } 1035 return (NULL); 1036 } 1037 1038 /* 1039 * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. 1040 * Caller must be inside the IPSQ unless this is initialization. 1041 */ 1042 static void 1043 ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) 1044 { 1045 ill_t *ill = illg->ig_ipmp_ill; 1046 mblk_t *mp; 1047 1048 ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); 1049 1050 /* 1051 * If allocation fails, we have bigger problems than MTU. 1052 */ 1053 if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { 1054 illg->ig_mtu = mtu; 1055 put(ill->ill_rq, mp); 1056 } 1057 } 1058 1059 /* 1060 * Recalculate the IPMP group MTU for `illg', and update its associated IPMP 1061 * ill MTU if necessary. 1062 */ 1063 void 1064 ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) 1065 { 1066 ill_t *ill; 1067 ill_t *ipmp_ill = illg->ig_ipmp_ill; 1068 uint_t mtu = 0; 1069 1070 ASSERT(IAM_WRITER_ILL(ipmp_ill)); 1071 1072 /* 1073 * Since ill_max_mtu can only change under ill_lock, we hold ill_lock 1074 * for each ill as we iterate through the list. Any changes to the 1075 * ill_max_mtu will also trigger an update, so even if we missed it 1076 * this time around, the update will catch it. 1077 */ 1078 ill = list_head(&illg->ig_if); 1079 for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { 1080 mutex_enter(&ill->ill_lock); 1081 if (mtu == 0 || ill->ill_max_mtu < mtu) 1082 mtu = ill->ill_max_mtu; 1083 mutex_exit(&ill->ill_lock); 1084 } 1085 1086 /* 1087 * MTU must be at least the minimum MTU. 1088 */ 1089 mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); 1090 1091 if (illg->ig_mtu != mtu) 1092 ipmp_illgrp_set_mtu(illg, mtu); 1093 } 1094 1095 /* 1096 * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently 1097 * allow the same link to be established more than once. 1098 */ 1099 void 1100 ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) 1101 { 1102 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1103 1104 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 1105 1106 if (illg->ig_ipmp_ill->ill_isv6) { 1107 ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); 1108 grp->gr_v6 = illg; 1109 } else { 1110 ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); 1111 grp->gr_v4 = illg; 1112 } 1113 } 1114 1115 /* 1116 * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp 1117 * cannot be unlinked (e.g., because there are still interfaces using it). 1118 */ 1119 int 1120 ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) 1121 { 1122 ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; 1123 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1124 1125 ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); 1126 1127 if (illg->ig_ipmp_ill->ill_isv6) { 1128 if (grp->gr_nv6 + grp->gr_pendv6 != 0) 1129 return (EBUSY); 1130 grp->gr_v6 = NULL; 1131 } else { 1132 if (grp->gr_nv4 + grp->gr_pendv4 != 0) 1133 return (EBUSY); 1134 grp->gr_v4 = NULL; 1135 } 1136 return (0); 1137 } 1138 1139 /* 1140 * Place `ill' into `illg', and rebalance the data addresses on `illg' 1141 * to be spread evenly across the ills now in it. Also, adjust the IPMP 1142 * ill as necessary to account for `ill' (e.g., MTU). 1143 */ 1144 void 1145 ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) 1146 { 1147 ill_t *ipmp_ill; 1148 ipif_t *ipif; 1149 ip_stack_t *ipst = ill->ill_ipst; 1150 1151 /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ 1152 ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); 1153 ASSERT(IAM_WRITER_ILL(ill)); 1154 ASSERT(ill->ill_grp == NULL); 1155 1156 ipmp_ill = illg->ig_ipmp_ill; 1157 1158 /* 1159 * Account for `ill' joining the illgrp. 1160 */ 1161 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1162 if (ill->ill_isv6) 1163 ill->ill_phyint->phyint_grp->gr_nv6++; 1164 else 1165 ill->ill_phyint->phyint_grp->gr_nv4++; 1166 rw_exit(&ipst->ips_ipmp_lock); 1167 1168 /* 1169 * Ensure the ILLF_ROUTER flag remains consistent across the group. 1170 */ 1171 mutex_enter(&ill->ill_lock); 1172 if (ipmp_ill->ill_flags & ILLF_ROUTER) 1173 ill->ill_flags |= ILLF_ROUTER; 1174 else 1175 ill->ill_flags &= ~ILLF_ROUTER; 1176 mutex_exit(&ill->ill_lock); 1177 1178 /* 1179 * Blow away all multicast memberships that currently exist on `ill'. 1180 * This may seem odd, but it's consistent with the application view 1181 * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). 1182 */ 1183 if (ill->ill_isv6) { 1184 reset_conn_ill(ill); 1185 reset_mrt_ill(ill); 1186 } else { 1187 ipif = ill->ill_ipif; 1188 for (; ipif != NULL; ipif = ipif->ipif_next) { 1189 reset_conn_ipif(ipif); 1190 reset_mrt_vif_ipif(ipif); 1191 } 1192 } 1193 ip_purge_allmulti(ill); 1194 1195 /* 1196 * Borrow the first ill's ill_phys_addr_length value for the illgrp's 1197 * physical address length. All other ills must have the same value, 1198 * since they are required to all be the same mactype. Also update 1199 * the IPMP ill's MTU and CoS marking, if necessary. 1200 */ 1201 if (list_is_empty(&illg->ig_if)) { 1202 ASSERT(ipmp_ill->ill_phys_addr_length == 0); 1203 /* 1204 * NOTE: we leave ill_phys_addr NULL since the IPMP group 1205 * doesn't have a physical address. This means that code must 1206 * not assume that ill_phys_addr is non-NULL just because 1207 * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. 1208 */ 1209 ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; 1210 ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; 1211 ipmp_ill->ill_type = ill->ill_type; 1212 1213 if (ill->ill_flags & ILLF_COS_ENABLED) { 1214 mutex_enter(&ipmp_ill->ill_lock); 1215 ipmp_ill->ill_flags |= ILLF_COS_ENABLED; 1216 mutex_exit(&ipmp_ill->ill_lock); 1217 } 1218 ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); 1219 } else { 1220 ASSERT(ipmp_ill->ill_phys_addr_length == 1221 ill->ill_phys_addr_length); 1222 ASSERT(ipmp_ill->ill_type == ill->ill_type); 1223 1224 if (!(ill->ill_flags & ILLF_COS_ENABLED)) { 1225 mutex_enter(&ipmp_ill->ill_lock); 1226 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; 1227 mutex_exit(&ipmp_ill->ill_lock); 1228 } 1229 if (illg->ig_mtu > ill->ill_max_mtu) 1230 ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); 1231 } 1232 1233 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 1234 list_insert_tail(&illg->ig_if, ill); 1235 ill->ill_grp = illg; 1236 rw_exit(&ipst->ips_ill_g_lock); 1237 1238 /* 1239 * Hide the IREs on `ill' so that we don't accidentally find them when 1240 * sending data traffic. 1241 */ 1242 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); 1243 1244 /* 1245 * Merge any broadcast IREs, if need be. 1246 */ 1247 if (!ill->ill_isv6) 1248 ill_refresh_bcast(ill); 1249 1250 ipmp_ill_refresh_active(ill); 1251 } 1252 1253 /* 1254 * Remove `ill' from its illgrp, and rebalance the data addresses in that 1255 * illgrp to be spread evenly across the remaining ills. Also, adjust the 1256 * IPMP ill as necessary now that `ill' is removed (e.g., MTU). 1257 */ 1258 void 1259 ipmp_ill_leave_illgrp(ill_t *ill) 1260 { 1261 ill_t *ipmp_ill; 1262 ipif_t *ipif; 1263 ipmp_arpent_t *entp; 1264 ipmp_illgrp_t *illg = ill->ill_grp; 1265 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1266 1267 ASSERT(IS_UNDER_IPMP(ill)); 1268 ASSERT(IAM_WRITER_ILL(ill)); 1269 ASSERT(illg != NULL); 1270 1271 ipmp_ill = illg->ig_ipmp_ill; 1272 1273 /* 1274 * Cancel IPMP-specific ill timeouts. 1275 */ 1276 (void) untimeout(ill->ill_refresh_tid); 1277 1278 /* 1279 * Expose any previously-hidden IREs on `ill'. 1280 */ 1281 ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); 1282 1283 /* 1284 * Ensure the multicast state for each ipif on `ill' is down so that 1285 * our ipif_multicast_up() (once `ill' leaves the group) will rejoin 1286 * all eligible groups. 1287 */ 1288 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1289 if (ipif->ipif_flags & IPIF_UP) 1290 ipif_multicast_down(ipif); 1291 1292 /* 1293 * Account for `ill' leaving the illgrp. 1294 */ 1295 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1296 if (ill->ill_isv6) 1297 ill->ill_phyint->phyint_grp->gr_nv6--; 1298 else 1299 ill->ill_phyint->phyint_grp->gr_nv4--; 1300 rw_exit(&ipst->ips_ipmp_lock); 1301 1302 /* 1303 * Pull `ill' out of the interface lists. 1304 */ 1305 if (list_link_active(&ill->ill_actnode)) 1306 ipmp_ill_deactivate(ill); 1307 rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); 1308 list_remove(&illg->ig_if, ill); 1309 ill->ill_grp = NULL; 1310 rw_exit(&ipst->ips_ill_g_lock); 1311 1312 /* 1313 * Recreate any broadcast IREs that had been shared, if need be. 1314 */ 1315 if (!ill->ill_isv6) 1316 ill_refresh_bcast(ill); 1317 1318 /* 1319 * Re-establish multicast memberships that were previously being 1320 * handled by the IPMP meta-interface. 1321 */ 1322 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1323 if (ipif->ipif_flags & IPIF_UP) 1324 ipif_multicast_up(ipif); 1325 1326 /* 1327 * Refresh the group MTU based on the new interface list. 1328 */ 1329 ipmp_illgrp_refresh_mtu(illg); 1330 1331 if (list_is_empty(&illg->ig_if)) { 1332 /* 1333 * No ills left in the illgrp; we no longer have a physical 1334 * address length, nor can we support ARP, CoS, or anything 1335 * else that depends on knowing the link layer type. 1336 */ 1337 while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) 1338 ipmp_illgrp_destroy_arpent(illg, entp); 1339 1340 ipmp_ill->ill_phys_addr_length = 0; 1341 ipmp_ill->ill_nd_lla_len = 0; 1342 ipmp_ill->ill_type = IFT_OTHER; 1343 mutex_enter(&ipmp_ill->ill_lock); 1344 ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; 1345 mutex_exit(&ipmp_ill->ill_lock); 1346 } else { 1347 /* 1348 * If `ill' didn't support CoS, see if it can now be enabled. 1349 */ 1350 if (!(ill->ill_flags & ILLF_COS_ENABLED)) { 1351 ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); 1352 1353 ill = list_head(&illg->ig_if); 1354 do { 1355 if (!(ill->ill_flags & ILLF_COS_ENABLED)) 1356 break; 1357 } while ((ill = list_next(&illg->ig_if, ill)) != NULL); 1358 1359 if (ill == NULL) { 1360 mutex_enter(&ipmp_ill->ill_lock); 1361 ipmp_ill->ill_flags |= ILLF_COS_ENABLED; 1362 mutex_exit(&ipmp_ill->ill_lock); 1363 } 1364 } 1365 } 1366 } 1367 1368 /* 1369 * Check if `ill' should be active, and activate or deactivate if need be. 1370 * Return B_FALSE if a refresh was necessary but could not be performed. 1371 */ 1372 static boolean_t 1373 ipmp_ill_try_refresh_active(ill_t *ill) 1374 { 1375 boolean_t refreshed = B_TRUE; 1376 1377 ASSERT(IAM_WRITER_ILL(ill)); 1378 ASSERT(IS_UNDER_IPMP(ill)); 1379 1380 if (ipmp_ill_is_active(ill)) { 1381 if (!list_link_active(&ill->ill_actnode)) 1382 refreshed = ipmp_ill_activate(ill); 1383 } else { 1384 if (list_link_active(&ill->ill_actnode)) 1385 ipmp_ill_deactivate(ill); 1386 } 1387 1388 return (refreshed); 1389 } 1390 1391 /* 1392 * Check if `ill' should be active, and activate or deactivate if need be. 1393 * If the refresh fails, schedule a timer to try again later. 1394 */ 1395 void 1396 ipmp_ill_refresh_active(ill_t *ill) 1397 { 1398 if (!ipmp_ill_try_refresh_active(ill)) 1399 ipmp_ill_refresh_active_timer_start(ill); 1400 } 1401 1402 /* 1403 * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. 1404 */ 1405 static void 1406 ipmp_ill_refresh_active_timer(void *ill_arg) 1407 { 1408 ill_t *ill = ill_arg; 1409 boolean_t refreshed = B_FALSE; 1410 1411 /* 1412 * Clear ill_refresh_tid to indicate that no timeout is pending 1413 * (another thread could schedule a new timeout while we're still 1414 * running, but that's harmless). If the ill is going away, bail. 1415 */ 1416 mutex_enter(&ill->ill_lock); 1417 ill->ill_refresh_tid = 0; 1418 if (ill->ill_state_flags & ILL_CONDEMNED) { 1419 mutex_exit(&ill->ill_lock); 1420 return; 1421 } 1422 mutex_exit(&ill->ill_lock); 1423 1424 if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { 1425 refreshed = ipmp_ill_try_refresh_active(ill); 1426 ipsq_exit(ill->ill_phyint->phyint_ipsq); 1427 } 1428 1429 /* 1430 * If the refresh failed, schedule another attempt. 1431 */ 1432 if (!refreshed) 1433 ipmp_ill_refresh_active_timer_start(ill); 1434 } 1435 1436 /* 1437 * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. 1438 */ 1439 static void 1440 ipmp_ill_refresh_active_timer_start(ill_t *ill) 1441 { 1442 mutex_enter(&ill->ill_lock); 1443 1444 /* 1445 * If the ill is going away or a refresh is already scheduled, bail. 1446 */ 1447 if (ill->ill_refresh_tid != 0 || 1448 (ill->ill_state_flags & ILL_CONDEMNED)) { 1449 mutex_exit(&ill->ill_lock); 1450 return; 1451 } 1452 1453 ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, 1454 SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); 1455 1456 mutex_exit(&ill->ill_lock); 1457 } 1458 1459 /* 1460 * Activate `ill' so it will be used to send and receive data traffic. Return 1461 * B_FALSE if `ill' cannot be activated. Note that we allocate any messages 1462 * needed to deactivate `ill' here as well so that deactivation cannot fail. 1463 */ 1464 static boolean_t 1465 ipmp_ill_activate(ill_t *ill) 1466 { 1467 ipif_t *ipif; 1468 mblk_t *actmp = NULL, *deactmp = NULL; 1469 mblk_t *linkupmp = NULL, *linkdownmp = NULL; 1470 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; 1471 const char *grifname = grp->gr_ifname; 1472 ipmp_illgrp_t *illg = ill->ill_grp; 1473 ill_t *maxill; 1474 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1475 1476 ASSERT(IAM_WRITER_ILL(ill)); 1477 ASSERT(IS_UNDER_IPMP(ill)); 1478 1479 /* 1480 * If this will be the first active interface in the group, allocate 1481 * the link-up and link-down messages. 1482 */ 1483 if (grp->gr_nactif == 0) { 1484 linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); 1485 linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); 1486 if (linkupmp == NULL || linkdownmp == NULL) 1487 goto fail; 1488 } 1489 1490 /* 1491 * For IPv4, allocate the activate/deactivate messages, and tell ARP. 1492 */ 1493 if (!ill->ill_isv6) { 1494 actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); 1495 deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); 1496 if (actmp == NULL || deactmp == NULL) 1497 goto fail; 1498 1499 ASSERT(ill->ill_ardeact_mp == NULL); 1500 ill->ill_ardeact_mp = deactmp; 1501 putnext(illg->ig_ipmp_ill->ill_rq, actmp); 1502 } 1503 1504 if (list_is_empty(&illg->ig_actif)) { 1505 /* 1506 * Now that we have an active ill, nominate it for multicast 1507 * and broadcast duties. Do this before ipmp_ill_bind_ipif() 1508 * since that may need to send multicast packets (e.g., IPv6 1509 * neighbor discovery probes). 1510 */ 1511 ipmp_illgrp_set_cast(illg, ill); 1512 1513 /* 1514 * This is the first active ill in the illgrp -- add 'em all. 1515 * We can access/walk ig_ipmp_ill's ipif list since we're 1516 * writer on its IPSQ as well. 1517 */ 1518 ipif = illg->ig_ipmp_ill->ill_ipif; 1519 for (; ipif != NULL; ipif = ipif->ipif_next) 1520 if (ipmp_ipif_is_up_dataaddr(ipif)) 1521 ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); 1522 } else { 1523 /* 1524 * Redistribute the addresses by moving them from the ill with 1525 * the most addresses until the ill being activated is at the 1526 * same level as the rest of the ills. 1527 */ 1528 for (;;) { 1529 maxill = ipmp_illgrp_max_ill(illg); 1530 ASSERT(maxill != NULL); 1531 if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) 1532 break; 1533 ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); 1534 ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); 1535 } 1536 1537 /* 1538 * TODO: explore whether it's advantageous to flush IRE_CACHE 1539 * bindings to force existing connections to be redistributed 1540 * to the new ill. 1541 */ 1542 } 1543 1544 /* 1545 * Put the interface in the active list. 1546 */ 1547 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1548 list_insert_tail(&illg->ig_actif, ill); 1549 illg->ig_nactif++; 1550 illg->ig_next_ill = ill; 1551 rw_exit(&ipst->ips_ipmp_lock); 1552 1553 /* 1554 * Refresh ARP entries to use `ill', if need be. 1555 */ 1556 if (!ill->ill_isv6) 1557 ipmp_illgrp_refresh_arpent(illg); 1558 1559 /* 1560 * Finally, mark the group link up, if necessary. 1561 */ 1562 if (grp->gr_nactif++ == 0) { 1563 ASSERT(grp->gr_linkdownmp == NULL); 1564 grp->gr_linkdownmp = linkdownmp; 1565 put(illg->ig_ipmp_ill->ill_rq, linkupmp); 1566 } 1567 return (B_TRUE); 1568 fail: 1569 freemsg(actmp); 1570 freemsg(deactmp); 1571 freemsg(linkupmp); 1572 freemsg(linkdownmp); 1573 return (B_FALSE); 1574 } 1575 1576 /* 1577 * Deactivate `ill' so it will not be used to send or receive data traffic. 1578 */ 1579 static void 1580 ipmp_ill_deactivate(ill_t *ill) 1581 { 1582 ill_t *minill; 1583 ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; 1584 mblk_t *mp; 1585 ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; 1586 ipmp_illgrp_t *illg = ill->ill_grp; 1587 ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); 1588 1589 ASSERT(IAM_WRITER_ILL(ill)); 1590 ASSERT(IS_UNDER_IPMP(ill)); 1591 1592 /* 1593 * Delete IRE_CACHE entries tied to this ill before they become stale. 1594 */ 1595 ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, 1596 ill_stq_cache_delete, ill, ill); 1597 1598 /* 1599 * Pull the interface out of the active list. 1600 */ 1601 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1602 list_remove(&illg->ig_actif, ill); 1603 illg->ig_nactif--; 1604 illg->ig_next_ill = list_head(&illg->ig_actif); 1605 rw_exit(&ipst->ips_ipmp_lock); 1606 1607 /* 1608 * If the ill that's being deactivated had been nominated for 1609 * multicast/broadcast, nominate a new one. 1610 */ 1611 if (ill == illg->ig_cast_ill) 1612 ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); 1613 1614 /* 1615 * Unbind all of the ipifs bound to this ill, and save 'em in a list; 1616 * we'll rebind them after we tell the resolver the ill is no longer 1617 * active. We must do things in this order or the resolver could 1618 * accidentally rebind to the ill we're trying to remove if multiple 1619 * ills in the group have the same hardware address (which is 1620 * unsupported, but shouldn't lead to a wedged machine). 1621 */ 1622 while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { 1623 ipif->ipif_bound_next = ubheadipif; 1624 ubheadipif = ipif; 1625 } 1626 1627 if (!ill->ill_isv6) { 1628 /* 1629 * Tell ARP `ill' is no longer active in the group. 1630 */ 1631 mp = ill->ill_ardeact_mp; 1632 ill->ill_ardeact_mp = NULL; 1633 ASSERT(mp != NULL); 1634 putnext(illg->ig_ipmp_ill->ill_rq, mp); 1635 1636 /* 1637 * Refresh any ARP entries that had been using `ill'. 1638 */ 1639 ipmp_illgrp_refresh_arpent(illg); 1640 } 1641 1642 /* 1643 * Rebind each ipif from the deactivated ill to the active ill with 1644 * the fewest ipifs. If there are no active ills, the ipifs will 1645 * remain unbound. 1646 */ 1647 for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { 1648 ubnextipif = ipif->ipif_bound_next; 1649 ipif->ipif_bound_next = NULL; 1650 1651 if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) 1652 ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); 1653 } 1654 1655 /* 1656 * Finally, mark the group link down, if necessary. 1657 */ 1658 if (--grp->gr_nactif == 0) { 1659 mp = grp->gr_linkdownmp; 1660 grp->gr_linkdownmp = NULL; 1661 ASSERT(mp != NULL); 1662 put(illg->ig_ipmp_ill->ill_rq, mp); 1663 } 1664 } 1665 1666 /* 1667 * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) 1668 * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. 1669 */ 1670 static void 1671 ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) 1672 { 1673 ipif_t *ipif; 1674 1675 ASSERT(IAM_WRITER_ILL(ill)); 1676 ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); 1677 1678 /* 1679 * If `ill' is truly down, there are no messages to generate since: 1680 * 1681 * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface 1682 * and its addresses by bringing them down. But that's already 1683 * true, so there's nothing to hide. 1684 * 1685 * 2. If cmd == RTM_ADD, then we're supposed to generate messages 1686 * indicating that any previously-hidden up addresses are again 1687 * back up (along with the interface). But they aren't, so 1688 * there's nothing to expose. 1689 */ 1690 if (ill->ill_ipif_up_count == 0) 1691 return; 1692 1693 if (cmd == RTM_ADD) 1694 ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); 1695 1696 for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) 1697 if (ipif->ipif_flags & IPIF_UP) 1698 ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); 1699 1700 if (cmd == RTM_DELETE) 1701 ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); 1702 } 1703 1704 /* 1705 * Bind the address named by `ipif' to the underlying ill named by `ill'. 1706 * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' 1707 * will indicate to the resolver whether this is an initial bringup of 1708 * `ipif', or just a rebind to another ill. 1709 */ 1710 static void 1711 ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) 1712 { 1713 int err = 0; 1714 ip_stack_t *ipst = ill->ill_ipst; 1715 1716 ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); 1717 ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); 1718 ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); 1719 ASSERT(ipif->ipif_bound_ill == NULL); 1720 ASSERT(ipif->ipif_bound_next == NULL); 1721 1722 ipif->ipif_bound_next = ill->ill_bound_ipif; 1723 ill->ill_bound_ipif = ipif; 1724 ill->ill_bound_cnt++; 1725 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1726 ipif->ipif_bound_ill = ill; 1727 rw_exit(&ipst->ips_ipmp_lock); 1728 1729 /* 1730 * If necessary, tell ARP/NDP about the new mapping. Note that 1731 * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. 1732 */ 1733 if (act != Res_act_none) { 1734 if (ill->ill_isv6) { 1735 VERIFY(ipif_resolver_up(ipif, act) == 0); 1736 err = ipif_ndp_up(ipif, act == Res_act_initial); 1737 } else { 1738 err = ipif_resolver_up(ipif, act); 1739 } 1740 1741 /* 1742 * Since ipif_ndp_up() never returns EINPROGRESS and 1743 * ipif_resolver_up() only returns EINPROGRESS when the 1744 * associated ill is not up, we should never be here with 1745 * EINPROGRESS. We rely on this to simplify the design. 1746 */ 1747 ASSERT(err != EINPROGRESS); 1748 } 1749 /* TODO: retry binding on failure? when? */ 1750 ipif->ipif_bound = (err == 0); 1751 } 1752 1753 /* 1754 * Unbind the address named by `ipif' from the underlying ill named by `ill'. 1755 * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. 1756 * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is 1757 * B_TRUE, notify the resolver about the change. 1758 */ 1759 static ipif_t * 1760 ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) 1761 { 1762 ill_t *ipmp_ill; 1763 ipif_t *previpif; 1764 ip_stack_t *ipst = ill->ill_ipst; 1765 1766 ASSERT(IAM_WRITER_ILL(ill)); 1767 ASSERT(IS_UNDER_IPMP(ill)); 1768 1769 ipmp_ill = ill->ill_grp->ig_ipmp_ill; 1770 1771 /* 1772 * If necessary, find an ipif to unbind. 1773 */ 1774 if (ipif == NULL) { 1775 if ((ipif = ill->ill_bound_ipif) == NULL) { 1776 ASSERT(ill->ill_bound_cnt == 0); 1777 return (NULL); 1778 } 1779 } 1780 1781 ASSERT(IAM_WRITER_IPIF(ipif)); 1782 ASSERT(IS_IPMP(ipif->ipif_ill)); 1783 ASSERT(ipif->ipif_bound_ill == ill); 1784 ASSERT(ill->ill_bound_cnt > 0); 1785 1786 /* 1787 * Unbind it. 1788 */ 1789 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1790 ipif->ipif_bound_ill = NULL; 1791 rw_exit(&ipst->ips_ipmp_lock); 1792 ill->ill_bound_cnt--; 1793 1794 if (ill->ill_bound_ipif == ipif) { 1795 ill->ill_bound_ipif = ipif->ipif_bound_next; 1796 } else { 1797 previpif = ill->ill_bound_ipif; 1798 while (previpif->ipif_bound_next != ipif) 1799 previpif = previpif->ipif_bound_next; 1800 1801 previpif->ipif_bound_next = ipif->ipif_bound_next; 1802 } 1803 ipif->ipif_bound_next = NULL; 1804 1805 /* 1806 * If requested, notify the resolvers (provided we're bound). 1807 */ 1808 if (notifyres && ipif->ipif_bound) { 1809 if (ill->ill_isv6) { 1810 ipif_ndp_down(ipif); 1811 } else { 1812 ASSERT(ipif->ipif_arp_del_mp != NULL); 1813 putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); 1814 ipif->ipif_arp_del_mp = NULL; 1815 } 1816 } 1817 ipif->ipif_bound = B_FALSE; 1818 1819 return (ipif); 1820 } 1821 1822 /* 1823 * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if 1824 * it's not inside the IPSQ. Since ipmp_ill_try_refresh_active() calls this 1825 * to determine whether an ill should be considered active, other consumers 1826 * may race and learn about an ill that should be deactivated/activated before 1827 * IPMP has performed the activation/deactivation. This should be safe though 1828 * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that 1829 * would've been cleaned up by ipmp_ill_deactivate(). 1830 */ 1831 boolean_t 1832 ipmp_ill_is_active(ill_t *ill) 1833 { 1834 phyint_t *phyi = ill->ill_phyint; 1835 1836 ASSERT(IS_UNDER_IPMP(ill)); 1837 ASSERT(IAM_WRITER_ILL(ill) || 1838 (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); 1839 1840 /* 1841 * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to 1842 * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the 1843 * link flapping logic to be just in in.mpathd and allows us to ignore 1844 * changes to PHYI_RUNNING. 1845 */ 1846 return (!(ill->ill_ipif_up_count == 0 || 1847 (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); 1848 } 1849 1850 /* 1851 * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet 1852 * IREs with a source address on `ill_arg'. 1853 */ 1854 static void 1855 ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) 1856 { 1857 ill_t *ill = (ill_t *)ill_arg; 1858 1859 ASSERT(IAM_WRITER_ILL(ill)); 1860 ASSERT(!IS_IPMP(ill)); 1861 1862 if (ire->ire_ipif->ipif_ill != ill) 1863 return; 1864 1865 switch (ire->ire_type) { 1866 case IRE_HOST: 1867 case IRE_PREFIX: 1868 case IRE_DEFAULT: 1869 case IRE_CACHE: 1870 case IRE_IF_RESOLVER: 1871 case IRE_IF_NORESOLVER: 1872 DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); 1873 ire->ire_marks |= IRE_MARK_TESTHIDDEN; 1874 break; 1875 default: 1876 break; 1877 } 1878 } 1879 1880 /* 1881 * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source 1882 * address on `ill_arg'. 1883 */ 1884 static void 1885 ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) 1886 { 1887 ill_t *ill = (ill_t *)ill_arg; 1888 1889 ASSERT(IAM_WRITER_ILL(ill)); 1890 ASSERT(!IS_IPMP(ill)); 1891 1892 if (ire->ire_ipif->ipif_ill == ill) { 1893 DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); 1894 ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; 1895 } 1896 } 1897 1898 /* 1899 * Return a held pointer to the IPMP ill for underlying interface `ill', or 1900 * NULL if one doesn't exist. (Unfortunately, this function needs to take an 1901 * underlying ill rather than an ipmp_illgrp_t because an underlying ill's 1902 * ill_grp pointer may become stale when not under an IPSQ and not holding 1903 * ipmp_lock.) Caller need not be inside the IPSQ. 1904 */ 1905 ill_t * 1906 ipmp_ill_hold_ipmp_ill(ill_t *ill) 1907 { 1908 ip_stack_t *ipst = ill->ill_ipst; 1909 ipmp_illgrp_t *illg; 1910 1911 ASSERT(!IS_IPMP(ill)); 1912 1913 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 1914 illg = ill->ill_grp; 1915 if (illg != NULL && ILL_CAN_LOOKUP(illg->ig_ipmp_ill)) { 1916 ill_refhold(illg->ig_ipmp_ill); 1917 rw_exit(&ipst->ips_ipmp_lock); 1918 return (illg->ig_ipmp_ill); 1919 } 1920 /* 1921 * Assume `ill' was removed from the illgrp in the meantime. 1922 */ 1923 rw_exit(&ill->ill_ipst->ips_ipmp_lock); 1924 return (NULL); 1925 } 1926 1927 /* 1928 * Return the interface index for the IPMP ill tied to underlying interface 1929 * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. 1930 */ 1931 uint_t 1932 ipmp_ill_get_ipmp_ifindex(const ill_t *ill) 1933 { 1934 uint_t ifindex = 0; 1935 ip_stack_t *ipst = ill->ill_ipst; 1936 ipmp_grp_t *grp; 1937 1938 ASSERT(!IS_IPMP(ill)); 1939 1940 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 1941 if ((grp = ill->ill_phyint->phyint_grp) != NULL) 1942 ifindex = grp->gr_phyint->phyint_ifindex; 1943 rw_exit(&ipst->ips_ipmp_lock); 1944 return (ifindex); 1945 } 1946 1947 /* 1948 * Place phyint `phyi' into IPMP group `grp'. 1949 */ 1950 void 1951 ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) 1952 { 1953 ill_t *ill; 1954 ipsq_t *ipsq = phyi->phyint_ipsq; 1955 ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; 1956 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 1957 1958 ASSERT(IAM_WRITER_IPSQ(ipsq)); 1959 ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); 1960 1961 /* 1962 * Send routing socket messages indicating that the phyint's ills 1963 * and ipifs vanished. 1964 */ 1965 if (phyi->phyint_illv4 != NULL) { 1966 ill = phyi->phyint_illv4; 1967 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); 1968 } 1969 1970 if (phyi->phyint_illv6 != NULL) { 1971 ill = phyi->phyint_illv6; 1972 ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); 1973 } 1974 1975 /* 1976 * Snapshot the phyint's initial kstats as a baseline. 1977 */ 1978 ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0); 1979 1980 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 1981 1982 phyi->phyint_grp = grp; 1983 if (++grp->gr_nif == 1) 1984 grp->gr_mactype = ill->ill_mactype; 1985 else 1986 ASSERT(grp->gr_mactype == ill->ill_mactype); 1987 1988 /* 1989 * Now that we're in the group, request a switch to the group's xop 1990 * when we ipsq_exit(). All future operations will be exclusive on 1991 * the group xop until ipmp_phyint_leave_grp() is called. 1992 */ 1993 ASSERT(ipsq->ipsq_swxop == NULL); 1994 ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop); 1995 ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop; 1996 1997 rw_exit(&ipst->ips_ipmp_lock); 1998 } 1999 2000 /* 2001 * Remove phyint `phyi' from its current IPMP group. 2002 */ 2003 void 2004 ipmp_phyint_leave_grp(phyint_t *phyi) 2005 { 2006 uint_t i; 2007 ipsq_t *ipsq = phyi->phyint_ipsq; 2008 ip_stack_t *ipst = PHYINT_TO_IPST(phyi); 2009 uint64_t phyi_kstats[IPMP_KSTAT_MAX]; 2010 2011 ASSERT(IAM_WRITER_IPSQ(ipsq)); 2012 2013 /* 2014 * If any of the phyint's ills are still in an illgrp, kick 'em out. 2015 */ 2016 if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4)) 2017 ipmp_ill_leave_illgrp(phyi->phyint_illv4); 2018 if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6)) 2019 ipmp_ill_leave_illgrp(phyi->phyint_illv6); 2020 2021 /* 2022 * Send routing socket messages indicating that the phyint's ills 2023 * and ipifs have reappeared. 2024 */ 2025 if (phyi->phyint_illv4 != NULL) 2026 ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD); 2027 if (phyi->phyint_illv6 != NULL) 2028 ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD); 2029 2030 /* 2031 * Calculate the phyint's cumulative kstats while it was in the group, 2032 * and add that to the group's baseline. 2033 */ 2034 ipmp_phyint_get_kstats(phyi, phyi_kstats); 2035 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 2036 phyi_kstats[i] -= phyi->phyint_kstats0[i]; 2037 atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]); 2038 } 2039 2040 rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); 2041 2042 phyi->phyint_grp->gr_nif--; 2043 phyi->phyint_grp = NULL; 2044 2045 /* 2046 * As our final act in leaving the group, request a switch back to our 2047 * IPSQ's own xop when we ipsq_exit(). 2048 */ 2049 ASSERT(ipsq->ipsq_swxop == NULL); 2050 ipsq->ipsq_swxop = &ipsq->ipsq_ownxop; 2051 2052 rw_exit(&ipst->ips_ipmp_lock); 2053 } 2054 2055 /* 2056 * Store the IPMP-related kstats for `phyi' into the array named by `kstats'. 2057 * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements. 2058 */ 2059 static void 2060 ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[]) 2061 { 2062 uint_t i, j; 2063 const char *name; 2064 kstat_t *ksp; 2065 kstat_named_t *kn; 2066 2067 bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX); 2068 2069 /* 2070 * NOTE: ALL_ZONES here assumes that there's at most one link 2071 * with a given name on a given system (safe for now). 2072 */ 2073 ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES); 2074 if (ksp == NULL) 2075 return; 2076 2077 KSTAT_ENTER(ksp); 2078 2079 if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { 2080 /* 2081 * Bring kstats up-to-date before recording. 2082 */ 2083 (void) KSTAT_UPDATE(ksp, KSTAT_READ); 2084 2085 kn = KSTAT_NAMED_PTR(ksp); 2086 for (i = 0; i < IPMP_KSTAT_MAX; i++) { 2087 name = ipmp_kstats[i].name; 2088 kstats[i] = 0; 2089 for (j = 0; j < ksp->ks_ndata; j++) { 2090 if (strcmp(kn[j].name, name) != 0) 2091 continue; 2092 2093 switch (kn[j].data_type) { 2094 case KSTAT_DATA_INT32: 2095 case KSTAT_DATA_UINT32: 2096 kstats[i] = kn[j].value.ui32; 2097 break; 2098 #ifdef _LP64 2099 case KSTAT_DATA_LONG: 2100 case KSTAT_DATA_ULONG: 2101 kstats[i] = kn[j].value.ul; 2102 break; 2103 #endif 2104 case KSTAT_DATA_INT64: 2105 case KSTAT_DATA_UINT64: 2106 kstats[i] = kn[j].value.ui64; 2107 break; 2108 } 2109 break; 2110 } 2111 } 2112 } 2113 2114 KSTAT_EXIT(ksp); 2115 kstat_rele(ksp); 2116 } 2117 2118 /* 2119 * Refresh the active state of all ills on `phyi'. 2120 */ 2121 void 2122 ipmp_phyint_refresh_active(phyint_t *phyi) 2123 { 2124 if (phyi->phyint_illv4 != NULL) 2125 ipmp_ill_refresh_active(phyi->phyint_illv4); 2126 if (phyi->phyint_illv6 != NULL) 2127 ipmp_ill_refresh_active(phyi->phyint_illv6); 2128 } 2129 2130 /* 2131 * Return a held pointer to the underlying ill bound to `ipif', or NULL if one 2132 * doesn't exist. Caller need not be inside the IPSQ. 2133 */ 2134 ill_t * 2135 ipmp_ipif_hold_bound_ill(const ipif_t *ipif) 2136 { 2137 ill_t *boundill; 2138 ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; 2139 2140 ASSERT(IS_IPMP(ipif->ipif_ill)); 2141 2142 rw_enter(&ipst->ips_ipmp_lock, RW_READER); 2143 boundill = ipif->ipif_bound_ill; 2144 if (boundill != NULL && ILL_CAN_LOOKUP(boundill)) { 2145 ill_refhold(boundill); 2146 rw_exit(&ipst->ips_ipmp_lock); 2147 return (boundill); 2148 } 2149 rw_exit(&ipst->ips_ipmp_lock); 2150 return (NULL); 2151 } 2152 2153 /* 2154 * Return a pointer to the underlying ill bound to `ipif', or NULL if one 2155 * doesn't exist. Caller must be inside the IPSQ. 2156 */ 2157 ill_t * 2158 ipmp_ipif_bound_ill(const ipif_t *ipif) 2159 { 2160 ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); 2161 ASSERT(IS_IPMP(ipif->ipif_ill)); 2162 2163 return (ipif->ipif_bound_ill); 2164 } 2165 2166 /* 2167 * Check if `ipif' is a "stub" (placeholder address not being used). 2168 */ 2169 boolean_t 2170 ipmp_ipif_is_stubaddr(const ipif_t *ipif) 2171 { 2172 if (ipif->ipif_flags & IPIF_UP) 2173 return (B_FALSE); 2174 if (ipif->ipif_ill->ill_isv6) 2175 return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); 2176 else 2177 return (ipif->ipif_lcl_addr == INADDR_ANY); 2178 } 2179 2180 /* 2181 * Check if `ipif' is an IPMP data address. 2182 */ 2183 boolean_t 2184 ipmp_ipif_is_dataaddr(const ipif_t *ipif) 2185 { 2186 if (ipif->ipif_flags & IPIF_NOFAILOVER) 2187 return (B_FALSE); 2188 if (ipif->ipif_ill->ill_isv6) 2189 return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); 2190 else 2191 return (ipif->ipif_lcl_addr != INADDR_ANY); 2192 } 2193 2194 /* 2195 * Check if `ipif' is an IPIF_UP IPMP data address. 2196 */ 2197 static boolean_t 2198 ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) 2199 { 2200 return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); 2201 } 2202