/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"

#include "mpd_defs.h"
#include "mpd_tables.h"

/*
 * Global list of phyints, phyint instances, phyint groups and the anonymous
 * group; the latter is initialized in phyint_init().
 *
 * Each of the first three is the head of a doubly-linked list; new entries
 * are pushed at the head by phyint_insert(), phyint_inst_insert() and
 * phyint_group_insert() respectively.  phyint_init() only creates the
 * anonymous group when track_all_phyints is set.
 */
struct phyint *phyints = NULL;
struct phyint_instance *phyint_instances = NULL;
struct phyint_group *phyint_groups = NULL;
struct phyint_group *phyint_anongroup;

/*
 * Grouplist signature; initialized in phyint_init().
 * It is seeded from gensig() and incremented by phyint_group_insert() and
 * phyint_group_delete().
 */
static uint64_t phyint_grouplistsig;

/* phyint instance list maintenance and debug output */
static void phyint_inst_insert(struct phyint_instance *pii);
static void phyint_inst_print(struct phyint_instance *pii);

/* phyint list maintenance */
static void phyint_insert(struct phyint *pi, struct phyint_group *pg);
static void phyint_delete(struct phyint *pi);

/* phyint group list maintenance */
static void phyint_group_insert(struct phyint_group *pg);
static void phyint_group_delete(struct phyint_group *pg);
static struct phyint_group *phyint_group_lookup(const char *pg_name);
static struct phyint_group *phyint_group_create(const char *pg_name);

/* logint helpers */
static void logint_print(struct logint *li);
static void logint_insert(struct phyint_instance *pii, struct logint *li);
static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name);

/* probe target helpers */
static void target_print(struct target *tg);
static void target_insert(struct phyint_instance *pii, struct target *tg);
static struct target *target_first(struct phyint_instance *pii);
static struct target *target_select_best(struct phyint_instance *pii);
static void target_flush_hosts(struct phyint_group *pg);

static void reset_pii_probes(struct phyint_instance *pii, struct target *tg);

/* Protocol-specific halves of phyint_inst_sockinit() */
static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii);
static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii);

static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask);
static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2,
    int prefix_len);

/* Notifications issued when a phyint or group is added, removed or changed */
static int phyint_state_event(struct phyint_group *pg, struct phyint *pi);
static int phyint_group_state_event(struct phyint_group *pg);
static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t);
static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op);

/* Produces the initial value for the group and grouplist signatures */
static uint64_t gensig(void);

/* Initialize any per-file global state.
Returns 0 on success, -1 on failure */ 84 int 85 phyint_init(void) 86 { 87 phyint_grouplistsig = gensig(); 88 if (track_all_phyints) { 89 phyint_anongroup = phyint_group_create(""); 90 if (phyint_anongroup == NULL) 91 return (-1); 92 phyint_group_insert(phyint_anongroup); 93 } 94 return (0); 95 } 96 97 /* Return the phyint with the given name */ 98 struct phyint * 99 phyint_lookup(const char *name) 100 { 101 struct phyint *pi; 102 103 if (debug & D_PHYINT) 104 logdebug("phyint_lookup(%s)\n", name); 105 106 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 107 if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0) 108 break; 109 } 110 return (pi); 111 } 112 113 /* Return the phyint instance with the given name and the given family */ 114 struct phyint_instance * 115 phyint_inst_lookup(int af, char *name) 116 { 117 struct phyint *pi; 118 119 if (debug & D_PHYINT) 120 logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name); 121 122 assert(af == AF_INET || af == AF_INET6); 123 124 pi = phyint_lookup(name); 125 if (pi == NULL) 126 return (NULL); 127 128 return (PHYINT_INSTANCE(pi, af)); 129 } 130 131 static struct phyint_group * 132 phyint_group_lookup(const char *pg_name) 133 { 134 struct phyint_group *pg; 135 136 if (debug & D_PHYINT) 137 logdebug("phyint_group_lookup(%s)\n", pg_name); 138 139 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 140 if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0) 141 break; 142 } 143 return (pg); 144 } 145 146 /* 147 * Insert the phyint in the linked list of all phyints. If the phyint belongs 148 * to some group, insert it in the phyint group list. 
149 */ 150 static void 151 phyint_insert(struct phyint *pi, struct phyint_group *pg) 152 { 153 if (debug & D_PHYINT) 154 logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name); 155 156 /* Insert the phyint at the head of the 'all phyints' list */ 157 pi->pi_next = phyints; 158 pi->pi_prev = NULL; 159 if (phyints != NULL) 160 phyints->pi_prev = pi; 161 phyints = pi; 162 163 /* 164 * Insert the phyint at the head of the 'phyint_group members' list 165 * of the phyint group to which it belongs. 166 */ 167 pi->pi_pgnext = NULL; 168 pi->pi_pgprev = NULL; 169 pi->pi_group = pg; 170 171 pi->pi_pgnext = pg->pg_phyint; 172 if (pi->pi_pgnext != NULL) 173 pi->pi_pgnext->pi_pgprev = pi; 174 pg->pg_phyint = pi; 175 176 pg->pg_sig++; 177 (void) phyint_group_member_event(pg, pi, IPMP_IF_ADD); 178 } 179 180 /* Insert the phyint instance in the linked list of all phyint instances. */ 181 static void 182 phyint_inst_insert(struct phyint_instance *pii) 183 { 184 if (debug & D_PHYINT) { 185 logdebug("phyint_inst_insert(%s %s)\n", 186 AF_STR(pii->pii_af), pii->pii_name); 187 } 188 189 /* 190 * Insert the phyint at the head of the 'all phyint instances' list. 191 */ 192 pii->pii_next = phyint_instances; 193 pii->pii_prev = NULL; 194 if (phyint_instances != NULL) 195 phyint_instances->pii_prev = pii; 196 phyint_instances = pii; 197 } 198 199 /* 200 * Create a new phyint with the given parameters. Also insert it into 201 * the list of all phyints and the list of phyint group members by calling 202 * phyint_insert(). 203 */ 204 static struct phyint * 205 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, 206 uint64_t flags) 207 { 208 struct phyint *pi; 209 210 pi = calloc(1, sizeof (struct phyint)); 211 if (pi == NULL) { 212 logperror("phyint_create: calloc"); 213 return (NULL); 214 } 215 216 /* 217 * Record the phyint values. Also insert the phyint into the 218 * phyint group by calling phyint_insert(). 
219 */ 220 (void) strlcpy(pi->pi_name, pi_name, sizeof (pi->pi_name)); 221 pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; 222 pi->pi_ifindex = ifindex; 223 pi->pi_icmpid = 224 htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF)); 225 /* 226 * We optimistically start in the PI_RUNNING state. Later (in 227 * process_link_state_changes()), we will readjust this to match the 228 * current state of the link. Further, if test addresses are 229 * subsequently assigned, we will transition to PI_NOTARGETS and then 230 * either PI_RUNNING or PI_FAILED, depending on the result of the test 231 * probes. 232 */ 233 pi->pi_state = PI_RUNNING; 234 pi->pi_flags = PHYINT_FLAGS(flags); 235 /* 236 * Initialise the link state. The link state is initialised to 237 * up, so that if the link is down when IPMP starts monitoring 238 * the interface, it will appear as though there has been a 239 * transition from the link up to link down. This avoids 240 * having to treat this situation as a special case. 241 */ 242 INIT_LINK_STATE(pi); 243 244 /* 245 * Insert the phyint in the list of all phyints, and the 246 * list of phyint group members 247 */ 248 phyint_insert(pi, pg); 249 250 /* 251 * If we are joining a failed group, mark the interface as 252 * failed. 253 */ 254 if (GROUP_FAILED(pg)) 255 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 256 257 return (pi); 258 } 259 260 /* 261 * Create a new phyint instance belonging to the phyint 'pi' and address 262 * family 'af'. Also insert it into the list of all phyint instances by 263 * calling phyint_inst_insert(). 264 */ 265 static struct phyint_instance * 266 phyint_inst_create(struct phyint *pi, int af) 267 { 268 struct phyint_instance *pii; 269 270 pii = calloc(1, sizeof (struct phyint_instance)); 271 if (pii == NULL) { 272 logperror("phyint_inst_create: calloc"); 273 return (NULL); 274 } 275 276 /* 277 * Attach the phyint instance to the phyint. 
278 * Set the back pointers as well 279 */ 280 pii->pii_phyint = pi; 281 if (af == AF_INET) 282 pi->pi_v4 = pii; 283 else 284 pi->pi_v6 = pii; 285 286 pii->pii_in_use = 1; 287 pii->pii_probe_sock = -1; 288 pii->pii_snxt = 1; 289 pii->pii_af = af; 290 pii->pii_fd_hrtime = gethrtime() + 291 (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC); 292 pii->pii_flags = pi->pi_flags; 293 294 /* Insert the phyint instance in the list of all phyint instances. */ 295 phyint_inst_insert(pii); 296 return (pii); 297 } 298 299 /* 300 * Change the state of phyint `pi' to state `state'. 301 */ 302 void 303 phyint_chstate(struct phyint *pi, enum pi_state state) 304 { 305 /* 306 * To simplify things, some callers always set a given state 307 * regardless of the previous state of the phyint (e.g., setting 308 * PI_RUNNING when it's already set). We shouldn't bother 309 * generating an event or consuming a signature for these, since 310 * the actual state of the interface is unchanged. 311 */ 312 if (pi->pi_state == state) 313 return; 314 315 pi->pi_state = state; 316 pi->pi_group->pg_sig++; 317 (void) phyint_state_event(pi->pi_group, pi); 318 } 319 320 /* 321 * Note that the type of phyint `pi' has changed. 322 */ 323 void 324 phyint_newtype(struct phyint *pi) 325 { 326 pi->pi_group->pg_sig++; 327 (void) phyint_state_event(pi->pi_group, pi); 328 } 329 330 /* 331 * Insert the phyint group in the linked list of all phyint groups 332 * at the head of the list 333 */ 334 static void 335 phyint_group_insert(struct phyint_group *pg) 336 { 337 pg->pg_next = phyint_groups; 338 pg->pg_prev = NULL; 339 if (phyint_groups != NULL) 340 phyint_groups->pg_prev = pg; 341 phyint_groups = pg; 342 343 phyint_grouplistsig++; 344 (void) phyint_group_change_event(pg, IPMP_GROUP_ADD); 345 } 346 347 /* 348 * Create a new phyint group called 'name'. 
349 */ 350 static struct phyint_group * 351 phyint_group_create(const char *name) 352 { 353 struct phyint_group *pg; 354 355 if (debug & D_PHYINT) 356 logdebug("phyint_group_create(%s)\n", name); 357 358 pg = calloc(1, sizeof (struct phyint_group)); 359 if (pg == NULL) { 360 logperror("phyint_group_create: calloc"); 361 return (NULL); 362 } 363 364 (void) strlcpy(pg->pg_name, name, sizeof (pg->pg_name)); 365 pg->pg_sig = gensig(); 366 367 pg->pg_fdt = user_failure_detection_time; 368 pg->pg_probeint = user_probe_interval; 369 370 return (pg); 371 } 372 373 /* 374 * Change the state of the phyint group `pg' to state `state'. 375 */ 376 void 377 phyint_group_chstate(struct phyint_group *pg, enum pg_state state) 378 { 379 assert(pg != phyint_anongroup); 380 381 switch (state) { 382 case PG_FAILED: 383 pg->pg_groupfailed = 1; 384 385 /* 386 * We can never know with certainty that a group has 387 * failed. It is possible that all known targets have 388 * failed simultaneously, and new targets have come up 389 * instead. If the targets are routers then router 390 * discovery will kick in, and we will see the new routers 391 * thru routing socket messages. But if the targets are 392 * hosts, we have to discover it by multicast. So flush 393 * all the host targets. The next probe will send out a 394 * multicast echo request. If this is a group failure, we 395 * will still not see any response, otherwise we will 396 * clear the pg_groupfailed flag after we get 397 * NUM_PROBE_REPAIRS consecutive unicast replies on any 398 * phyint. 399 */ 400 target_flush_hosts(pg); 401 break; 402 403 case PG_RUNNING: 404 pg->pg_groupfailed = 0; 405 break; 406 407 default: 408 logerr("phyint_group_chstate: invalid group state %d; " 409 "aborting\n", state); 410 abort(); 411 } 412 413 pg->pg_sig++; 414 (void) phyint_group_state_event(pg); 415 } 416 417 /* 418 * Create a new phyint instance and initialize it from the values supplied by 419 * the kernel. 
Always check for ENXIO before logging any error, because the 420 * interface could have vanished after completion of SIOCGLIFCONF. 421 * Return values: 422 * pointer to the phyint instance on success 423 * NULL on failure Eg. if the phyint instance is not found in the kernel 424 */ 425 struct phyint_instance * 426 phyint_inst_init_from_k(int af, char *pi_name) 427 { 428 char pg_name[LIFNAMSIZ + 1]; 429 int ifsock; 430 uint_t ifindex; 431 uint64_t flags; 432 struct lifreq lifr; 433 struct phyint *pi; 434 struct phyint_instance *pii; 435 boolean_t pg_created; 436 boolean_t pi_created; 437 struct phyint_group *pg; 438 439 retry: 440 pii = NULL; 441 pi = NULL; 442 pg = NULL; 443 pi_created = _B_FALSE; 444 pg_created = _B_FALSE; 445 446 if (debug & D_PHYINT) { 447 logdebug("phyint_inst_init_from_k(%s %s)\n", 448 AF_STR(af), pi_name); 449 } 450 451 assert(af == AF_INET || af == AF_INET6); 452 453 /* Get the socket for doing ioctls */ 454 ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 455 456 /* 457 * Get the interface flags. Ignore loopback and multipoint 458 * interfaces. 459 */ 460 (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); 461 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 462 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 463 if (errno != ENXIO) { 464 logperror("phyint_inst_init_from_k:" 465 " ioctl (get flags)"); 466 } 467 return (NULL); 468 } 469 flags = lifr.lifr_flags; 470 if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK)) 471 return (NULL); 472 473 /* 474 * Get the ifindex for recording later in our tables, in case we need 475 * to create a new phyint. 476 */ 477 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) { 478 if (errno != ENXIO) { 479 logperror("phyint_inst_init_from_k: " 480 " ioctl (get lifindex)"); 481 } 482 return (NULL); 483 } 484 ifindex = lifr.lifr_index; 485 486 /* 487 * Get the phyint group name of this phyint, from the kernel. 
488 */ 489 if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) { 490 if (errno != ENXIO) { 491 logperror("phyint_inst_init_from_k: " 492 "ioctl (get group name)"); 493 } 494 return (NULL); 495 } 496 (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); 497 pg_name[sizeof (pg_name) - 1] = '\0'; 498 499 /* 500 * If the phyint is not part of any group, pg_name is the 501 * null string. If 'track_all_phyints' is false, there is no 502 * need to create a phyint. 503 */ 504 if (pg_name[0] == '\0' && !track_all_phyints) { 505 /* 506 * If the IFF_FAILED or IFF_OFFLINE flags are set, reset 507 * them. These flags shouldn't be set if IPMP isn't 508 * tracking the interface. 509 */ 510 if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) { 511 lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE); 512 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 513 if (errno != ENXIO) { 514 logperror("phyint_inst_init_from_k:" 515 " ioctl (set flags)"); 516 } 517 } 518 } 519 return (NULL); 520 } 521 522 /* 523 * We need to create a new phyint instance. A phyint instance 524 * belongs to a phyint, and the phyint belongs to a phyint group. 525 * So we first lookup the 'parents' and if they don't exist then 526 * we create them. 527 */ 528 pg = phyint_group_lookup(pg_name); 529 if (pg == NULL) { 530 pg = phyint_group_create(pg_name); 531 if (pg == NULL) { 532 logerr("phyint_inst_init_from_k:" 533 " unable to create group %s\n", pg_name); 534 return (NULL); 535 } 536 phyint_group_insert(pg); 537 pg_created = _B_TRUE; 538 } 539 540 /* 541 * Lookup the phyint. If the phyint does not exist create it. 542 */ 543 pi = phyint_lookup(pi_name); 544 if (pi == NULL) { 545 pi = phyint_create(pi_name, pg, ifindex, flags); 546 if (pi == NULL) { 547 logerr("phyint_inst_init_from_k:" 548 " unable to create phyint %s\n", pi_name); 549 if (pg_created) 550 phyint_group_delete(pg); 551 return (NULL); 552 } 553 pi_created = _B_TRUE; 554 } else { 555 /* The phyint exists already. 
*/ 556 assert(pi_created == _B_FALSE); 557 /* 558 * Normally we should see consistent values for the IPv4 and 559 * IPv6 instances, for phyint properties. If we don't, it 560 * means things have changed underneath us, and we should 561 * resync our tables with the kernel. Check whether the 562 * interface index has changed. If so, it is most likely 563 * the interface has been unplumbed and replumbed, 564 * while we are yet to update our tables. Do it now. 565 */ 566 if (pi->pi_ifindex != ifindex) { 567 if (pg_created) 568 phyint_group_delete(pg); 569 phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); 570 goto retry; 571 } 572 assert(PHYINT_INSTANCE(pi, af) == NULL); 573 574 /* 575 * If the group name seen by the IPv4 and IPv6 instances 576 * are different, it is most likely the groupname has 577 * changed, while we are yet to update our tables. Do it now. 578 */ 579 if (strcmp(pi->pi_group->pg_name, pg_name) != 0) { 580 if (pg_created) 581 phyint_group_delete(pg); 582 restore_phyint(pi); 583 phyint_inst_delete(PHYINT_INSTANCE(pi, 584 AF_OTHER(af))); 585 goto retry; 586 } 587 } 588 589 /* 590 * Create a new phyint instance, corresponding to the 'af' 591 * passed in. 592 */ 593 pii = phyint_inst_create(pi, af); 594 if (pii == NULL) { 595 logerr("phyint_inst_init_from_k: unable to create" 596 "phyint inst %s\n", pi->pi_name); 597 if (pi_created) { 598 /* 599 * Deleting the phyint will delete the phyint group 600 * if this is the last phyint in the group. 601 */ 602 phyint_delete(pi); 603 } 604 return (NULL); 605 } 606 607 return (pii); 608 } 609 610 /* 611 * Bind pii_probe_sock to the address associated with pii_probe_logint. 612 * This socket will be used for sending and receiving ICMP/ICMPv6 probes to 613 * targets. Do the common part in this function, and complete the 614 * initializations by calling the protocol specific functions 615 * phyint_inst_v{4,6}_sockinit() respectively. 616 * 617 * Return values: _B_TRUE/_B_FALSE for success or failure respectively. 
618 */ 619 boolean_t 620 phyint_inst_sockinit(struct phyint_instance *pii) 621 { 622 boolean_t success; 623 struct phyint_group *pg; 624 625 if (debug & D_PHYINT) { 626 logdebug("phyint_inst_sockinit(%s %s)\n", 627 AF_STR(pii->pii_af), pii->pii_name); 628 } 629 630 assert(pii->pii_probe_logint != NULL); 631 assert(pii->pii_probe_logint->li_flags & IFF_UP); 632 assert(pii->pii_probe_logint->li_flags & IFF_NOFAILOVER); 633 assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6); 634 635 /* 636 * If the socket is already bound, close pii_probe_sock 637 */ 638 if (pii->pii_probe_sock != -1) 639 close_probe_socket(pii, _B_TRUE); 640 641 /* 642 * If the phyint is not part of a named group and track_all_phyints is 643 * false, simply return. 644 */ 645 pg = pii->pii_phyint->pi_group; 646 if (pg == phyint_anongroup && !track_all_phyints) { 647 if (debug & D_PHYINT) 648 logdebug("phyint_inst_sockinit: no group\n"); 649 return (_B_FALSE); 650 } 651 652 /* 653 * Initialize the socket by calling the protocol specific function. 654 * If it succeeds, add the socket to the poll list. 655 */ 656 if (pii->pii_af == AF_INET6) 657 success = phyint_inst_v6_sockinit(pii); 658 else 659 success = phyint_inst_v4_sockinit(pii); 660 661 if (success && (poll_add(pii->pii_probe_sock) == 0)) 662 return (_B_TRUE); 663 664 /* Something failed, cleanup and return false */ 665 if (pii->pii_probe_sock != -1) 666 close_probe_socket(pii, _B_FALSE); 667 668 return (_B_FALSE); 669 } 670 671 /* 672 * IPv6 specific part in initializing the pii_probe_sock. This socket is 673 * used to send/receive ICMPv6 probe packets. 674 */ 675 static boolean_t 676 phyint_inst_v6_sockinit(struct phyint_instance *pii) 677 { 678 icmp6_filter_t filter; 679 int hopcount = 1; 680 int int_op; 681 struct sockaddr_in6 testaddr; 682 683 /* 684 * Open a raw socket with ICMPv6 protocol. 
685 * 686 * Use IPV6_DONTFAILOVER_IF to make sure that probes go out 687 * on the specified phyint only, and are not subject to load 688 * balancing. Bind to the src address chosen will ensure that 689 * the responses are received only on the specified phyint. 690 * 691 * Set the hopcount to 1 so that probe packets are not routed. 692 * Disable multicast loopback. Set the receive filter to 693 * receive only ICMPv6 echo replies. 694 */ 695 pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6); 696 if (pii->pii_probe_sock < 0) { 697 logperror_pii(pii, "phyint_inst_v6_sockinit: socket"); 698 return (_B_FALSE); 699 } 700 701 bzero(&testaddr, sizeof (testaddr)); 702 testaddr.sin6_family = AF_INET6; 703 testaddr.sin6_port = 0; 704 testaddr.sin6_addr = pii->pii_probe_logint->li_addr; 705 706 if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr, 707 sizeof (testaddr)) < 0) { 708 logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind"); 709 return (_B_FALSE); 710 } 711 712 /* 713 * IPV6_DONTFAILOVER_IF option takes precedence over setting 714 * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again. 
715 */ 716 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF, 717 (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) { 718 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 719 " IPV6_DONTFAILOVER_IF"); 720 return (_B_FALSE); 721 } 722 723 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS, 724 (char *)&hopcount, sizeof (hopcount)) < 0) { 725 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 726 " IPV6_UNICAST_HOPS"); 727 return (_B_FALSE); 728 } 729 730 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, 731 (char *)&hopcount, sizeof (hopcount)) < 0) { 732 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 733 " IPV6_MULTICAST_HOPS"); 734 return (_B_FALSE); 735 } 736 737 int_op = 0; /* used to turn off option */ 738 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, 739 (char *)&int_op, sizeof (int_op)) < 0) { 740 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 741 " IPV6_MULTICAST_LOOP"); 742 return (_B_FALSE); 743 } 744 745 /* 746 * Filter out so that we only receive ICMP echo replies 747 */ 748 ICMP6_FILTER_SETBLOCKALL(&filter); 749 ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter); 750 751 if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER, 752 (char *)&filter, sizeof (filter)) < 0) { 753 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 754 " ICMP6_FILTER"); 755 return (_B_FALSE); 756 } 757 758 /* Enable receipt of ancillary data */ 759 int_op = 1; 760 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, 761 (char *)&int_op, sizeof (int_op)) < 0) { 762 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 763 " IPV6_RECVHOPLIMIT"); 764 return (_B_FALSE); 765 } 766 767 return (_B_TRUE); 768 } 769 770 /* 771 * IPv4 specific part in initializing the pii_probe_sock. This socket is 772 * used to send/receive ICMPv4 probe packets. 
773 */ 774 static boolean_t 775 phyint_inst_v4_sockinit(struct phyint_instance *pii) 776 { 777 struct sockaddr_in testaddr; 778 char char_op; 779 int ttl = 1; 780 char char_ttl = 1; 781 782 /* 783 * Open a raw socket with ICMPv4 protocol. 784 * 785 * Use IP_DONTFAILOVER_IF to make sure that probes go out 786 * on the specified phyint only, and are not subject to load 787 * balancing. Bind to the src address chosen will ensure that 788 * the responses are received only on the specified phyint. 789 * 790 * Set the ttl to 1 so that probe packets are not routed. 791 * Disable multicast loopback. 792 */ 793 pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP); 794 if (pii->pii_probe_sock < 0) { 795 logperror_pii(pii, "phyint_inst_v4_sockinit: socket"); 796 return (_B_FALSE); 797 } 798 799 bzero(&testaddr, sizeof (testaddr)); 800 testaddr.sin_family = AF_INET; 801 testaddr.sin_port = 0; 802 IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr, 803 &testaddr.sin_addr); 804 805 if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr, 806 sizeof (testaddr)) < 0) { 807 logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind"); 808 return (_B_FALSE); 809 } 810 811 /* 812 * IP_DONTFAILOVER_IF option takes precedence over setting 813 * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again. 
814 */ 815 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF, 816 (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) { 817 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 818 " IP_DONTFAILOVER"); 819 return (_B_FALSE); 820 } 821 822 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL, 823 (char *)&ttl, sizeof (ttl)) < 0) { 824 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 825 " IP_TTL"); 826 return (_B_FALSE); 827 } 828 829 char_op = 0; /* used to turn off option */ 830 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP, 831 (char *)&char_op, sizeof (char_op)) == -1) { 832 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 833 " IP_MULTICAST_LOOP"); 834 return (_B_FALSE); 835 } 836 837 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL, 838 (char *)&char_ttl, sizeof (char_ttl)) == -1) { 839 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 840 " IP_MULTICAST_TTL"); 841 return (_B_FALSE); 842 } 843 844 return (_B_TRUE); 845 } 846 847 /* 848 * Remove the phyint group from the list of 'all phyint groups' 849 * and free it. 850 */ 851 static void 852 phyint_group_delete(struct phyint_group *pg) 853 { 854 /* 855 * The anonymous group always exists, even when empty. 856 */ 857 if (pg == phyint_anongroup) 858 return; 859 860 if (debug & D_PHYINT) 861 logdebug("phyint_group_delete('%s')\n", pg->pg_name); 862 863 /* 864 * The phyint group must be empty, and must not have any phyints. 
865 * The phyint group must be in the list of all phyint groups 866 */ 867 assert(pg->pg_phyint == NULL); 868 assert(phyint_groups == pg || pg->pg_prev != NULL); 869 870 if (pg->pg_prev != NULL) 871 pg->pg_prev->pg_next = pg->pg_next; 872 else 873 phyint_groups = pg->pg_next; 874 875 if (pg->pg_next != NULL) 876 pg->pg_next->pg_prev = pg->pg_prev; 877 878 pg->pg_next = NULL; 879 pg->pg_prev = NULL; 880 881 phyint_grouplistsig++; 882 (void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE); 883 884 free(pg); 885 } 886 887 /* 888 * Extract information from the kernel about the desired phyint. 889 * Look only for properties of the phyint and not properties of logints. 890 * Take appropriate action on the changes. 891 * Return codes: 892 * PI_OK 893 * The phyint exists in the kernel and matches our knowledge 894 * of the phyint. 895 * PI_DELETED 896 * The phyint has vanished in the kernel. 897 * PI_IFINDEX_CHANGED 898 * The phyint's interface index has changed. 899 * Ask the caller to delete and recreate the phyint. 900 * PI_IOCTL_ERROR 901 * Some ioctl error. Don't change anything. 902 * PI_GROUP_CHANGED 903 * The phyint has changed group. 904 */ 905 int 906 phyint_inst_update_from_k(struct phyint_instance *pii) 907 { 908 struct lifreq lifr; 909 int ifsock; 910 struct phyint *pi; 911 912 pi = pii->pii_phyint; 913 914 if (debug & D_PHYINT) { 915 logdebug("phyint_inst_update_from_k(%s %s)\n", 916 AF_STR(pii->pii_af), pi->pi_name); 917 } 918 919 /* 920 * Get the ifindex from the kernel, for comparison with the 921 * value in our tables. 922 */ 923 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 924 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 925 926 ifsock = (pii->pii_af == AF_INET) ? 
ifsock_v4 : ifsock_v6; 927 if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) { 928 if (errno == ENXIO) { 929 return (PI_DELETED); 930 } else { 931 logperror_pii(pii, "phyint_inst_update_from_k:" 932 " ioctl (get lifindex)"); 933 return (PI_IOCTL_ERROR); 934 } 935 } 936 937 if (lifr.lifr_index != pi->pi_ifindex) { 938 /* 939 * The index has changed. Most likely the interface has 940 * been unplumbed and replumbed. Ask the caller to take 941 * appropriate action. 942 */ 943 if (debug & D_PHYINT) { 944 logdebug("phyint_inst_update_from_k:" 945 " old index %d new index %d\n", 946 pi->pi_ifindex, lifr.lifr_index); 947 } 948 return (PI_IFINDEX_CHANGED); 949 } 950 951 /* 952 * Get the group name from the kernel, for comparison with 953 * the value in our tables. 954 */ 955 if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) { 956 if (errno == ENXIO) { 957 return (PI_DELETED); 958 } else { 959 logperror_pii(pii, "phyint_inst_update_from_k:" 960 " ioctl (get groupname)"); 961 return (PI_IOCTL_ERROR); 962 } 963 } 964 965 /* 966 * If the phyint has changed group i.e. if the phyint group name 967 * returned by the kernel is different, ask the caller to delete 968 * and recreate the phyint in the right group 969 */ 970 if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) { 971 /* Groupname has changed */ 972 if (debug & D_PHYINT) { 973 logdebug("phyint_inst_update_from_k:" 974 " groupname change\n"); 975 } 976 return (PI_GROUP_CHANGED); 977 } 978 979 /* 980 * Get the current phyint flags from the kernel, and determine what 981 * flags have changed by comparing against our tables. Note that the 982 * IFF_INACTIVE processing in initifs() relies on this call to ensure 983 * that IFF_INACTIVE is really still set on the interface. 
984 */ 985 if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) { 986 if (errno == ENXIO) { 987 return (PI_DELETED); 988 } else { 989 logperror_pii(pii, "phyint_inst_update_from_k: " 990 " ioctl (get flags)"); 991 return (PI_IOCTL_ERROR); 992 } 993 } 994 995 pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags); 996 if (pi->pi_v4 != NULL) 997 pi->pi_v4->pii_flags = pi->pi_flags; 998 if (pi->pi_v6 != NULL) 999 pi->pi_v6->pii_flags = pi->pi_flags; 1000 1001 if (pi->pi_flags & IFF_FAILED) { 1002 /* 1003 * If we are in the running and full state, we have 1004 * completed failbacks successfully and we would have 1005 * expected IFF_FAILED to have been clear. That it is 1006 * set means there was a race condition. Some other 1007 * process turned on the IFF_FAILED flag. Since the 1008 * flag setting is not atomic, i.e. a get ioctl followed 1009 * by a set ioctl, and since there is no way to set an 1010 * individual flag bit, this could have occurred. 1011 */ 1012 if (pi->pi_state == PI_RUNNING && pi->pi_full) 1013 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1014 } else { 1015 /* 1016 * If we are in the failed state, there was a race. 1017 * we have completed failover successfully because our 1018 * state is failed and empty. Some other process turned 1019 * off the IFF_FAILED flag. Same comment as above 1020 */ 1021 if (pi->pi_state == PI_FAILED && pi->pi_empty) 1022 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1023 } 1024 1025 /* No change in phyint status */ 1026 return (PI_OK); 1027 } 1028 1029 /* 1030 * Delete the phyint. Remove it from the list of all phyints, and the 1031 * list of phyint group members. If the group becomes empty, delete the 1032 * group also. 1033 */ 1034 static void 1035 phyint_delete(struct phyint *pi) 1036 { 1037 struct phyint_group *pg = pi->pi_group; 1038 1039 if (debug & D_PHYINT) 1040 logdebug("phyint_delete(%s)\n", pi->pi_name); 1041 1042 /* Both IPv4 and IPv6 phyint instances must have been deleted. 
*/ 1043 assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL); 1044 1045 /* 1046 * The phyint must belong to a group. 1047 */ 1048 assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL); 1049 1050 /* The phyint must be in the list of all phyints */ 1051 assert(phyints == pi || pi->pi_prev != NULL); 1052 1053 /* Remove the phyint from the phyint group list */ 1054 pg->pg_sig++; 1055 (void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE); 1056 1057 if (pi->pi_pgprev == NULL) { 1058 /* Phyint is the 1st in the phyint group list */ 1059 pg->pg_phyint = pi->pi_pgnext; 1060 } else { 1061 pi->pi_pgprev->pi_pgnext = pi->pi_pgnext; 1062 } 1063 if (pi->pi_pgnext != NULL) 1064 pi->pi_pgnext->pi_pgprev = pi->pi_pgprev; 1065 pi->pi_pgnext = NULL; 1066 pi->pi_pgprev = NULL; 1067 1068 /* Remove the phyint from the global list of phyints */ 1069 if (pi->pi_prev == NULL) { 1070 /* Phyint is the 1st in the list */ 1071 phyints = pi->pi_next; 1072 } else { 1073 pi->pi_prev->pi_next = pi->pi_next; 1074 } 1075 if (pi->pi_next != NULL) 1076 pi->pi_next->pi_prev = pi->pi_prev; 1077 pi->pi_next = NULL; 1078 pi->pi_prev = NULL; 1079 1080 free(pi); 1081 1082 /* Delete the phyint_group if the last phyint has been deleted */ 1083 if (pg->pg_phyint == NULL) 1084 phyint_group_delete(pg); 1085 } 1086 1087 /* 1088 * Delete (unlink and free), the phyint instance. 1089 */ 1090 void 1091 phyint_inst_delete(struct phyint_instance *pii) 1092 { 1093 struct phyint *pi = pii->pii_phyint; 1094 1095 assert(pi != NULL); 1096 1097 if (debug & D_PHYINT) { 1098 logdebug("phyint_inst_delete(%s %s)\n", 1099 AF_STR(pii->pii_af), pi->pi_name); 1100 } 1101 1102 /* 1103 * If the phyint instance has associated probe targets 1104 * delete all the targets 1105 */ 1106 while (pii->pii_targets != NULL) 1107 target_delete(pii->pii_targets); 1108 1109 /* 1110 * Delete all the logints associated with this phyint 1111 * instance. 
1112 */ 1113 while (pii->pii_logint != NULL) 1114 logint_delete(pii->pii_logint); 1115 1116 /* 1117 * Close the socket used to send probes to targets from this phyint. 1118 */ 1119 if (pii->pii_probe_sock != -1) 1120 close_probe_socket(pii, _B_TRUE); 1121 1122 /* 1123 * Phyint instance must be in the list of all phyint instances. 1124 * Remove phyint instance from the global list of phyint instances. 1125 */ 1126 assert(phyint_instances == pii || pii->pii_prev != NULL); 1127 if (pii->pii_prev == NULL) { 1128 /* Phyint is the 1st in the list */ 1129 phyint_instances = pii->pii_next; 1130 } else { 1131 pii->pii_prev->pii_next = pii->pii_next; 1132 } 1133 if (pii->pii_next != NULL) 1134 pii->pii_next->pii_prev = pii->pii_prev; 1135 pii->pii_next = NULL; 1136 pii->pii_prev = NULL; 1137 1138 /* 1139 * Reset the phyint instance pointer in the phyint. 1140 * If this is the last phyint instance (being deleted) on this 1141 * phyint, then delete the phyint. 1142 */ 1143 if (pii->pii_af == AF_INET) 1144 pi->pi_v4 = NULL; 1145 else 1146 pi->pi_v6 = NULL; 1147 1148 if (pi->pi_v4 == NULL && pi->pi_v6 == NULL) 1149 phyint_delete(pi); 1150 1151 free(pii); 1152 } 1153 1154 static void 1155 phyint_inst_print(struct phyint_instance *pii) 1156 { 1157 struct logint *li; 1158 struct target *tg; 1159 char abuf[INET6_ADDRSTRLEN]; 1160 int most_recent; 1161 int i; 1162 1163 if (pii->pii_phyint == NULL) { 1164 logdebug("pii->pi_phyint NULL can't print\n"); 1165 return; 1166 } 1167 1168 logdebug("\nPhyint instance: %s %s index %u state %x flags %llx " 1169 "sock %x in_use %d empty %x full %x\n", 1170 AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex, 1171 pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock, 1172 pii->pii_in_use, pii->pii_phyint->pi_empty, 1173 pii->pii_phyint->pi_full); 1174 1175 for (li = pii->pii_logint; li != NULL; li = li->li_next) 1176 logint_print(li); 1177 1178 logdebug("\n"); 1179 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1180 
target_print(tg); 1181 1182 if (pii->pii_targets == NULL) 1183 logdebug("pi_targets NULL\n"); 1184 1185 if (pii->pii_target_next != NULL) { 1186 logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af), 1187 pr_addr(pii->pii_af, pii->pii_target_next->tg_address, 1188 abuf, sizeof (abuf))); 1189 } else { 1190 logdebug("pi_target_next NULL\n"); 1191 } 1192 1193 if (pii->pii_rtt_target_next != NULL) { 1194 logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af), 1195 pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address, 1196 abuf, sizeof (abuf))); 1197 } else { 1198 logdebug("pi_rtt_target_next NULL\n"); 1199 } 1200 1201 if (pii->pii_targets != NULL) { 1202 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 1203 1204 i = most_recent; 1205 do { 1206 if (pii->pii_probes[i].pr_target != NULL) { 1207 logdebug("#%d target %s ", i, 1208 pr_addr(pii->pii_af, 1209 pii->pii_probes[i].pr_target->tg_address, 1210 abuf, sizeof (abuf))); 1211 } else { 1212 logdebug("#%d target NULL ", i); 1213 } 1214 logdebug("time_sent %u status %d time_ack/lost %u\n", 1215 pii->pii_probes[i].pr_time_sent, 1216 pii->pii_probes[i].pr_status, 1217 pii->pii_probes[i].pr_time_lost); 1218 i = PROBE_INDEX_PREV(i); 1219 } while (i != most_recent); 1220 } 1221 } 1222 1223 /* 1224 * Lookup a logint based on the logical interface name, on the given 1225 * phyint instance. 
1226 */ 1227 static struct logint * 1228 logint_lookup(struct phyint_instance *pii, char *name) 1229 { 1230 struct logint *li; 1231 1232 if (debug & D_LOGINT) { 1233 logdebug("logint_lookup(%s, %s)\n", 1234 AF_STR(pii->pii_af), name); 1235 } 1236 1237 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 1238 if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0) 1239 break; 1240 } 1241 return (li); 1242 } 1243 1244 /* 1245 * Insert a logint at the head of the list of logints of the given 1246 * phyint instance 1247 */ 1248 static void 1249 logint_insert(struct phyint_instance *pii, struct logint *li) 1250 { 1251 li->li_next = pii->pii_logint; 1252 li->li_prev = NULL; 1253 if (pii->pii_logint != NULL) 1254 pii->pii_logint->li_prev = li; 1255 pii->pii_logint = li; 1256 li->li_phyint_inst = pii; 1257 } 1258 1259 /* 1260 * Create a new named logint, on the specified phyint instance. 1261 */ 1262 static struct logint * 1263 logint_create(struct phyint_instance *pii, char *name) 1264 { 1265 struct logint *li; 1266 1267 if (debug & D_LOGINT) { 1268 logdebug("logint_create(%s %s %s)\n", 1269 AF_STR(pii->pii_af), pii->pii_name, name); 1270 } 1271 1272 li = calloc(1, sizeof (struct logint)); 1273 if (li == NULL) { 1274 logperror("logint_create: calloc"); 1275 return (NULL); 1276 } 1277 1278 (void) strncpy(li->li_name, name, sizeof (li->li_name)); 1279 li->li_name[sizeof (li->li_name) - 1] = '\0'; 1280 logint_insert(pii, li); 1281 return (li); 1282 } 1283 1284 /* 1285 * Initialize the logint based on the data returned by the kernel. 
1286 */ 1287 void 1288 logint_init_from_k(struct phyint_instance *pii, char *li_name) 1289 { 1290 int ifsock; 1291 uint64_t flags; 1292 uint64_t saved_flags; 1293 struct logint *li; 1294 struct lifreq lifr; 1295 struct in6_addr test_subnet; 1296 struct in6_addr test_subnet_mask; 1297 struct in6_addr testaddr; 1298 int test_subnet_len; 1299 struct sockaddr_in6 *sin6; 1300 struct sockaddr_in *sin; 1301 char abuf[INET6_ADDRSTRLEN]; 1302 boolean_t ptp = _B_FALSE; 1303 struct in6_addr tgaddr; 1304 1305 if (debug & D_LOGINT) { 1306 logdebug("logint_init_from_k(%s %s)\n", 1307 AF_STR(pii->pii_af), li_name); 1308 } 1309 1310 /* Get the socket for doing ioctls */ 1311 ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6; 1312 1313 /* 1314 * Get the flags from the kernel. Also serves as a check whether 1315 * the logical still exists. If it doesn't exist, no need to proceed 1316 * any further. li_in_use will make the caller clean up the logint 1317 */ 1318 (void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name)); 1319 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 1320 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 1321 /* Interface may have vanished */ 1322 if (errno != ENXIO) { 1323 logperror_pii(pii, "logint_init_from_k: " 1324 "ioctl (get flags)"); 1325 } 1326 return; 1327 } 1328 1329 flags = lifr.lifr_flags; 1330 1331 /* 1332 * Verified the logint exists. Now lookup the logint in our tables. 1333 * If it does not exist, create a new logint. 1334 */ 1335 li = logint_lookup(pii, li_name); 1336 if (li == NULL) { 1337 li = logint_create(pii, li_name); 1338 if (li == NULL) { 1339 /* 1340 * Pretend the interface does not exist 1341 * in the kernel 1342 */ 1343 return; 1344 } 1345 } 1346 1347 /* 1348 * Update li->li_flags with the new flags, after saving the old 1349 * value. 
This is used later to check what flags has changed and 1350 * take any action 1351 */ 1352 saved_flags = li->li_flags; 1353 li->li_flags = flags; 1354 1355 /* 1356 * Get the address, prefix, prefixlength and update the logint. 1357 * Check if anything has changed. If the logint used for the 1358 * test address has changed, take suitable action. 1359 */ 1360 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 1361 /* Interface may have vanished */ 1362 if (errno != ENXIO) { 1363 logperror_li(li, "logint_init_from_k: (get addr)"); 1364 } 1365 goto error; 1366 } 1367 1368 if (pii->pii_af == AF_INET) { 1369 sin = (struct sockaddr_in *)&lifr.lifr_addr; 1370 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr); 1371 } else { 1372 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 1373 testaddr = sin6->sin6_addr; 1374 } 1375 1376 if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { 1377 ptp = _B_TRUE; 1378 if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) { 1379 if (errno != ENXIO) { 1380 logperror_li(li, "logint_init_from_k:" 1381 " (get dstaddr)"); 1382 } 1383 goto error; 1384 } 1385 if (pii->pii_af == AF_INET) { 1386 sin = (struct sockaddr_in *)&lifr.lifr_addr; 1387 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr); 1388 } else { 1389 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 1390 tgaddr = sin6->sin6_addr; 1391 } 1392 } else { 1393 if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { 1394 /* Interface may have vanished */ 1395 if (errno != ENXIO) { 1396 logperror_li(li, "logint_init_from_k:" 1397 " (get subnet)"); 1398 } 1399 goto error; 1400 } 1401 if (lifr.lifr_subnet.ss_family == AF_INET6) { 1402 sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; 1403 test_subnet = sin6->sin6_addr; 1404 test_subnet_len = lifr.lifr_addrlen; 1405 } else { 1406 sin = (struct sockaddr_in *)&lifr.lifr_subnet; 1407 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); 1408 test_subnet_len = lifr.lifr_addrlen + 1409 (IPV6_ABITS - IP_ABITS); 1410 } 1411 (void) 
ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask); 1412 } 1413 1414 /* 1415 * Also record the OINDEX for completeness. This information is 1416 * not used. 1417 */ 1418 if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) { 1419 if (errno != ENXIO) { 1420 logperror_li(li, "logint_init_from_k:" 1421 " (get lifoindex)"); 1422 } 1423 goto error; 1424 } 1425 1426 /* 1427 * If this is the logint corresponding to the test address used for 1428 * sending probes, then if anything significant has changed we need to 1429 * determine the test address again. We ignore changes to the 1430 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of 1431 * course. 1432 */ 1433 if (pii->pii_probe_logint == li) { 1434 if (((li->li_flags ^ saved_flags) & 1435 ~(IFF_FAILED | IFF_RUNNING)) != 0 || 1436 !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) || 1437 (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet, 1438 &li->li_subnet)) || 1439 (!ptp && test_subnet_len != li->li_subnet_len) || 1440 (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) { 1441 /* 1442 * Something significant that affects the testaddress 1443 * has changed. Redo the testaddress selection later on 1444 * in select_test_ifs(). For now do the cleanup and 1445 * set pii_probe_logint to NULL. 1446 */ 1447 if (pii->pii_probe_sock != -1) 1448 close_probe_socket(pii, _B_TRUE); 1449 pii->pii_probe_logint = NULL; 1450 } 1451 } 1452 1453 1454 /* Update the logint with the values obtained from the kernel. */ 1455 li->li_addr = testaddr; 1456 li->li_in_use = 1; 1457 li->li_oifindex = lifr.lifr_index; 1458 if (ptp) { 1459 li->li_dstaddr = tgaddr; 1460 li->li_subnet_len = (pii->pii_af == AF_INET) ? 
1461 IP_ABITS : IPV6_ABITS; 1462 } else { 1463 li->li_subnet = test_subnet; 1464 li->li_subnet_len = test_subnet_len; 1465 } 1466 1467 if (debug & D_LOGINT) 1468 logint_print(li); 1469 1470 return; 1471 1472 error: 1473 logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n", 1474 AF_STR(pii->pii_af), pii->pii_name, li->li_name, 1475 pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf))); 1476 logint_delete(li); 1477 } 1478 1479 /* 1480 * Delete (unlink and free) a logint. 1481 */ 1482 void 1483 logint_delete(struct logint *li) 1484 { 1485 struct phyint_instance *pii; 1486 1487 pii = li->li_phyint_inst; 1488 assert(pii != NULL); 1489 1490 if (debug & D_LOGINT) { 1491 int af; 1492 char abuf[INET6_ADDRSTRLEN]; 1493 1494 af = pii->pii_af; 1495 logdebug("logint_delete(%s %s %s/%u)\n", 1496 AF_STR(af), li->li_name, 1497 pr_addr(af, li->li_addr, abuf, sizeof (abuf)), 1498 li->li_subnet_len); 1499 } 1500 1501 /* logint must be in the list of logints */ 1502 assert(pii->pii_logint == li || li->li_prev != NULL); 1503 1504 /* Remove the logint from the list of logints */ 1505 if (li->li_prev == NULL) { 1506 /* logint is the 1st in the list */ 1507 pii->pii_logint = li->li_next; 1508 } else { 1509 li->li_prev->li_next = li->li_next; 1510 } 1511 if (li->li_next != NULL) 1512 li->li_next->li_prev = li->li_prev; 1513 li->li_next = NULL; 1514 li->li_prev = NULL; 1515 1516 /* 1517 * If this logint is also being used for probing, then close the 1518 * associated socket, if it exists. 
1519 */ 1520 if (pii->pii_probe_logint == li) { 1521 if (pii->pii_probe_sock != -1) 1522 close_probe_socket(pii, _B_TRUE); 1523 pii->pii_probe_logint = NULL; 1524 } 1525 1526 free(li); 1527 } 1528 1529 static void 1530 logint_print(struct logint *li) 1531 { 1532 char abuf[INET6_ADDRSTRLEN]; 1533 int af; 1534 1535 af = li->li_phyint_inst->pii_af; 1536 1537 logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name, 1538 pr_addr(af, li->li_addr, abuf, sizeof (abuf)), li->li_subnet_len); 1539 1540 logdebug("\tFlags: %llx in_use %d oifindex %d\n", 1541 li->li_flags, li->li_in_use, li->li_oifindex); 1542 } 1543 1544 char * 1545 pr_addr(int af, struct in6_addr addr, char *abuf, int len) 1546 { 1547 struct in_addr addr_v4; 1548 1549 if (af == AF_INET) { 1550 IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4); 1551 (void) inet_ntop(AF_INET, (void *)&addr_v4, abuf, len); 1552 } else { 1553 (void) inet_ntop(AF_INET6, (void *)&addr, abuf, len); 1554 } 1555 return (abuf); 1556 } 1557 1558 /* Lookup target on its address */ 1559 struct target * 1560 target_lookup(struct phyint_instance *pii, struct in6_addr addr) 1561 { 1562 struct target *tg; 1563 1564 if (debug & D_TARGET) { 1565 char abuf[INET6_ADDRSTRLEN]; 1566 1567 logdebug("target_lookup(%s %s): addr %s\n", 1568 AF_STR(pii->pii_af), pii->pii_name, 1569 pr_addr(pii->pii_af, addr, abuf, sizeof (abuf))); 1570 } 1571 1572 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1573 if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr)) 1574 break; 1575 } 1576 return (tg); 1577 } 1578 1579 /* 1580 * Find and return the next active target, for the next probe. 1581 * If no active targets are available, return NULL. 1582 */ 1583 struct target * 1584 target_next(struct target *tg) 1585 { 1586 struct phyint_instance *pii = tg->tg_phyint_inst; 1587 struct target *marker = tg; 1588 hrtime_t now; 1589 1590 now = gethrtime(); 1591 1592 /* 1593 * Target must be in the list of targets for this phyint 1594 * instance. 
1595 */ 1596 assert(pii->pii_targets == tg || tg->tg_prev != NULL); 1597 assert(pii->pii_targets != NULL); 1598 1599 /* Return the next active target */ 1600 do { 1601 /* 1602 * Go to the next target. If we hit the end, 1603 * reset the ptr to the head 1604 */ 1605 tg = tg->tg_next; 1606 if (tg == NULL) 1607 tg = pii->pii_targets; 1608 1609 assert(TG_STATUS_VALID(tg->tg_status)); 1610 1611 switch (tg->tg_status) { 1612 case TG_ACTIVE: 1613 return (tg); 1614 1615 case TG_UNUSED: 1616 assert(pii->pii_targets_are_routers); 1617 if (pii->pii_ntargets < MAX_PROBE_TARGETS) { 1618 /* 1619 * Bubble up the unused target to active 1620 */ 1621 tg->tg_status = TG_ACTIVE; 1622 pii->pii_ntargets++; 1623 return (tg); 1624 } 1625 break; 1626 1627 case TG_SLOW: 1628 assert(pii->pii_targets_are_routers); 1629 if (tg->tg_latime + MIN_RECOVERY_TIME < now) { 1630 /* 1631 * Bubble up the slow target to unused 1632 */ 1633 tg->tg_status = TG_UNUSED; 1634 } 1635 break; 1636 1637 case TG_DEAD: 1638 assert(pii->pii_targets_are_routers); 1639 if (tg->tg_latime + MIN_RECOVERY_TIME < now) { 1640 /* 1641 * Bubble up the dead target to slow 1642 */ 1643 tg->tg_status = TG_SLOW; 1644 tg->tg_latime = now; 1645 } 1646 break; 1647 } 1648 1649 } while (tg != marker); 1650 1651 return (NULL); 1652 } 1653 1654 /* 1655 * Select the best available target, that is not already TG_ACTIVE, 1656 * for the caller. The caller will determine whether it wants to 1657 * make the returned target TG_ACTIVE. 1658 * The selection order is as follows. 1659 * 1. pick a TG_UNSED target, if it exists. 1660 * 2. else pick a TG_SLOW target that has recovered, if it exists 1661 * 3. else pick any TG_SLOW target, if it exists 1662 * 4. else pick a TG_DEAD target that has recovered, if it exists 1663 * 5. else pick any TG_DEAD target, if it exists 1664 * 6. 
else return null
 */
/*
 * While scanning, a slow target whose tg_latime is more than
 * MIN_RECOVERY_TIME in the past is promoted to TG_UNUSED, and such a dead
 * target is promoted to TG_SLOW with tg_latime reset to the current time.
 * When several targets fall in the same category, the one furthest down the
 * list wins, except that the very first TG_UNUSED target is returned at once.
 */
static struct target *
target_select_best(struct phyint_instance *pii)
{
	struct target *cand;
	struct target *any_slow = NULL;
	struct target *any_dead = NULL;
	struct target *recovered_slow = NULL;
	struct target *recovered_dead = NULL;
	hrtime_t now = gethrtime();

	for (cand = pii->pii_targets; cand != NULL; cand = cand->tg_next) {
		assert(TG_STATUS_VALID(cand->tg_status));

		if (cand->tg_status == TG_UNUSED) {
			/* Nothing beats an unused target. */
			return (cand);
		}

		if (cand->tg_status == TG_SLOW) {
			if (cand->tg_latime + MIN_RECOVERY_TIME < now) {
				/* Recovered: slow moves up to unused. */
				cand->tg_status = TG_UNUSED;
				recovered_slow = cand;
			} else {
				any_slow = cand;
			}
		} else if (cand->tg_status == TG_DEAD) {
			if (cand->tg_latime + MIN_RECOVERY_TIME < now) {
				/* Recovered: dead moves up to slow. */
				cand->tg_status = TG_SLOW;
				cand->tg_latime = now;
				recovered_dead = cand;
			} else {
				any_dead = cand;
			}
		}
		/* Targets in any other state are not candidates. */
	}

	/* Fall through the preference order; NULL if nothing was found. */
	if (recovered_slow != NULL)
		return (recovered_slow);
	if (any_slow != NULL)
		return (any_slow);
	if (recovered_dead != NULL)
		return (recovered_dead);
	return (any_dead);
}

/*
 * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
 * that are active, pick the next best below.
1728 */ 1729 static void 1730 target_activate_all(struct phyint_instance *pii) 1731 { 1732 struct target *tg; 1733 1734 assert(pii->pii_ntargets == 0); 1735 assert(pii->pii_target_next == NULL); 1736 assert(pii->pii_rtt_target_next == NULL); 1737 assert(pii->pii_targets_are_routers); 1738 1739 while (pii->pii_ntargets < MIN_PROBE_TARGETS) { 1740 tg = target_select_best(pii); 1741 if (tg == NULL) { 1742 /* We are out of targets */ 1743 return; 1744 } 1745 1746 assert(TG_STATUS_VALID(tg->tg_status)); 1747 assert(tg->tg_status != TG_ACTIVE); 1748 tg->tg_status = TG_ACTIVE; 1749 pii->pii_ntargets++; 1750 if (pii->pii_target_next == NULL) { 1751 pii->pii_target_next = tg; 1752 pii->pii_rtt_target_next = tg; 1753 } 1754 } 1755 } 1756 1757 static struct target * 1758 target_first(struct phyint_instance *pii) 1759 { 1760 struct target *tg; 1761 1762 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1763 assert(TG_STATUS_VALID(tg->tg_status)); 1764 if (tg->tg_status == TG_ACTIVE) 1765 break; 1766 } 1767 1768 return (tg); 1769 } 1770 1771 /* 1772 * Create a default target entry. 1773 */ 1774 void 1775 target_create(struct phyint_instance *pii, struct in6_addr addr, 1776 boolean_t is_router) 1777 { 1778 struct target *tg; 1779 struct phyint *pi; 1780 struct logint *li; 1781 1782 if (debug & D_TARGET) { 1783 char abuf[INET6_ADDRSTRLEN]; 1784 1785 logdebug("target_create(%s %s, %s)\n", 1786 AF_STR(pii->pii_af), pii->pii_name, 1787 pr_addr(pii->pii_af, addr, abuf, sizeof (abuf))); 1788 } 1789 1790 /* 1791 * If the test address is not yet initialized, do not add 1792 * any target, since we cannot determine whether the target 1793 * belongs to the same subnet as the test address. 1794 */ 1795 li = pii->pii_probe_logint; 1796 if (li == NULL) 1797 return; 1798 1799 /* 1800 * If there are multiple subnets associated with an interface, then 1801 * add the target to this phyint instance, only if it belongs to the 1802 * same subnet as the test address. 
The reason is that interface 1803 * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER 1804 * addresses, will disappear after failover, and the targets will not 1805 * be reachable from this interface. 1806 */ 1807 if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len)) 1808 return; 1809 1810 if (pii->pii_targets != NULL) { 1811 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1812 if (is_router) { 1813 if (!pii->pii_targets_are_routers) { 1814 /* 1815 * Prefer router over hosts. Using hosts is a 1816 * fallback mechanism, hence delete all host 1817 * targets. 1818 */ 1819 while (pii->pii_targets != NULL) 1820 target_delete(pii->pii_targets); 1821 } 1822 } else { 1823 /* 1824 * Routers take precedence over hosts. If this 1825 * is a router list and we are trying to add a 1826 * host, just return. If this is a host list 1827 * and if we have sufficient targets, just return 1828 */ 1829 if (pii->pii_targets_are_routers || 1830 pii->pii_ntargets == MAX_PROBE_TARGETS) 1831 return; 1832 } 1833 } 1834 1835 tg = calloc(1, sizeof (struct target)); 1836 if (tg == NULL) { 1837 logperror("target_create: calloc"); 1838 return; 1839 } 1840 1841 tg->tg_phyint_inst = pii; 1842 tg->tg_address = addr; 1843 tg->tg_in_use = 1; 1844 tg->tg_rtt_sa = -1; 1845 tg->tg_num_deferred = 0; 1846 1847 /* 1848 * If this is the first target, set 'pii_targets_are_routers' 1849 * The list of targets is either a list of hosts or list or 1850 * routers, but not a mix. 1851 */ 1852 if (pii->pii_targets == NULL) { 1853 assert(pii->pii_ntargets == 0); 1854 assert(pii->pii_target_next == NULL); 1855 assert(pii->pii_rtt_target_next == NULL); 1856 pii->pii_targets_are_routers = is_router ? 
1 : 0; 1857 } 1858 1859 if (pii->pii_ntargets == MAX_PROBE_TARGETS) { 1860 assert(pii->pii_targets_are_routers); 1861 assert(pii->pii_target_next != NULL); 1862 assert(pii->pii_rtt_target_next != NULL); 1863 tg->tg_status = TG_UNUSED; 1864 } else { 1865 if (pii->pii_ntargets == 0) { 1866 assert(pii->pii_target_next == NULL); 1867 pii->pii_target_next = tg; 1868 pii->pii_rtt_target_next = tg; 1869 } 1870 pii->pii_ntargets++; 1871 tg->tg_status = TG_ACTIVE; 1872 } 1873 1874 target_insert(pii, tg); 1875 1876 /* 1877 * Change state to PI_RUNNING if this phyint instance is capable of 1878 * sending and receiving probes -- that is, if we know of at least 1 1879 * target, and this phyint instance is probe-capable. For more 1880 * details, see the phyint state diagram in mpd_probe.c. 1881 */ 1882 pi = pii->pii_phyint; 1883 if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) { 1884 if (pi->pi_flags & IFF_FAILED) 1885 phyint_chstate(pi, PI_FAILED); 1886 else 1887 phyint_chstate(pi, PI_RUNNING); 1888 } 1889 } 1890 1891 /* 1892 * Add the target address named by `addr' to phyint instance `pii' if it does 1893 * not already exist. If the target is a router, `is_router' should be set to 1894 * B_TRUE. 1895 */ 1896 void 1897 target_add(struct phyint_instance *pii, struct in6_addr addr, 1898 boolean_t is_router) 1899 { 1900 struct target *tg; 1901 1902 if (pii == NULL) 1903 return; 1904 1905 tg = target_lookup(pii, addr); 1906 1907 /* 1908 * If the target does not exist, create it; target_create() will set 1909 * tg_in_use to true. 
If it exists already, and it is a router 1910 * target, set tg_in_use to to true, so that init_router_targets() 1911 * won't delete it 1912 */ 1913 if (tg == NULL) 1914 target_create(pii, addr, is_router); 1915 else if (is_router) 1916 tg->tg_in_use = 1; 1917 } 1918 1919 /* 1920 * Insert target at head of linked list of targets for the associated 1921 * phyint instance 1922 */ 1923 static void 1924 target_insert(struct phyint_instance *pii, struct target *tg) 1925 { 1926 tg->tg_next = pii->pii_targets; 1927 tg->tg_prev = NULL; 1928 if (tg->tg_next != NULL) 1929 tg->tg_next->tg_prev = tg; 1930 pii->pii_targets = tg; 1931 } 1932 1933 /* 1934 * Delete a target (unlink and free). 1935 */ 1936 void 1937 target_delete(struct target *tg) 1938 { 1939 int af; 1940 struct phyint_instance *pii; 1941 struct phyint_instance *pii_other; 1942 1943 pii = tg->tg_phyint_inst; 1944 af = pii->pii_af; 1945 1946 if (debug & D_TARGET) { 1947 char abuf[INET6_ADDRSTRLEN]; 1948 1949 logdebug("target_delete(%s %s, %s)\n", 1950 AF_STR(af), pii->pii_name, 1951 pr_addr(af, tg->tg_address, abuf, sizeof (abuf))); 1952 } 1953 1954 /* 1955 * Target must be in the list of targets for this phyint 1956 * instance. 1957 */ 1958 assert(pii->pii_targets == tg || tg->tg_prev != NULL); 1959 1960 /* 1961 * Reset all references to 'tg' in the probe information 1962 * for this phyint. 1963 */ 1964 reset_pii_probes(pii, tg); 1965 1966 /* 1967 * Remove this target from the list of targets of this 1968 * phyint instance. 1969 */ 1970 if (tg->tg_prev == NULL) { 1971 pii->pii_targets = tg->tg_next; 1972 } else { 1973 tg->tg_prev->tg_next = tg->tg_next; 1974 } 1975 1976 if (tg->tg_next != NULL) 1977 tg->tg_next->tg_prev = tg->tg_prev; 1978 1979 tg->tg_next = NULL; 1980 tg->tg_prev = NULL; 1981 1982 if (tg->tg_status == TG_ACTIVE) 1983 pii->pii_ntargets--; 1984 1985 /* 1986 * Adjust the next target to probe, if it points to 1987 * to the currently deleted target. 
1988 */ 1989 if (pii->pii_target_next == tg) 1990 pii->pii_target_next = target_first(pii); 1991 1992 if (pii->pii_rtt_target_next == tg) 1993 pii->pii_rtt_target_next = target_first(pii); 1994 1995 free(tg); 1996 1997 /* 1998 * The number of active targets pii_ntargets == 0 iff 1999 * the next active target pii->pii_target_next == NULL 2000 */ 2001 if (pii->pii_ntargets != 0) { 2002 assert(pii->pii_target_next != NULL); 2003 assert(pii->pii_rtt_target_next != NULL); 2004 assert(pii->pii_target_next->tg_status == TG_ACTIVE); 2005 assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE); 2006 return; 2007 } 2008 2009 /* At this point, we don't have any active targets. */ 2010 assert(pii->pii_target_next == NULL); 2011 assert(pii->pii_rtt_target_next == NULL); 2012 2013 if (pii->pii_targets_are_routers) { 2014 /* 2015 * Activate any TG_SLOW or TG_DEAD router targets, 2016 * since we don't have any other targets 2017 */ 2018 target_activate_all(pii); 2019 2020 if (pii->pii_ntargets != 0) { 2021 assert(pii->pii_target_next != NULL); 2022 assert(pii->pii_rtt_target_next != NULL); 2023 assert(pii->pii_target_next->tg_status == TG_ACTIVE); 2024 assert(pii->pii_rtt_target_next->tg_status == 2025 TG_ACTIVE); 2026 return; 2027 } 2028 } 2029 2030 /* 2031 * If we still don't have any active targets, the list must 2032 * must be really empty. There aren't even TG_SLOW or TG_DEAD 2033 * targets. Zero out the probe stats since it will not be 2034 * relevant any longer. 2035 */ 2036 assert(pii->pii_targets == NULL); 2037 clear_pii_probe_stats(pii); 2038 pii_other = phyint_inst_other(pii); 2039 2040 /* 2041 * If there are no targets on both instances and the interface is 2042 * online, go back to PI_NOTARGETS state, since we cannot probe this 2043 * phyint any more. For more details, please see phyint state 2044 * diagram in mpd_probe.c. 
2045 */ 2046 if (!PROBE_CAPABLE(pii_other) && 2047 pii->pii_phyint->pi_state != PI_OFFLINE) 2048 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 2049 } 2050 2051 /* 2052 * Flush the target list of every phyint in the group, if the list 2053 * is a host target list. This is called if group failure is suspected. 2054 * If all targets have failed, multicast will subsequently discover new 2055 * targets. Else it is a group failure. 2056 * Note: This function is a no-op if the list is a router target list. 2057 */ 2058 static void 2059 target_flush_hosts(struct phyint_group *pg) 2060 { 2061 struct phyint *pi; 2062 struct phyint_instance *pii; 2063 2064 if (debug & D_TARGET) 2065 logdebug("target_flush_hosts(%s)\n", pg->pg_name); 2066 2067 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 2068 pii = pi->pi_v4; 2069 if (pii != NULL && !pii->pii_targets_are_routers) { 2070 /* 2071 * Delete all the targets. When the list becomes 2072 * empty, target_delete() will set pii->pii_targets 2073 * to NULL. 2074 */ 2075 while (pii->pii_targets != NULL) 2076 target_delete(pii->pii_targets); 2077 } 2078 pii = pi->pi_v6; 2079 if (pii != NULL && !pii->pii_targets_are_routers) { 2080 /* 2081 * Delete all the targets. When the list becomes 2082 * empty, target_delete() will set pii->pii_targets 2083 * to NULL. 2084 */ 2085 while (pii->pii_targets != NULL) 2086 target_delete(pii->pii_targets); 2087 } 2088 } 2089 } 2090 2091 /* 2092 * Reset all references to 'target' in the probe info, as this target is 2093 * being deleted. The pr_target field is guaranteed to be non-null if 2094 * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that 2095 * pr_target will not be accessed unconditionally. 
2096 */ 2097 static void 2098 reset_pii_probes(struct phyint_instance *pii, struct target *tg) 2099 { 2100 int i; 2101 2102 for (i = 0; i < PROBE_STATS_COUNT; i++) { 2103 if (pii->pii_probes[i].pr_target == tg) { 2104 pii->pii_probes[i].pr_target = NULL; 2105 if (pii->pii_probes[i].pr_status == PR_UNACKED) 2106 pii->pii_probes[i].pr_status = PR_LOST; 2107 } 2108 } 2109 2110 } 2111 2112 /* 2113 * Clear the probe statistics array. 2114 */ 2115 void 2116 clear_pii_probe_stats(struct phyint_instance *pii) 2117 { 2118 bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT); 2119 /* Reset the next probe index in the probe stats array */ 2120 pii->pii_probe_next = 0; 2121 } 2122 2123 static void 2124 target_print(struct target *tg) 2125 { 2126 char abuf[INET6_ADDRSTRLEN]; 2127 char buf[128]; 2128 char buf2[128]; 2129 int af; 2130 int i; 2131 2132 af = tg->tg_phyint_inst->pii_af; 2133 2134 logdebug("Target on %s %s addr %s\n" 2135 "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n", 2136 AF_STR(af), tg->tg_phyint_inst->pii_name, 2137 pr_addr(af, tg->tg_address, abuf, sizeof (abuf)), 2138 tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd, 2139 tg->tg_crtt, tg->tg_in_use); 2140 2141 buf[0] = '\0'; 2142 for (i = 0; i < tg->tg_num_deferred; i++) { 2143 (void) snprintf(buf2, sizeof (buf2), " %dms", 2144 tg->tg_deferred[i]); 2145 (void) strlcat(buf, buf2, sizeof (buf)); 2146 } 2147 logdebug("deferred rtts:%s\n", buf); 2148 } 2149 2150 void 2151 phyint_inst_print_all(void) 2152 { 2153 struct phyint_instance *pii; 2154 2155 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2156 phyint_inst_print(pii); 2157 } 2158 } 2159 2160 /* 2161 * Convert length for a mask to the mask. 
2162 */ 2163 static void 2164 ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask) 2165 { 2166 int j; 2167 2168 assert(masklen <= IPV6_ABITS); 2169 bzero((char *)bitmask, sizeof (*bitmask)); 2170 2171 /* Make the 'masklen' leftmost bits one */ 2172 for (j = 0; masklen > 8; masklen -= 8, j++) 2173 bitmask->s6_addr[j] = 0xff; 2174 2175 bitmask->s6_addr[j] = 0xff << (8 - masklen); 2176 2177 } 2178 2179 /* 2180 * Compare two prefixes that have the same prefix length. 2181 * Fails if the prefix length is unreasonable. 2182 */ 2183 static boolean_t 2184 prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) 2185 { 2186 uchar_t mask; 2187 int j; 2188 2189 if (prefix_len < 0 || prefix_len > IPV6_ABITS) 2190 return (_B_FALSE); 2191 2192 for (j = 0; prefix_len > 8; prefix_len -= 8, j++) 2193 if (p1.s6_addr[j] != p2.s6_addr[j]) 2194 return (_B_FALSE); 2195 2196 /* Make the N leftmost bits one */ 2197 mask = 0xff << (8 - prefix_len); 2198 if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask)) 2199 return (_B_FALSE); 2200 2201 return (_B_TRUE); 2202 } 2203 2204 /* 2205 * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both 2206 * IPv4 and IPv6 put together. 
The phyint with the least such number 2207 * will be used as the failover destination, if no standby interface is 2208 * available 2209 */ 2210 int 2211 logint_upcount(struct phyint *pi) 2212 { 2213 struct logint *li; 2214 struct phyint_instance *pii; 2215 int count = 0; 2216 2217 pii = pi->pi_v4; 2218 if (pii != NULL) { 2219 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 2220 if ((li->li_flags & 2221 (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { 2222 count++; 2223 } 2224 } 2225 } 2226 2227 pii = pi->pi_v6; 2228 if (pii != NULL) { 2229 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 2230 if ((li->li_flags & 2231 (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { 2232 count++; 2233 } 2234 } 2235 } 2236 2237 return (count); 2238 } 2239 2240 /* 2241 * Get the phyint instance with the other (IPv4 / IPv6) protocol 2242 */ 2243 struct phyint_instance * 2244 phyint_inst_other(struct phyint_instance *pii) 2245 { 2246 if (pii->pii_af == AF_INET) 2247 return (pii->pii_phyint->pi_v6); 2248 else 2249 return (pii->pii_phyint->pi_v4); 2250 } 2251 2252 /* 2253 * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'. 2254 * Before sending the event, it prepends the current version of the IPMP 2255 * sysevent API. Returns 0 on success, -1 on failure (in either case, 2256 * `nvl' is freed). 2257 */ 2258 static int 2259 post_event(const char *subclass, nvlist_t *nvl) 2260 { 2261 sysevent_id_t eid; 2262 2263 /* 2264 * Since sysevents don't work yet in non-global zones, there cannot 2265 * possibly be any consumers yet, so don't bother trying to generate 2266 * them. (Otherwise, we'll spew warnings.) 
2267 */ 2268 if (getzoneid() != GLOBAL_ZONEID) { 2269 nvlist_free(nvl); 2270 return (0); 2271 } 2272 2273 errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION, 2274 IPMP_EVENT_CUR_VERSION); 2275 if (errno != 0) { 2276 logerr("cannot create `%s' event: %s", subclass, 2277 strerror(errno)); 2278 goto failed; 2279 } 2280 2281 if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR, 2282 "in.mpathd", nvl, &eid) == -1) { 2283 logerr("cannot send `%s' event: %s\n", subclass, 2284 strerror(errno)); 2285 goto failed; 2286 } 2287 2288 nvlist_free(nvl); 2289 return (0); 2290 failed: 2291 nvlist_free(nvl); 2292 return (-1); 2293 } 2294 2295 /* 2296 * Return the external IPMP state associated with phyint `pi'. 2297 */ 2298 static ipmp_if_state_t 2299 ifstate(struct phyint *pi) 2300 { 2301 switch (pi->pi_state) { 2302 case PI_NOTARGETS: 2303 return (IPMP_IF_UNKNOWN); 2304 2305 case PI_OFFLINE: 2306 return (IPMP_IF_OFFLINE); 2307 2308 case PI_FAILED: 2309 return (IPMP_IF_FAILED); 2310 2311 case PI_RUNNING: 2312 return (IPMP_IF_OK); 2313 } 2314 2315 logerr("ifstate: unknown state %d; aborting\n", pi->pi_state); 2316 abort(); 2317 /* NOTREACHED */ 2318 } 2319 2320 /* 2321 * Return the external IPMP interface type associated with phyint `pi'. 2322 */ 2323 static ipmp_if_type_t 2324 iftype(struct phyint *pi) 2325 { 2326 if (pi->pi_flags & IFF_STANDBY) 2327 return (IPMP_IF_STANDBY); 2328 else 2329 return (IPMP_IF_NORMAL); 2330 } 2331 2332 /* 2333 * Return the external IPMP group state associated with phyint group `pg'. 2334 */ 2335 static ipmp_group_state_t 2336 groupstate(struct phyint_group *pg) 2337 { 2338 return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK); 2339 } 2340 2341 /* 2342 * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'. 2343 * Returns 0 on success, -1 on failure. 
2344 */ 2345 static int 2346 phyint_group_state_event(struct phyint_group *pg) 2347 { 2348 nvlist_t *nvl; 2349 2350 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2351 if (errno != 0) { 2352 logperror("cannot create `group state change' event"); 2353 return (-1); 2354 } 2355 2356 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2357 if (errno != 0) 2358 goto failed; 2359 2360 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2361 if (errno != 0) 2362 goto failed; 2363 2364 errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg)); 2365 if (errno != 0) 2366 goto failed; 2367 2368 return (post_event(ESC_IPMP_GROUP_STATE, nvl)); 2369 failed: 2370 logperror("cannot create `group state change' event"); 2371 nvlist_free(nvl); 2372 return (-1); 2373 } 2374 2375 /* 2376 * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group 2377 * `pg'. Returns 0 on success, -1 on failure. 2378 */ 2379 static int 2380 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op) 2381 { 2382 nvlist_t *nvl; 2383 2384 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2385 if (errno != 0) { 2386 logperror("cannot create `group change' event"); 2387 return (-1); 2388 } 2389 2390 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2391 if (errno != 0) 2392 goto failed; 2393 2394 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2395 if (errno != 0) 2396 goto failed; 2397 2398 errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE, 2399 phyint_grouplistsig); 2400 if (errno != 0) 2401 goto failed; 2402 2403 errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op); 2404 if (errno != 0) 2405 goto failed; 2406 2407 return (post_event(ESC_IPMP_GROUP_CHANGE, nvl)); 2408 failed: 2409 logperror("cannot create `group change' event"); 2410 nvlist_free(nvl); 2411 return (-1); 2412 } 2413 2414 /* 2415 * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in 2416 * group `pg'. 
Returns 0 on success, -1 on failure.
 */
static int
phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op)
{
	nvlist_t *attrs;

	if ((errno = nvlist_alloc(&attrs, NV_UNIQUE_NAME, 0)) != 0) {
		logperror("cannot create `group member change' event");
		return (-1);
	}

	/*
	 * Group identity first, then the operation, then the description
	 * of the interface.  The || chain stops at the first failing add,
	 * whose error code is left in errno for logperror().
	 */
	if ((errno = nvlist_add_string(attrs, IPMP_GROUP_NAME,
	    pg->pg_name)) != 0 ||
	    (errno = nvlist_add_uint64(attrs, IPMP_GROUP_SIGNATURE,
	    pg->pg_sig)) != 0 ||
	    (errno = nvlist_add_uint32(attrs, IPMP_IF_OPERATION,
	    op)) != 0 ||
	    (errno = nvlist_add_string(attrs, IPMP_IF_NAME,
	    pi->pi_name)) != 0 ||
	    (errno = nvlist_add_uint32(attrs, IPMP_IF_TYPE,
	    iftype(pi))) != 0 ||
	    (errno = nvlist_add_uint32(attrs, IPMP_IF_STATE,
	    ifstate(pi))) != 0) {
		logperror("cannot create `group member change' event");
		nvlist_free(attrs);
		return (-1);
	}

	/* post_event() takes over the list and frees it */
	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, attrs));
}

/*
 * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
 * Returns 0 on success, -1 on failure.
2465 */ 2466 static int 2467 phyint_state_event(struct phyint_group *pg, struct phyint *pi) 2468 { 2469 nvlist_t *nvl; 2470 2471 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2472 if (errno != 0) { 2473 logperror("cannot create `interface change' event"); 2474 return (-1); 2475 } 2476 2477 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2478 if (errno != 0) 2479 goto failed; 2480 2481 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2482 if (errno != 0) 2483 goto failed; 2484 2485 errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name); 2486 if (errno != 0) 2487 goto failed; 2488 2489 errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi)); 2490 if (errno != 0) 2491 goto failed; 2492 2493 errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi)); 2494 if (errno != 0) 2495 goto failed; 2496 2497 return (post_event(ESC_IPMP_IF_CHANGE, nvl)); 2498 failed: 2499 logperror("cannot create `interface change' event"); 2500 nvlist_free(nvl); 2501 return (-1); 2502 2503 } 2504 2505 /* 2506 * Generate a signature for use. The signature is conceptually divided 2507 * into two pieces: a random 16-bit "generation number" and a 48-bit 2508 * monotonically increasing integer. The generation number protects 2509 * against stale updates to entities (e.g., IPMP groups) that have been 2510 * deleted and since recreated. 2511 */ 2512 static uint64_t 2513 gensig(void) 2514 { 2515 static int seeded = 0; 2516 2517 if (seeded == 0) { 2518 srand48((long)gethrtime()); 2519 seeded++; 2520 } 2521 2522 return ((uint64_t)lrand48() << 48 | 1); 2523 } 2524 2525 /* 2526 * Store the information associated with group `grname' into a dynamically 2527 * allocated structure pointed to by `*grinfopp'. Returns an IPMP error code. 
2528 */ 2529 unsigned int 2530 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) 2531 { 2532 struct phyint_group *pg; 2533 struct phyint *pi; 2534 char (*ifs)[LIFNAMSIZ]; 2535 unsigned int nif, i; 2536 2537 pg = phyint_group_lookup(grname); 2538 if (pg == NULL) 2539 return (IPMP_EUNKGROUP); 2540 2541 /* 2542 * Tally up the number of interfaces, allocate an array to hold them, 2543 * and insert their names into the array. 2544 */ 2545 for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) 2546 nif++; 2547 2548 ifs = alloca(nif * sizeof (*ifs)); 2549 for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) { 2550 assert(i < nif); 2551 (void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ); 2552 } 2553 assert(i == nif); 2554 2555 *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, 2556 groupstate(pg), nif, ifs); 2557 return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2558 } 2559 2560 /* 2561 * Store the information associated with interface `ifname' into a dynamically 2562 * allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code. 2563 */ 2564 unsigned int 2565 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp) 2566 { 2567 struct phyint *pi; 2568 2569 pi = phyint_lookup(ifname); 2570 if (pi == NULL) 2571 return (IPMP_EUNKIF); 2572 2573 *ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name, 2574 ifstate(pi), iftype(pi)); 2575 return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2576 } 2577 2578 /* 2579 * Store the current list of IPMP groups into a dynamically allocated 2580 * structure pointed to by `*grlistpp'. Returns an IPMP error code. 2581 */ 2582 unsigned int 2583 getgrouplist(ipmp_grouplist_t **grlistpp) 2584 { 2585 struct phyint_group *pg; 2586 char (*groups)[LIFGRNAMSIZ]; 2587 unsigned int i, ngroup; 2588 2589 /* 2590 * Tally up the number of groups, allocate an array to hold them, and 2591 * insert their names into the array. 
2592 */ 2593 for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next) 2594 ngroup++; 2595 2596 groups = alloca(ngroup * sizeof (*groups)); 2597 for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) { 2598 assert(i < ngroup); 2599 (void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ); 2600 } 2601 assert(i == ngroup); 2602 2603 *grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups); 2604 return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2605 } 2606 2607 /* 2608 * Store a snapshot of the IPMP subsystem into a dynamically allocated 2609 * structure pointed to by `*snapp'. Returns an IPMP error code. 2610 */ 2611 unsigned int 2612 getsnap(ipmp_snap_t **snapp) 2613 { 2614 ipmp_grouplist_t *grlistp; 2615 ipmp_groupinfo_t *grinfop; 2616 ipmp_ifinfo_t *ifinfop; 2617 ipmp_snap_t *snap; 2618 struct phyint *pi; 2619 unsigned int i; 2620 int retval; 2621 2622 snap = ipmp_snap_create(); 2623 if (snap == NULL) 2624 return (IPMP_ENOMEM); 2625 2626 /* 2627 * Add group list. 2628 */ 2629 retval = getgrouplist(&snap->sn_grlistp); 2630 if (retval != IPMP_SUCCESS) { 2631 ipmp_snap_free(snap); 2632 return (retval); 2633 } 2634 2635 /* 2636 * Add information for each group in the list. 2637 */ 2638 grlistp = snap->sn_grlistp; 2639 for (i = 0; i < grlistp->gl_ngroup; i++) { 2640 retval = getgroupinfo(grlistp->gl_groups[i], &grinfop); 2641 if (retval != IPMP_SUCCESS) { 2642 ipmp_snap_free(snap); 2643 return (retval); 2644 } 2645 retval = ipmp_snap_addgroupinfo(snap, grinfop); 2646 if (retval != IPMP_SUCCESS) { 2647 ipmp_freegroupinfo(grinfop); 2648 ipmp_snap_free(snap); 2649 return (retval); 2650 } 2651 } 2652 2653 /* 2654 * Add information for each configured phyint. 
2655 */ 2656 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2657 retval = getifinfo(pi->pi_name, &ifinfop); 2658 if (retval != IPMP_SUCCESS) { 2659 ipmp_snap_free(snap); 2660 return (retval); 2661 } 2662 retval = ipmp_snap_addifinfo(snap, ifinfop); 2663 if (retval != IPMP_SUCCESS) { 2664 ipmp_freeifinfo(ifinfop); 2665 ipmp_snap_free(snap); 2666 return (retval); 2667 } 2668 } 2669 2670 *snapp = snap; 2671 return (IPMP_SUCCESS); 2672 } 2673