1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include "mpd_defs.h" 30 #include "mpd_tables.h" 31 32 /* 33 * Global list of phyints, phyint instances, phyint groups and the anonymous 34 * group; the latter is initialized in phyint_init(). 35 */ 36 struct phyint *phyints = NULL; 37 struct phyint_instance *phyint_instances = NULL; 38 struct phyint_group *phyint_groups = NULL; 39 struct phyint_group *phyint_anongroup; 40 41 /* 42 * Grouplist signature; initialized in phyint_init(). 
43 */ 44 static uint64_t phyint_grouplistsig; 45 46 static void phyint_inst_insert(struct phyint_instance *pii); 47 static void phyint_inst_print(struct phyint_instance *pii); 48 49 static void phyint_insert(struct phyint *pi, struct phyint_group *pg); 50 static void phyint_delete(struct phyint *pi); 51 52 static void phyint_group_insert(struct phyint_group *pg); 53 static void phyint_group_delete(struct phyint_group *pg); 54 static struct phyint_group *phyint_group_lookup(const char *pg_name); 55 static struct phyint_group *phyint_group_create(const char *pg_name); 56 57 static void logint_print(struct logint *li); 58 static void logint_insert(struct phyint_instance *pii, struct logint *li); 59 static struct logint *logint_lookup(struct phyint_instance *pii, char *li_name); 60 61 static void target_print(struct target *tg); 62 static void target_insert(struct phyint_instance *pii, struct target *tg); 63 static struct target *target_first(struct phyint_instance *pii); 64 static struct target *target_select_best(struct phyint_instance *pii); 65 static void target_flush_hosts(struct phyint_group *pg); 66 67 static void reset_pii_probes(struct phyint_instance *pii, struct target *tg); 68 69 static boolean_t phyint_inst_v6_sockinit(struct phyint_instance *pii); 70 static boolean_t phyint_inst_v4_sockinit(struct phyint_instance *pii); 71 72 static void ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask); 73 static boolean_t prefix_equal(struct in6_addr p1, struct in6_addr p2, 74 int prefix_len); 75 76 static int phyint_state_event(struct phyint_group *pg, struct phyint *pi); 77 static int phyint_group_state_event(struct phyint_group *pg); 78 static int phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t); 79 static int phyint_group_member_event(struct phyint_group *pg, struct phyint *pi, 80 ipmp_if_op_t op); 81 82 static uint64_t gensig(void); 83 84 /* Initialize any per-file global state. 
Returns 0 on success, -1 on failure */ 85 int 86 phyint_init(void) 87 { 88 phyint_grouplistsig = gensig(); 89 if (track_all_phyints) { 90 phyint_anongroup = phyint_group_create(""); 91 if (phyint_anongroup == NULL) 92 return (-1); 93 phyint_group_insert(phyint_anongroup); 94 } 95 return (0); 96 } 97 98 /* Return the phyint with the given name */ 99 struct phyint * 100 phyint_lookup(const char *name) 101 { 102 struct phyint *pi; 103 104 if (debug & D_PHYINT) 105 logdebug("phyint_lookup(%s)\n", name); 106 107 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 108 if (strncmp(pi->pi_name, name, sizeof (pi->pi_name)) == 0) 109 break; 110 } 111 return (pi); 112 } 113 114 /* Return the phyint instance with the given name and the given family */ 115 struct phyint_instance * 116 phyint_inst_lookup(int af, char *name) 117 { 118 struct phyint *pi; 119 120 if (debug & D_PHYINT) 121 logdebug("phyint_inst_lookup(%s %s)\n", AF_STR(af), name); 122 123 assert(af == AF_INET || af == AF_INET6); 124 125 pi = phyint_lookup(name); 126 if (pi == NULL) 127 return (NULL); 128 129 return (PHYINT_INSTANCE(pi, af)); 130 } 131 132 static struct phyint_group * 133 phyint_group_lookup(const char *pg_name) 134 { 135 struct phyint_group *pg; 136 137 if (debug & D_PHYINT) 138 logdebug("phyint_group_lookup(%s)\n", pg_name); 139 140 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 141 if (strncmp(pg->pg_name, pg_name, sizeof (pg->pg_name)) == 0) 142 break; 143 } 144 return (pg); 145 } 146 147 /* 148 * Insert the phyint in the linked list of all phyints. If the phyint belongs 149 * to some group, insert it in the phyint group list. 
150 */ 151 static void 152 phyint_insert(struct phyint *pi, struct phyint_group *pg) 153 { 154 if (debug & D_PHYINT) 155 logdebug("phyint_insert(%s '%s')\n", pi->pi_name, pg->pg_name); 156 157 /* Insert the phyint at the head of the 'all phyints' list */ 158 pi->pi_next = phyints; 159 pi->pi_prev = NULL; 160 if (phyints != NULL) 161 phyints->pi_prev = pi; 162 phyints = pi; 163 164 /* 165 * Insert the phyint at the head of the 'phyint_group members' list 166 * of the phyint group to which it belongs. 167 */ 168 pi->pi_pgnext = NULL; 169 pi->pi_pgprev = NULL; 170 pi->pi_group = pg; 171 172 pi->pi_pgnext = pg->pg_phyint; 173 if (pi->pi_pgnext != NULL) 174 pi->pi_pgnext->pi_pgprev = pi; 175 pg->pg_phyint = pi; 176 177 pg->pg_sig++; 178 (void) phyint_group_member_event(pg, pi, IPMP_IF_ADD); 179 } 180 181 /* Insert the phyint instance in the linked list of all phyint instances. */ 182 static void 183 phyint_inst_insert(struct phyint_instance *pii) 184 { 185 if (debug & D_PHYINT) { 186 logdebug("phyint_inst_insert(%s %s)\n", 187 AF_STR(pii->pii_af), pii->pii_name); 188 } 189 190 /* 191 * Insert the phyint at the head of the 'all phyint instances' list. 192 */ 193 pii->pii_next = phyint_instances; 194 pii->pii_prev = NULL; 195 if (phyint_instances != NULL) 196 phyint_instances->pii_prev = pii; 197 phyint_instances = pii; 198 } 199 200 /* 201 * Create a new phyint with the given parameters. Also insert it into 202 * the list of all phyints and the list of phyint group members by calling 203 * phyint_insert(). 204 */ 205 static struct phyint * 206 phyint_create(char *pi_name, struct phyint_group *pg, uint_t ifindex, 207 uint64_t flags) 208 { 209 struct phyint *pi; 210 211 pi = calloc(1, sizeof (struct phyint)); 212 if (pi == NULL) { 213 logperror("phyint_create: calloc"); 214 return (NULL); 215 } 216 217 /* 218 * Record the phyint values. Also insert the phyint into the 219 * phyint group by calling phyint_insert(). 
220 */ 221 (void) strncpy(pi->pi_name, pi_name, sizeof (pi->pi_name)); 222 pi->pi_name[sizeof (pi->pi_name) - 1] = '\0'; 223 pi->pi_ifindex = ifindex; 224 pi->pi_icmpid = 225 htons(((getpid() & 0xFF) << 8) | (pi->pi_ifindex & 0xFF)); 226 /* 227 * We optimistically start in the PI_RUNNING state. Later (in 228 * process_link_state_changes()), we will readjust this to match the 229 * current state of the link. Further, if test addresses are 230 * subsequently assigned, we will transition to PI_NOTARGETS and then 231 * either PI_RUNNING or PI_FAILED, depending on the result of the test 232 * probes. 233 */ 234 pi->pi_state = PI_RUNNING; 235 pi->pi_flags = PHYINT_FLAGS(flags); 236 /* 237 * Initialise the link state. The link state is initialised to 238 * up, so that if the link is down when IPMP starts monitoring 239 * the interface, it will appear as though there has been a 240 * transition from the link up to link down. This avoids 241 * having to treat this situation as a special case. 242 */ 243 INIT_LINK_STATE(pi); 244 245 /* 246 * Insert the phyint in the list of all phyints, and the 247 * list of phyint group members 248 */ 249 phyint_insert(pi, pg); 250 251 /* 252 * If we are joining a failed group, mark the interface as 253 * failed. 254 */ 255 if (GROUP_FAILED(pg)) 256 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 257 258 return (pi); 259 } 260 261 /* 262 * Create a new phyint instance belonging to the phyint 'pi' and address 263 * family 'af'. Also insert it into the list of all phyint instances by 264 * calling phyint_inst_insert(). 265 */ 266 static struct phyint_instance * 267 phyint_inst_create(struct phyint *pi, int af) 268 { 269 struct phyint_instance *pii; 270 271 pii = calloc(1, sizeof (struct phyint_instance)); 272 if (pii == NULL) { 273 logperror("phyint_inst_create: calloc"); 274 return (NULL); 275 } 276 277 /* 278 * Attach the phyint instance to the phyint. 
279 * Set the back pointers as well 280 */ 281 pii->pii_phyint = pi; 282 if (af == AF_INET) 283 pi->pi_v4 = pii; 284 else 285 pi->pi_v6 = pii; 286 287 pii->pii_in_use = 1; 288 pii->pii_probe_sock = -1; 289 pii->pii_snxt = 1; 290 pii->pii_af = af; 291 pii->pii_fd_hrtime = gethrtime() + 292 (FAILURE_DETECTION_QP * (hrtime_t)NANOSEC); 293 pii->pii_flags = pi->pi_flags; 294 295 /* Insert the phyint instance in the list of all phyint instances. */ 296 phyint_inst_insert(pii); 297 return (pii); 298 } 299 300 /* 301 * Change the state of phyint `pi' to state `state'. 302 */ 303 void 304 phyint_chstate(struct phyint *pi, enum pi_state state) 305 { 306 /* 307 * To simplify things, some callers always set a given state 308 * regardless of the previous state of the phyint (e.g., setting 309 * PI_RUNNING when it's already set). We shouldn't bother 310 * generating an event or consuming a signature for these, since 311 * the actual state of the interface is unchanged. 312 */ 313 if (pi->pi_state == state) 314 return; 315 316 pi->pi_state = state; 317 pi->pi_group->pg_sig++; 318 (void) phyint_state_event(pi->pi_group, pi); 319 } 320 321 /* 322 * Note that the type of phyint `pi' has changed. 323 */ 324 void 325 phyint_newtype(struct phyint *pi) 326 { 327 pi->pi_group->pg_sig++; 328 (void) phyint_state_event(pi->pi_group, pi); 329 } 330 331 /* 332 * Insert the phyint group in the linked list of all phyint groups 333 * at the head of the list 334 */ 335 static void 336 phyint_group_insert(struct phyint_group *pg) 337 { 338 pg->pg_next = phyint_groups; 339 pg->pg_prev = NULL; 340 if (phyint_groups != NULL) 341 phyint_groups->pg_prev = pg; 342 phyint_groups = pg; 343 344 phyint_grouplistsig++; 345 (void) phyint_group_change_event(pg, IPMP_GROUP_ADD); 346 } 347 348 /* 349 * Create a new phyint group called 'name'. 
350 */ 351 static struct phyint_group * 352 phyint_group_create(const char *name) 353 { 354 struct phyint_group *pg; 355 356 if (debug & D_PHYINT) 357 logdebug("phyint_group_create(%s)\n", name); 358 359 pg = calloc(1, sizeof (struct phyint_group)); 360 if (pg == NULL) { 361 logperror("phyint_group_create: calloc"); 362 return (NULL); 363 } 364 365 (void) strncpy(pg->pg_name, name, sizeof (pg->pg_name)); 366 pg->pg_name[sizeof (pg->pg_name) - 1] = '\0'; 367 pg->pg_sig = gensig(); 368 369 pg->pg_fdt = user_failure_detection_time; 370 pg->pg_probeint = user_probe_interval; 371 372 return (pg); 373 } 374 375 /* 376 * Change the state of the phyint group `pg' to state `state'. 377 */ 378 void 379 phyint_group_chstate(struct phyint_group *pg, enum pg_state state) 380 { 381 assert(pg != phyint_anongroup); 382 383 switch (state) { 384 case PG_FAILED: 385 pg->pg_groupfailed = 1; 386 387 /* 388 * We can never know with certainty that a group has 389 * failed. It is possible that all known targets have 390 * failed simultaneously, and new targets have come up 391 * instead. If the targets are routers then router 392 * discovery will kick in, and we will see the new routers 393 * thru routing socket messages. But if the targets are 394 * hosts, we have to discover it by multicast. So flush 395 * all the host targets. The next probe will send out a 396 * multicast echo request. If this is a group failure, we 397 * will still not see any response, otherwise we will 398 * clear the pg_groupfailed flag after we get 399 * NUM_PROBE_REPAIRS consecutive unicast replies on any 400 * phyint. 
401 */ 402 target_flush_hosts(pg); 403 break; 404 405 case PG_RUNNING: 406 pg->pg_groupfailed = 0; 407 break; 408 409 default: 410 logerr("phyint_group_chstate: invalid group state %d; " 411 "aborting\n", state); 412 abort(); 413 } 414 415 pg->pg_sig++; 416 (void) phyint_group_state_event(pg); 417 } 418 419 /* 420 * Create a new phyint instance and initialize it from the values supplied by 421 * the kernel. Always check for ENXIO before logging any error, because the 422 * interface could have vanished after completion of SIOCGLIFCONF. 423 * Return values: 424 * pointer to the phyint instance on success 425 * NULL on failure Eg. if the phyint instance is not found in the kernel 426 */ 427 struct phyint_instance * 428 phyint_inst_init_from_k(int af, char *pi_name) 429 { 430 char pg_name[LIFNAMSIZ + 1]; 431 int ifsock; 432 uint_t ifindex; 433 uint64_t flags; 434 struct lifreq lifr; 435 struct phyint *pi; 436 struct phyint_instance *pii; 437 boolean_t pg_created; 438 boolean_t pi_created; 439 struct phyint_group *pg; 440 441 retry: 442 pii = NULL; 443 pi = NULL; 444 pg = NULL; 445 pi_created = _B_FALSE; 446 pg_created = _B_FALSE; 447 448 if (debug & D_PHYINT) { 449 logdebug("phyint_inst_init_from_k(%s %s)\n", 450 AF_STR(af), pi_name); 451 } 452 453 assert(af == AF_INET || af == AF_INET6); 454 455 /* Get the socket for doing ioctls */ 456 ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 457 458 /* 459 * Get the interface flags. Ignore loopback and multipoint 460 * interfaces. 
461 */ 462 (void) strncpy(lifr.lifr_name, pi_name, sizeof (lifr.lifr_name)); 463 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 464 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 465 if (errno != ENXIO) { 466 logperror("phyint_inst_init_from_k:" 467 " ioctl (get flags)"); 468 } 469 return (NULL); 470 } 471 flags = lifr.lifr_flags; 472 if (!(flags & IFF_MULTICAST) || (flags & IFF_LOOPBACK)) 473 return (NULL); 474 475 /* 476 * Get the ifindex for recording later in our tables, in case we need 477 * to create a new phyint. 478 */ 479 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) { 480 if (errno != ENXIO) { 481 logperror("phyint_inst_init_from_k: " 482 " ioctl (get lifindex)"); 483 } 484 return (NULL); 485 } 486 ifindex = lifr.lifr_index; 487 488 /* 489 * Get the phyint group name of this phyint, from the kernel. 490 */ 491 if (ioctl(ifsock, SIOCGLIFGROUPNAME, (char *)&lifr) < 0) { 492 if (errno != ENXIO) { 493 logperror("phyint_inst_init_from_k: " 494 "ioctl (get group name)"); 495 } 496 return (NULL); 497 } 498 (void) strncpy(pg_name, lifr.lifr_groupname, sizeof (pg_name)); 499 pg_name[sizeof (pg_name) - 1] = '\0'; 500 501 /* 502 * If the phyint is not part of any group, pg_name is the 503 * null string. If 'track_all_phyints' is false, there is no 504 * need to create a phyint. 505 */ 506 if (pg_name[0] == '\0' && !track_all_phyints) { 507 /* 508 * If the IFF_FAILED or IFF_OFFLINE flags are set, reset 509 * them. These flags shouldn't be set if IPMP isn't 510 * tracking the interface. 511 */ 512 if ((flags & (IFF_FAILED | IFF_OFFLINE)) != 0) { 513 lifr.lifr_flags = flags & ~(IFF_FAILED | IFF_OFFLINE); 514 if (ioctl(ifsock, SIOCSLIFFLAGS, (char *)&lifr) < 0) { 515 if (errno != ENXIO) { 516 logperror("phyint_inst_init_from_k:" 517 " ioctl (set flags)"); 518 } 519 } 520 } 521 return (NULL); 522 } 523 524 /* 525 * We need to create a new phyint instance. A phyint instance 526 * belongs to a phyint, and the phyint belongs to a phyint group. 
527 * So we first lookup the 'parents' and if they don't exist then 528 * we create them. 529 */ 530 pg = phyint_group_lookup(pg_name); 531 if (pg == NULL) { 532 pg = phyint_group_create(pg_name); 533 if (pg == NULL) { 534 logerr("phyint_inst_init_from_k:" 535 " unable to create group %s\n", pg_name); 536 return (NULL); 537 } 538 phyint_group_insert(pg); 539 pg_created = _B_TRUE; 540 } 541 542 /* 543 * Lookup the phyint. If the phyint does not exist create it. 544 */ 545 pi = phyint_lookup(pi_name); 546 if (pi == NULL) { 547 pi = phyint_create(pi_name, pg, ifindex, flags); 548 if (pi == NULL) { 549 logerr("phyint_inst_init_from_k:" 550 " unable to create phyint %s\n", pi_name); 551 if (pg_created) 552 phyint_group_delete(pg); 553 return (NULL); 554 } 555 pi_created = _B_TRUE; 556 } else { 557 /* The phyint exists already. */ 558 assert(pi_created == _B_FALSE); 559 /* 560 * Normally we should see consistent values for the IPv4 and 561 * IPv6 instances, for phyint properties. If we don't, it 562 * means things have changed underneath us, and we should 563 * resync our tables with the kernel. Check whether the 564 * interface index has changed. If so, it is most likely 565 * the interface has been unplumbed and replumbed, 566 * while we are yet to update our tables. Do it now. 567 */ 568 if (pi->pi_ifindex != ifindex) { 569 if (pg_created) 570 phyint_group_delete(pg); 571 phyint_inst_delete(PHYINT_INSTANCE(pi, AF_OTHER(af))); 572 goto retry; 573 } 574 assert(PHYINT_INSTANCE(pi, af) == NULL); 575 576 /* 577 * If the group name seen by the IPv4 and IPv6 instances 578 * are different, it is most likely the groupname has 579 * changed, while we are yet to update our tables. Do it now. 
580 */ 581 if (strcmp(pi->pi_group->pg_name, pg_name) != 0) { 582 if (pg_created) 583 phyint_group_delete(pg); 584 restore_phyint(pi); 585 phyint_inst_delete(PHYINT_INSTANCE(pi, 586 AF_OTHER(af))); 587 goto retry; 588 } 589 } 590 591 /* 592 * Create a new phyint instance, corresponding to the 'af' 593 * passed in. 594 */ 595 pii = phyint_inst_create(pi, af); 596 if (pii == NULL) { 597 logerr("phyint_inst_init_from_k: unable to create" 598 "phyint inst %s\n", pi->pi_name); 599 if (pi_created) { 600 /* 601 * Deleting the phyint will delete the phyint group 602 * if this is the last phyint in the group. 603 */ 604 phyint_delete(pi); 605 } 606 return (NULL); 607 } 608 609 return (pii); 610 } 611 612 /* 613 * Bind the pii_probe_sock to the chosen IFF_NOFAILOVER address in 614 * pii_probe_logint. This socket will be used for sending and receiving 615 * ICMP/ICMPv6 probes to targets. Do the common part in this function, and 616 * complete the initializations by calling the protocol specific functions 617 * phyint_inst_v{4,6}_sockinit() respectively. 618 * 619 * Return values: _B_TRUE/_B_FALSE for success or failure respectively. 620 */ 621 boolean_t 622 phyint_inst_sockinit(struct phyint_instance *pii) 623 { 624 boolean_t success; 625 struct phyint_group *pg; 626 627 if (debug & D_PHYINT) { 628 logdebug("phyint_inst_sockinit(%s %s)\n", 629 AF_STR(pii->pii_af), pii->pii_name); 630 } 631 632 assert(pii->pii_probe_logint != NULL); 633 assert(pii->pii_probe_logint->li_flags & IFF_UP); 634 assert(SINGLETON_GROUP(pii->pii_phyint) || 635 (pii->pii_probe_logint->li_flags & IFF_NOFAILOVER)); 636 assert(pii->pii_af == AF_INET || pii->pii_af == AF_INET6); 637 638 /* 639 * If the socket is already bound, close pii_probe_sock 640 */ 641 if (pii->pii_probe_sock != -1) 642 close_probe_socket(pii, _B_TRUE); 643 644 /* 645 * If the phyint is not part of a named group and track_all_phyints is 646 * false, simply return. 
647 */ 648 pg = pii->pii_phyint->pi_group; 649 if (pg == phyint_anongroup && !track_all_phyints) { 650 if (debug & D_PHYINT) 651 logdebug("phyint_inst_sockinit: no group\n"); 652 return (_B_FALSE); 653 } 654 655 /* 656 * Initialize the socket by calling the protocol specific function. 657 * If it succeeds, add the socket to the poll list. 658 */ 659 if (pii->pii_af == AF_INET6) 660 success = phyint_inst_v6_sockinit(pii); 661 else 662 success = phyint_inst_v4_sockinit(pii); 663 664 if (success && (poll_add(pii->pii_probe_sock) == 0)) 665 return (_B_TRUE); 666 667 /* Something failed, cleanup and return false */ 668 if (pii->pii_probe_sock != -1) 669 close_probe_socket(pii, _B_FALSE); 670 671 return (_B_FALSE); 672 } 673 674 /* 675 * IPv6 specific part in initializing the pii_probe_sock. This socket is 676 * used to send/receive ICMPv6 probe packets. 677 */ 678 static boolean_t 679 phyint_inst_v6_sockinit(struct phyint_instance *pii) 680 { 681 icmp6_filter_t filter; 682 int hopcount = 1; 683 int int_op; 684 struct sockaddr_in6 testaddr; 685 686 /* 687 * Open a raw socket with ICMPv6 protocol. 688 * 689 * Use IPV6_DONTFAILOVER_IF to make sure that probes go out 690 * on the specified phyint only, and are not subject to load 691 * balancing. Bind to the src address chosen will ensure that 692 * the responses are received only on the specified phyint. 693 * 694 * Set the hopcount to 1 so that probe packets are not routed. 695 * Disable multicast loopback. Set the receive filter to 696 * receive only ICMPv6 echo replies. 
697 */ 698 pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMPV6); 699 if (pii->pii_probe_sock < 0) { 700 logperror_pii(pii, "phyint_inst_v6_sockinit: socket"); 701 return (_B_FALSE); 702 } 703 704 bzero(&testaddr, sizeof (testaddr)); 705 testaddr.sin6_family = AF_INET6; 706 testaddr.sin6_port = 0; 707 testaddr.sin6_addr = pii->pii_probe_logint->li_addr; 708 709 if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr, 710 sizeof (testaddr)) < 0) { 711 logperror_pii(pii, "phyint_inst_v6_sockinit: IPv6 bind"); 712 return (_B_FALSE); 713 } 714 715 /* 716 * IPV6_DONTFAILOVER_IF option takes precedence over setting 717 * IP_MULTICAST_IF. So we don't set IPV6_MULTICAST_IF again. 718 */ 719 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_DONTFAILOVER_IF, 720 (char *)&pii->pii_ifindex, sizeof (uint_t)) < 0) { 721 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 722 " IPV6_DONTFAILOVER_IF"); 723 return (_B_FALSE); 724 } 725 726 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_UNICAST_HOPS, 727 (char *)&hopcount, sizeof (hopcount)) < 0) { 728 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 729 " IPV6_UNICAST_HOPS"); 730 return (_B_FALSE); 731 } 732 733 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_HOPS, 734 (char *)&hopcount, sizeof (hopcount)) < 0) { 735 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 736 " IPV6_MULTICAST_HOPS"); 737 return (_B_FALSE); 738 } 739 740 int_op = 0; /* used to turn off option */ 741 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_MULTICAST_LOOP, 742 (char *)&int_op, sizeof (int_op)) < 0) { 743 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 744 " IPV6_MULTICAST_LOOP"); 745 return (_B_FALSE); 746 } 747 748 /* 749 * Filter out so that we only receive ICMP echo replies 750 */ 751 ICMP6_FILTER_SETBLOCKALL(&filter); 752 ICMP6_FILTER_SETPASS(ICMP6_ECHO_REPLY, &filter); 753 754 if (setsockopt(pii->pii_probe_sock, IPPROTO_ICMPV6, ICMP6_FILTER, 755 (char *)&filter, 
sizeof (filter)) < 0) { 756 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 757 " ICMP6_FILTER"); 758 return (_B_FALSE); 759 } 760 761 /* Enable receipt of ancillary data */ 762 int_op = 1; 763 if (setsockopt(pii->pii_probe_sock, IPPROTO_IPV6, IPV6_RECVHOPLIMIT, 764 (char *)&int_op, sizeof (int_op)) < 0) { 765 logperror_pii(pii, "phyint_inst_v6_sockinit: setsockopt" 766 " IPV6_RECVHOPLIMIT"); 767 return (_B_FALSE); 768 } 769 770 return (_B_TRUE); 771 } 772 773 /* 774 * IPv4 specific part in initializing the pii_probe_sock. This socket is 775 * used to send/receive ICMPv4 probe packets. 776 */ 777 static boolean_t 778 phyint_inst_v4_sockinit(struct phyint_instance *pii) 779 { 780 struct sockaddr_in testaddr; 781 char char_op; 782 int ttl = 1; 783 char char_ttl = 1; 784 785 /* 786 * Open a raw socket with ICMPv4 protocol. 787 * 788 * Use IP_DONTFAILOVER_IF to make sure that probes go out 789 * on the specified phyint only, and are not subject to load 790 * balancing. Bind to the src address chosen will ensure that 791 * the responses are received only on the specified phyint. 792 * 793 * Set the ttl to 1 so that probe packets are not routed. 794 * Disable multicast loopback. 795 */ 796 pii->pii_probe_sock = socket(pii->pii_af, SOCK_RAW, IPPROTO_ICMP); 797 if (pii->pii_probe_sock < 0) { 798 logperror_pii(pii, "phyint_inst_v4_sockinit: socket"); 799 return (_B_FALSE); 800 } 801 802 bzero(&testaddr, sizeof (testaddr)); 803 testaddr.sin_family = AF_INET; 804 testaddr.sin_port = 0; 805 IN6_V4MAPPED_TO_INADDR(&pii->pii_probe_logint->li_addr, 806 &testaddr.sin_addr); 807 808 if (bind(pii->pii_probe_sock, (struct sockaddr *)&testaddr, 809 sizeof (testaddr)) < 0) { 810 logperror_pii(pii, "phyint_inst_v4_sockinit: IPv4 bind"); 811 return (_B_FALSE); 812 } 813 814 /* 815 * IP_DONTFAILOVER_IF option takes precedence over setting 816 * IP_MULTICAST_IF. So we don't set IP_MULTICAST_IF again. 
817 */ 818 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_DONTFAILOVER_IF, 819 (char *)&testaddr.sin_addr, sizeof (struct in_addr)) < 0) { 820 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 821 " IP_DONTFAILOVER"); 822 return (_B_FALSE); 823 } 824 825 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_TTL, 826 (char *)&ttl, sizeof (ttl)) < 0) { 827 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 828 " IP_TTL"); 829 return (_B_FALSE); 830 } 831 832 char_op = 0; /* used to turn off option */ 833 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_LOOP, 834 (char *)&char_op, sizeof (char_op)) == -1) { 835 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 836 " IP_MULTICAST_LOOP"); 837 return (_B_FALSE); 838 } 839 840 if (setsockopt(pii->pii_probe_sock, IPPROTO_IP, IP_MULTICAST_TTL, 841 (char *)&char_ttl, sizeof (char_ttl)) == -1) { 842 logperror_pii(pii, "phyint_inst_v4_sockinit: setsockopt" 843 " IP_MULTICAST_TTL"); 844 return (_B_FALSE); 845 } 846 847 return (_B_TRUE); 848 } 849 850 /* 851 * Remove the phyint group from the list of 'all phyint groups' 852 * and free it. 853 */ 854 static void 855 phyint_group_delete(struct phyint_group *pg) 856 { 857 /* 858 * The anonymous group always exists, even when empty. 859 */ 860 if (pg == phyint_anongroup) 861 return; 862 863 if (debug & D_PHYINT) 864 logdebug("phyint_group_delete('%s')\n", pg->pg_name); 865 866 /* 867 * The phyint group must be empty, and must not have any phyints. 
868 * The phyint group must be in the list of all phyint groups 869 */ 870 assert(pg->pg_phyint == NULL); 871 assert(phyint_groups == pg || pg->pg_prev != NULL); 872 873 if (pg->pg_prev != NULL) 874 pg->pg_prev->pg_next = pg->pg_next; 875 else 876 phyint_groups = pg->pg_next; 877 878 if (pg->pg_next != NULL) 879 pg->pg_next->pg_prev = pg->pg_prev; 880 881 pg->pg_next = NULL; 882 pg->pg_prev = NULL; 883 884 phyint_grouplistsig++; 885 (void) phyint_group_change_event(pg, IPMP_GROUP_REMOVE); 886 887 free(pg); 888 } 889 890 /* 891 * Extract information from the kernel about the desired phyint. 892 * Look only for properties of the phyint and not properties of logints. 893 * Take appropriate action on the changes. 894 * Return codes: 895 * PI_OK 896 * The phyint exists in the kernel and matches our knowledge 897 * of the phyint. 898 * PI_DELETED 899 * The phyint has vanished in the kernel. 900 * PI_IFINDEX_CHANGED 901 * The phyint's interface index has changed. 902 * Ask the caller to delete and recreate the phyint. 903 * PI_IOCTL_ERROR 904 * Some ioctl error. Don't change anything. 905 * PI_GROUP_CHANGED 906 * The phyint has changed group. 907 */ 908 int 909 phyint_inst_update_from_k(struct phyint_instance *pii) 910 { 911 struct lifreq lifr; 912 int ifsock; 913 struct phyint *pi; 914 915 pi = pii->pii_phyint; 916 917 if (debug & D_PHYINT) { 918 logdebug("phyint_inst_update_from_k(%s %s)\n", 919 AF_STR(pii->pii_af), pi->pi_name); 920 } 921 922 /* 923 * Get the ifindex from the kernel, for comparison with the 924 * value in our tables. 925 */ 926 (void) strncpy(lifr.lifr_name, pi->pi_name, sizeof (lifr.lifr_name)); 927 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 928 929 ifsock = (pii->pii_af == AF_INET) ? 
ifsock_v4 : ifsock_v6; 930 if (ioctl(ifsock, SIOCGLIFINDEX, &lifr) < 0) { 931 if (errno == ENXIO) { 932 return (PI_DELETED); 933 } else { 934 logperror_pii(pii, "phyint_inst_update_from_k:" 935 " ioctl (get lifindex)"); 936 return (PI_IOCTL_ERROR); 937 } 938 } 939 940 if (lifr.lifr_index != pi->pi_ifindex) { 941 /* 942 * The index has changed. Most likely the interface has 943 * been unplumbed and replumbed. Ask the caller to take 944 * appropriate action. 945 */ 946 if (debug & D_PHYINT) { 947 logdebug("phyint_inst_update_from_k:" 948 " old index %d new index %d\n", 949 pi->pi_ifindex, lifr.lifr_index); 950 } 951 return (PI_IFINDEX_CHANGED); 952 } 953 954 /* 955 * Get the group name from the kernel, for comparison with 956 * the value in our tables. 957 */ 958 if (ioctl(ifsock, SIOCGLIFGROUPNAME, &lifr) < 0) { 959 if (errno == ENXIO) { 960 return (PI_DELETED); 961 } else { 962 logperror_pii(pii, "phyint_inst_update_from_k:" 963 " ioctl (get groupname)"); 964 return (PI_IOCTL_ERROR); 965 } 966 } 967 968 /* 969 * If the phyint has changed group i.e. if the phyint group name 970 * returned by the kernel is different, ask the caller to delete 971 * and recreate the phyint in the right group 972 */ 973 if (strcmp(lifr.lifr_groupname, pi->pi_group->pg_name) != 0) { 974 /* Groupname has changed */ 975 if (debug & D_PHYINT) { 976 logdebug("phyint_inst_update_from_k:" 977 " groupname change\n"); 978 } 979 return (PI_GROUP_CHANGED); 980 } 981 982 /* 983 * Get the current phyint flags from the kernel, and determine what 984 * flags have changed by comparing against our tables. Note that the 985 * IFF_INACTIVE processing in initifs() relies on this call to ensure 986 * that IFF_INACTIVE is really still set on the interface. 
987 */ 988 if (ioctl(ifsock, SIOCGLIFFLAGS, &lifr) < 0) { 989 if (errno == ENXIO) { 990 return (PI_DELETED); 991 } else { 992 logperror_pii(pii, "phyint_inst_update_from_k: " 993 " ioctl (get flags)"); 994 return (PI_IOCTL_ERROR); 995 } 996 } 997 998 pi->pi_flags = PHYINT_FLAGS(lifr.lifr_flags); 999 if (pi->pi_v4 != NULL) 1000 pi->pi_v4->pii_flags = pi->pi_flags; 1001 if (pi->pi_v6 != NULL) 1002 pi->pi_v6->pii_flags = pi->pi_flags; 1003 1004 if (pi->pi_flags & IFF_FAILED) { 1005 /* 1006 * If we are in the running and full state, we have 1007 * completed failbacks successfully and we would have 1008 * expected IFF_FAILED to have been clear. That it is 1009 * set means there was a race condition. Some other 1010 * process turned on the IFF_FAILED flag. Since the 1011 * flag setting is not atomic, i.e. a get ioctl followed 1012 * by a set ioctl, and since there is no way to set an 1013 * individual flag bit, this could have occurred. 1014 */ 1015 if (pi->pi_state == PI_RUNNING && pi->pi_full) 1016 (void) change_lif_flags(pi, IFF_FAILED, _B_FALSE); 1017 } else { 1018 /* 1019 * If we are in the failed state, there was a race. 1020 * we have completed failover successfully because our 1021 * state is failed and empty. Some other process turned 1022 * off the IFF_FAILED flag. Same comment as above 1023 */ 1024 if (pi->pi_state == PI_FAILED && pi->pi_empty) 1025 (void) change_lif_flags(pi, IFF_FAILED, _B_TRUE); 1026 } 1027 1028 /* No change in phyint status */ 1029 return (PI_OK); 1030 } 1031 1032 /* 1033 * Delete the phyint. Remove it from the list of all phyints, and the 1034 * list of phyint group members. If the group becomes empty, delete the 1035 * group also. 1036 */ 1037 static void 1038 phyint_delete(struct phyint *pi) 1039 { 1040 struct phyint_group *pg = pi->pi_group; 1041 1042 if (debug & D_PHYINT) 1043 logdebug("phyint_delete(%s)\n", pi->pi_name); 1044 1045 /* Both IPv4 and IPv6 phyint instances must have been deleted. 
*/ 1046 assert(pi->pi_v4 == NULL && pi->pi_v6 == NULL); 1047 1048 /* 1049 * The phyint must belong to a group. 1050 */ 1051 assert(pg->pg_phyint == pi || pi->pi_pgprev != NULL); 1052 1053 /* The phyint must be in the list of all phyints */ 1054 assert(phyints == pi || pi->pi_prev != NULL); 1055 1056 /* Remove the phyint from the phyint group list */ 1057 pg->pg_sig++; 1058 (void) phyint_group_member_event(pg, pi, IPMP_IF_REMOVE); 1059 1060 if (pi->pi_pgprev == NULL) { 1061 /* Phyint is the 1st in the phyint group list */ 1062 pg->pg_phyint = pi->pi_pgnext; 1063 } else { 1064 pi->pi_pgprev->pi_pgnext = pi->pi_pgnext; 1065 } 1066 if (pi->pi_pgnext != NULL) 1067 pi->pi_pgnext->pi_pgprev = pi->pi_pgprev; 1068 pi->pi_pgnext = NULL; 1069 pi->pi_pgprev = NULL; 1070 1071 /* Remove the phyint from the global list of phyints */ 1072 if (pi->pi_prev == NULL) { 1073 /* Phyint is the 1st in the list */ 1074 phyints = pi->pi_next; 1075 } else { 1076 pi->pi_prev->pi_next = pi->pi_next; 1077 } 1078 if (pi->pi_next != NULL) 1079 pi->pi_next->pi_prev = pi->pi_prev; 1080 pi->pi_next = NULL; 1081 pi->pi_prev = NULL; 1082 1083 free(pi); 1084 1085 /* Delete the phyint_group if the last phyint has been deleted */ 1086 if (pg->pg_phyint == NULL) 1087 phyint_group_delete(pg); 1088 } 1089 1090 /* 1091 * Delete (unlink and free), the phyint instance. 1092 */ 1093 void 1094 phyint_inst_delete(struct phyint_instance *pii) 1095 { 1096 struct phyint *pi = pii->pii_phyint; 1097 1098 assert(pi != NULL); 1099 1100 if (debug & D_PHYINT) { 1101 logdebug("phyint_inst_delete(%s %s)\n", 1102 AF_STR(pii->pii_af), pi->pi_name); 1103 } 1104 1105 /* 1106 * If the phyint instance has associated probe targets 1107 * delete all the targets 1108 */ 1109 while (pii->pii_targets != NULL) 1110 target_delete(pii->pii_targets); 1111 1112 /* 1113 * Delete all the logints associated with this phyint 1114 * instance. 
1115 */ 1116 while (pii->pii_logint != NULL) 1117 logint_delete(pii->pii_logint); 1118 1119 /* 1120 * Close the IFF_NOFAILOVER socket used to send probes to targets 1121 * from this phyint. 1122 */ 1123 if (pii->pii_probe_sock != -1) 1124 close_probe_socket(pii, _B_TRUE); 1125 1126 /* 1127 * Phyint instance must be in the list of all phyint instances. 1128 * Remove phyint instance from the global list of phyint instances. 1129 */ 1130 assert(phyint_instances == pii || pii->pii_prev != NULL); 1131 if (pii->pii_prev == NULL) { 1132 /* Phyint is the 1st in the list */ 1133 phyint_instances = pii->pii_next; 1134 } else { 1135 pii->pii_prev->pii_next = pii->pii_next; 1136 } 1137 if (pii->pii_next != NULL) 1138 pii->pii_next->pii_prev = pii->pii_prev; 1139 pii->pii_next = NULL; 1140 pii->pii_prev = NULL; 1141 1142 /* 1143 * Reset the phyint instance pointer in the phyint. 1144 * If this is the last phyint instance (being deleted) on this 1145 * phyint, then delete the phyint. 1146 */ 1147 if (pii->pii_af == AF_INET) 1148 pi->pi_v4 = NULL; 1149 else 1150 pi->pi_v6 = NULL; 1151 1152 if (pi->pi_v4 == NULL && pi->pi_v6 == NULL) 1153 phyint_delete(pi); 1154 1155 free(pii); 1156 } 1157 1158 static void 1159 phyint_inst_print(struct phyint_instance *pii) 1160 { 1161 struct logint *li; 1162 struct target *tg; 1163 char abuf[INET6_ADDRSTRLEN]; 1164 int most_recent; 1165 int i; 1166 1167 if (pii->pii_phyint == NULL) { 1168 logdebug("pii->pi_phyint NULL can't print\n"); 1169 return; 1170 } 1171 1172 logdebug("\nPhyint instance: %s %s index %u state %x flags %llx " 1173 "sock %x in_use %d empty %x full %x\n", 1174 AF_STR(pii->pii_af), pii->pii_name, pii->pii_ifindex, 1175 pii->pii_state, pii->pii_phyint->pi_flags, pii->pii_probe_sock, 1176 pii->pii_in_use, pii->pii_phyint->pi_empty, 1177 pii->pii_phyint->pi_full); 1178 1179 for (li = pii->pii_logint; li != NULL; li = li->li_next) 1180 logint_print(li); 1181 1182 logdebug("\n"); 1183 for (tg = pii->pii_targets; tg != NULL; tg = 
tg->tg_next) 1184 target_print(tg); 1185 1186 if (pii->pii_targets == NULL) 1187 logdebug("pi_targets NULL\n"); 1188 1189 if (pii->pii_target_next != NULL) { 1190 logdebug("pi_target_next %s %s\n", AF_STR(pii->pii_af), 1191 pr_addr(pii->pii_af, pii->pii_target_next->tg_address, 1192 abuf, sizeof (abuf))); 1193 } else { 1194 logdebug("pi_target_next NULL\n"); 1195 } 1196 1197 if (pii->pii_rtt_target_next != NULL) { 1198 logdebug("pi_rtt_target_next %s %s\n", AF_STR(pii->pii_af), 1199 pr_addr(pii->pii_af, pii->pii_rtt_target_next->tg_address, 1200 abuf, sizeof (abuf))); 1201 } else { 1202 logdebug("pi_rtt_target_next NULL\n"); 1203 } 1204 1205 if (pii->pii_targets != NULL) { 1206 most_recent = PROBE_INDEX_PREV(pii->pii_probe_next); 1207 1208 i = most_recent; 1209 do { 1210 if (pii->pii_probes[i].pr_target != NULL) { 1211 logdebug("#%d target %s ", i, 1212 pr_addr(pii->pii_af, 1213 pii->pii_probes[i].pr_target->tg_address, 1214 abuf, sizeof (abuf))); 1215 } else { 1216 logdebug("#%d target NULL ", i); 1217 } 1218 logdebug("time_sent %u status %d time_ack/lost %u\n", 1219 pii->pii_probes[i].pr_time_sent, 1220 pii->pii_probes[i].pr_status, 1221 pii->pii_probes[i].pr_time_lost); 1222 i = PROBE_INDEX_PREV(i); 1223 } while (i != most_recent); 1224 } 1225 } 1226 1227 /* 1228 * Lookup a logint based on the logical interface name, on the given 1229 * phyint instance. 
1230 */ 1231 static struct logint * 1232 logint_lookup(struct phyint_instance *pii, char *name) 1233 { 1234 struct logint *li; 1235 1236 if (debug & D_LOGINT) { 1237 logdebug("logint_lookup(%s, %s)\n", 1238 AF_STR(pii->pii_af), name); 1239 } 1240 1241 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 1242 if (strncmp(name, li->li_name, sizeof (li->li_name)) == 0) 1243 break; 1244 } 1245 return (li); 1246 } 1247 1248 /* 1249 * Insert a logint at the head of the list of logints of the given 1250 * phyint instance 1251 */ 1252 static void 1253 logint_insert(struct phyint_instance *pii, struct logint *li) 1254 { 1255 li->li_next = pii->pii_logint; 1256 li->li_prev = NULL; 1257 if (pii->pii_logint != NULL) 1258 pii->pii_logint->li_prev = li; 1259 pii->pii_logint = li; 1260 li->li_phyint_inst = pii; 1261 } 1262 1263 /* 1264 * Create a new named logint, on the specified phyint instance. 1265 */ 1266 static struct logint * 1267 logint_create(struct phyint_instance *pii, char *name) 1268 { 1269 struct logint *li; 1270 1271 if (debug & D_LOGINT) { 1272 logdebug("logint_create(%s %s %s)\n", 1273 AF_STR(pii->pii_af), pii->pii_name, name); 1274 } 1275 1276 li = calloc(1, sizeof (struct logint)); 1277 if (li == NULL) { 1278 logperror("logint_create: calloc"); 1279 return (NULL); 1280 } 1281 1282 (void) strncpy(li->li_name, name, sizeof (li->li_name)); 1283 li->li_name[sizeof (li->li_name) - 1] = '\0'; 1284 logint_insert(pii, li); 1285 return (li); 1286 } 1287 1288 /* 1289 * Initialize the logint based on the data returned by the kernel. 
1290 */ 1291 void 1292 logint_init_from_k(struct phyint_instance *pii, char *li_name) 1293 { 1294 int ifsock; 1295 uint64_t flags; 1296 uint64_t saved_flags; 1297 struct logint *li; 1298 struct lifreq lifr; 1299 struct in6_addr test_subnet; 1300 struct in6_addr test_subnet_mask; 1301 struct in6_addr testaddr; 1302 int test_subnet_len; 1303 struct sockaddr_in6 *sin6; 1304 struct sockaddr_in *sin; 1305 char abuf[INET6_ADDRSTRLEN]; 1306 boolean_t ptp = _B_FALSE; 1307 struct in6_addr tgaddr; 1308 1309 if (debug & D_LOGINT) { 1310 logdebug("logint_init_from_k(%s %s)\n", 1311 AF_STR(pii->pii_af), li_name); 1312 } 1313 1314 /* Get the socket for doing ioctls */ 1315 ifsock = (pii->pii_af == AF_INET) ? ifsock_v4 : ifsock_v6; 1316 1317 /* 1318 * Get the flags from the kernel. Also serves as a check whether 1319 * the logical still exists. If it doesn't exist, no need to proceed 1320 * any further. li_in_use will make the caller clean up the logint 1321 */ 1322 (void) strncpy(lifr.lifr_name, li_name, sizeof (lifr.lifr_name)); 1323 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 1324 if (ioctl(ifsock, SIOCGLIFFLAGS, (char *)&lifr) < 0) { 1325 /* Interface may have vanished */ 1326 if (errno != ENXIO) { 1327 logperror_pii(pii, "logint_init_from_k: " 1328 "ioctl (get flags)"); 1329 } 1330 return; 1331 } 1332 1333 flags = lifr.lifr_flags; 1334 1335 /* 1336 * Verified the logint exists. Now lookup the logint in our tables. 1337 * If it does not exist, create a new logint. 1338 */ 1339 li = logint_lookup(pii, li_name); 1340 if (li == NULL) { 1341 li = logint_create(pii, li_name); 1342 if (li == NULL) { 1343 /* 1344 * Pretend the interface does not exist 1345 * in the kernel 1346 */ 1347 return; 1348 } 1349 } 1350 1351 /* 1352 * Update li->li_flags with the new flags, after saving the old 1353 * value. 
This is used later to check what flags has changed and 1354 * take any action 1355 */ 1356 saved_flags = li->li_flags; 1357 li->li_flags = flags; 1358 1359 /* 1360 * Get the address, prefix, prefixlength and update the logint. 1361 * Check if anything has changed. If the logint used for the 1362 * test address has changed, take suitable action. 1363 */ 1364 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 1365 /* Interface may have vanished */ 1366 if (errno != ENXIO) { 1367 logperror_li(li, "logint_init_from_k: (get addr)"); 1368 } 1369 goto error; 1370 } 1371 1372 if (pii->pii_af == AF_INET) { 1373 sin = (struct sockaddr_in *)&lifr.lifr_addr; 1374 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &testaddr); 1375 } else { 1376 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 1377 testaddr = sin6->sin6_addr; 1378 } 1379 1380 if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { 1381 ptp = _B_TRUE; 1382 if (ioctl(ifsock, SIOCGLIFDSTADDR, (char *)&lifr) < 0) { 1383 if (errno != ENXIO) { 1384 logperror_li(li, "logint_init_from_k:" 1385 " (get dstaddr)"); 1386 } 1387 goto error; 1388 } 1389 if (pii->pii_af == AF_INET) { 1390 sin = (struct sockaddr_in *)&lifr.lifr_addr; 1391 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &tgaddr); 1392 } else { 1393 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 1394 tgaddr = sin6->sin6_addr; 1395 } 1396 } else { 1397 if (ioctl(ifsock, SIOCGLIFSUBNET, (char *)&lifr) < 0) { 1398 /* Interface may have vanished */ 1399 if (errno != ENXIO) { 1400 logperror_li(li, "logint_init_from_k:" 1401 " (get subnet)"); 1402 } 1403 goto error; 1404 } 1405 if (lifr.lifr_subnet.ss_family == AF_INET6) { 1406 sin6 = (struct sockaddr_in6 *)&lifr.lifr_subnet; 1407 test_subnet = sin6->sin6_addr; 1408 test_subnet_len = lifr.lifr_addrlen; 1409 } else { 1410 sin = (struct sockaddr_in *)&lifr.lifr_subnet; 1411 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &test_subnet); 1412 test_subnet_len = lifr.lifr_addrlen + 1413 (IPV6_ABITS - IP_ABITS); 1414 } 1415 (void) 
ip_index_to_mask_v6(test_subnet_len, &test_subnet_mask); 1416 } 1417 1418 /* 1419 * Also record the OINDEX for completeness. This information is 1420 * not used. 1421 */ 1422 if (ioctl(ifsock, SIOCGLIFOINDEX, (char *)&lifr) < 0) { 1423 if (errno != ENXIO) { 1424 logperror_li(li, "logint_init_from_k:" 1425 " (get lifoindex)"); 1426 } 1427 goto error; 1428 } 1429 1430 /* 1431 * If this is the logint corresponding to the test address used for 1432 * sending probes, then if anything significant has changed we need to 1433 * determine the test address again. We ignore changes to the 1434 * IFF_FAILED and IFF_RUNNING flags since those happen as a matter of 1435 * course. 1436 */ 1437 if (pii->pii_probe_logint == li) { 1438 if (((li->li_flags ^ saved_flags) & 1439 ~(IFF_FAILED | IFF_RUNNING)) != 0 || 1440 !IN6_ARE_ADDR_EQUAL(&testaddr, &li->li_addr) || 1441 (!ptp && !IN6_ARE_ADDR_EQUAL(&test_subnet, 1442 &li->li_subnet)) || 1443 (!ptp && test_subnet_len != li->li_subnet_len) || 1444 (ptp && !IN6_ARE_ADDR_EQUAL(&tgaddr, &li->li_dstaddr))) { 1445 /* 1446 * Something significant that affects the testaddress 1447 * has changed. Redo the testaddress selection later on 1448 * in select_test_ifs(). For now do the cleanup and 1449 * set pii_probe_logint to NULL. 1450 */ 1451 if (pii->pii_probe_sock != -1) 1452 close_probe_socket(pii, _B_TRUE); 1453 pii->pii_probe_logint = NULL; 1454 } 1455 } 1456 1457 1458 /* Update the logint with the values obtained from the kernel. */ 1459 li->li_addr = testaddr; 1460 li->li_in_use = 1; 1461 li->li_oifindex = lifr.lifr_index; 1462 if (ptp) { 1463 li->li_dstaddr = tgaddr; 1464 li->li_subnet_len = (pii->pii_af == AF_INET) ? 
1465 IP_ABITS : IPV6_ABITS; 1466 } else { 1467 li->li_subnet = test_subnet; 1468 li->li_subnet_len = test_subnet_len; 1469 } 1470 1471 if (debug & D_LOGINT) 1472 logint_print(li); 1473 1474 return; 1475 1476 error: 1477 logerr("logint_init_from_k: IGNORED %s %s %s addr %s\n", 1478 AF_STR(pii->pii_af), pii->pii_name, li->li_name, 1479 pr_addr(pii->pii_af, testaddr, abuf, sizeof (abuf))); 1480 logint_delete(li); 1481 } 1482 1483 /* 1484 * Delete (unlink and free) a logint. 1485 */ 1486 void 1487 logint_delete(struct logint *li) 1488 { 1489 struct phyint_instance *pii; 1490 1491 pii = li->li_phyint_inst; 1492 assert(pii != NULL); 1493 1494 if (debug & D_LOGINT) { 1495 int af; 1496 char abuf[INET6_ADDRSTRLEN]; 1497 1498 af = pii->pii_af; 1499 logdebug("logint_delete(%s %s %s/%u)\n", 1500 AF_STR(af), li->li_name, 1501 pr_addr(af, li->li_addr, abuf, sizeof (abuf)), 1502 li->li_subnet_len); 1503 } 1504 1505 /* logint must be in the list of logints */ 1506 assert(pii->pii_logint == li || li->li_prev != NULL); 1507 1508 /* Remove the logint from the list of logints */ 1509 if (li->li_prev == NULL) { 1510 /* logint is the 1st in the list */ 1511 pii->pii_logint = li->li_next; 1512 } else { 1513 li->li_prev->li_next = li->li_next; 1514 } 1515 if (li->li_next != NULL) 1516 li->li_next->li_prev = li->li_prev; 1517 li->li_next = NULL; 1518 li->li_prev = NULL; 1519 1520 /* 1521 * If this logint corresponds to the IFF_NOFAILOVER testaddress of 1522 * this phyint, then close the associated socket, if it exists 1523 */ 1524 if (pii->pii_probe_logint == li) { 1525 if (pii->pii_probe_sock != -1) 1526 close_probe_socket(pii, _B_TRUE); 1527 pii->pii_probe_logint = NULL; 1528 } 1529 1530 free(li); 1531 } 1532 1533 static void 1534 logint_print(struct logint *li) 1535 { 1536 char abuf[INET6_ADDRSTRLEN]; 1537 int af; 1538 1539 af = li->li_phyint_inst->pii_af; 1540 1541 logdebug("logint: %s %s addr %s/%u", AF_STR(af), li->li_name, 1542 pr_addr(af, li->li_addr, abuf, sizeof (abuf)), 
li->li_subnet_len); 1543 1544 logdebug("\tFlags: %llx in_use %d oifindex %d\n", 1545 li->li_flags, li->li_in_use, li->li_oifindex); 1546 } 1547 1548 char * 1549 pr_addr(int af, struct in6_addr addr, char *abuf, int len) 1550 { 1551 struct in_addr addr_v4; 1552 1553 if (af == AF_INET) { 1554 IN6_V4MAPPED_TO_INADDR(&addr, &addr_v4); 1555 (void) inet_ntop(AF_INET, (void *)&addr_v4, abuf, len); 1556 } else { 1557 (void) inet_ntop(AF_INET6, (void *)&addr, abuf, len); 1558 } 1559 return (abuf); 1560 } 1561 1562 /* Lookup target on its address */ 1563 struct target * 1564 target_lookup(struct phyint_instance *pii, struct in6_addr addr) 1565 { 1566 struct target *tg; 1567 1568 if (debug & D_TARGET) { 1569 char abuf[INET6_ADDRSTRLEN]; 1570 1571 logdebug("target_lookup(%s %s): addr %s\n", 1572 AF_STR(pii->pii_af), pii->pii_name, 1573 pr_addr(pii->pii_af, addr, abuf, sizeof (abuf))); 1574 } 1575 1576 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1577 if (IN6_ARE_ADDR_EQUAL(&tg->tg_address, &addr)) 1578 break; 1579 } 1580 return (tg); 1581 } 1582 1583 /* 1584 * Find and return the next active target, for the next probe. 1585 * If no active targets are available, return NULL. 1586 */ 1587 struct target * 1588 target_next(struct target *tg) 1589 { 1590 struct phyint_instance *pii = tg->tg_phyint_inst; 1591 struct target *marker = tg; 1592 hrtime_t now; 1593 1594 now = gethrtime(); 1595 1596 /* 1597 * Target must be in the list of targets for this phyint 1598 * instance. 1599 */ 1600 assert(pii->pii_targets == tg || tg->tg_prev != NULL); 1601 assert(pii->pii_targets != NULL); 1602 1603 /* Return the next active target */ 1604 do { 1605 /* 1606 * Go to the next target. 
If we hit the end, 1607 * reset the ptr to the head 1608 */ 1609 tg = tg->tg_next; 1610 if (tg == NULL) 1611 tg = pii->pii_targets; 1612 1613 assert(TG_STATUS_VALID(tg->tg_status)); 1614 1615 switch (tg->tg_status) { 1616 case TG_ACTIVE: 1617 return (tg); 1618 1619 case TG_UNUSED: 1620 assert(pii->pii_targets_are_routers); 1621 if (pii->pii_ntargets < MAX_PROBE_TARGETS) { 1622 /* 1623 * Bubble up the unused target to active 1624 */ 1625 tg->tg_status = TG_ACTIVE; 1626 pii->pii_ntargets++; 1627 return (tg); 1628 } 1629 break; 1630 1631 case TG_SLOW: 1632 assert(pii->pii_targets_are_routers); 1633 if (tg->tg_latime + MIN_RECOVERY_TIME < now) { 1634 /* 1635 * Bubble up the slow target to unused 1636 */ 1637 tg->tg_status = TG_UNUSED; 1638 } 1639 break; 1640 1641 case TG_DEAD: 1642 assert(pii->pii_targets_are_routers); 1643 if (tg->tg_latime + MIN_RECOVERY_TIME < now) { 1644 /* 1645 * Bubble up the dead target to slow 1646 */ 1647 tg->tg_status = TG_SLOW; 1648 tg->tg_latime = now; 1649 } 1650 break; 1651 } 1652 1653 } while (tg != marker); 1654 1655 return (NULL); 1656 } 1657 1658 /* 1659 * Select the best available target, that is not already TG_ACTIVE, 1660 * for the caller. The caller will determine whether it wants to 1661 * make the returned target TG_ACTIVE. 1662 * The selection order is as follows. 1663 * 1. pick a TG_UNSED target, if it exists. 1664 * 2. else pick a TG_SLOW target that has recovered, if it exists 1665 * 3. else pick any TG_SLOW target, if it exists 1666 * 4. else pick a TG_DEAD target that has recovered, if it exists 1667 * 5. else pick any TG_DEAD target, if it exists 1668 * 6. 
else return null
 */
static struct target *
target_select_best(struct phyint_instance *pii)
{
	struct target *cand;
	struct target *any_slow = NULL;
	struct target *any_dead = NULL;
	struct target *recovered_slow = NULL;
	struct target *recovered_dead = NULL;
	hrtime_t now = gethrtime();

	for (cand = pii->pii_targets; cand != NULL; cand = cand->tg_next) {
		assert(TG_STATUS_VALID(cand->tg_status));

		/* An unused target beats everything else: take it now. */
		if (cand->tg_status == TG_UNUSED)
			return (cand);

		if (cand->tg_status == TG_SLOW) {
			if (cand->tg_latime + MIN_RECOVERY_TIME < now) {
				/* Recovered: remember it, promote to unused */
				recovered_slow = cand;
				cand->tg_status = TG_UNUSED;
			} else {
				any_slow = cand;
			}
		} else if (cand->tg_status == TG_DEAD) {
			if (cand->tg_latime + MIN_RECOVERY_TIME < now) {
				/* Recovered: remember it, promote to slow */
				recovered_dead = cand;
				cand->tg_status = TG_SLOW;
				cand->tg_latime = now;
			} else {
				any_dead = cand;
			}
		}
		/* Any other status (e.g. already active) is skipped. */
	}

	/* No unused target: fall back in decreasing order of preference. */
	if (recovered_slow != NULL)
		return (recovered_slow);
	if (any_slow != NULL)
		return (any_slow);
	if (recovered_dead != NULL)
		return (recovered_dead);
	return (any_dead);
}

/*
 * Some target was deleted. If we don't have even MIN_PROBE_TARGETS
 * that are active, pick the next best below.
1732 */ 1733 static void 1734 target_activate_all(struct phyint_instance *pii) 1735 { 1736 struct target *tg; 1737 1738 assert(pii->pii_ntargets == 0); 1739 assert(pii->pii_target_next == NULL); 1740 assert(pii->pii_rtt_target_next == NULL); 1741 assert(pii->pii_targets_are_routers); 1742 1743 while (pii->pii_ntargets < MIN_PROBE_TARGETS) { 1744 tg = target_select_best(pii); 1745 if (tg == NULL) { 1746 /* We are out of targets */ 1747 return; 1748 } 1749 1750 assert(TG_STATUS_VALID(tg->tg_status)); 1751 assert(tg->tg_status != TG_ACTIVE); 1752 tg->tg_status = TG_ACTIVE; 1753 pii->pii_ntargets++; 1754 if (pii->pii_target_next == NULL) { 1755 pii->pii_target_next = tg; 1756 pii->pii_rtt_target_next = tg; 1757 } 1758 } 1759 } 1760 1761 static struct target * 1762 target_first(struct phyint_instance *pii) 1763 { 1764 struct target *tg; 1765 1766 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1767 assert(TG_STATUS_VALID(tg->tg_status)); 1768 if (tg->tg_status == TG_ACTIVE) 1769 break; 1770 } 1771 1772 return (tg); 1773 } 1774 1775 /* 1776 * Create a default target entry. 1777 */ 1778 void 1779 target_create(struct phyint_instance *pii, struct in6_addr addr, 1780 boolean_t is_router) 1781 { 1782 struct target *tg; 1783 struct phyint *pi; 1784 struct logint *li; 1785 1786 if (debug & D_TARGET) { 1787 char abuf[INET6_ADDRSTRLEN]; 1788 1789 logdebug("target_create(%s %s, %s)\n", 1790 AF_STR(pii->pii_af), pii->pii_name, 1791 pr_addr(pii->pii_af, addr, abuf, sizeof (abuf))); 1792 } 1793 1794 /* 1795 * If the test address is not yet initialized, do not add 1796 * any target, since we cannot determine whether the target 1797 * belongs to the same subnet as the test address. 1798 */ 1799 li = pii->pii_probe_logint; 1800 if (li == NULL) 1801 return; 1802 1803 /* 1804 * If there are multiple subnets associated with an interface, then 1805 * add the target to this phyint instance, only if it belongs to the 1806 * same subnet as the test address. 
The reason is that interface 1807 * routes derived from non-test-addresses i.e. non-IFF_NOFAILOVER 1808 * addresses, will disappear after failover, and the targets will not 1809 * be reachable from this interface. 1810 */ 1811 if (!prefix_equal(li->li_subnet, addr, li->li_subnet_len)) 1812 return; 1813 1814 if (pii->pii_targets != NULL) { 1815 assert(pii->pii_ntargets <= MAX_PROBE_TARGETS); 1816 if (is_router) { 1817 if (!pii->pii_targets_are_routers) { 1818 /* 1819 * Prefer router over hosts. Using hosts is a 1820 * fallback mechanism, hence delete all host 1821 * targets. 1822 */ 1823 while (pii->pii_targets != NULL) 1824 target_delete(pii->pii_targets); 1825 } 1826 } else { 1827 /* 1828 * Routers take precedence over hosts. If this 1829 * is a router list and we are trying to add a 1830 * host, just return. If this is a host list 1831 * and if we have sufficient targets, just return 1832 */ 1833 if (pii->pii_targets_are_routers || 1834 pii->pii_ntargets == MAX_PROBE_TARGETS) 1835 return; 1836 } 1837 } 1838 1839 tg = calloc(1, sizeof (struct target)); 1840 if (tg == NULL) { 1841 logperror("target_create: calloc"); 1842 return; 1843 } 1844 1845 tg->tg_phyint_inst = pii; 1846 tg->tg_address = addr; 1847 tg->tg_in_use = 1; 1848 tg->tg_rtt_sa = -1; 1849 tg->tg_num_deferred = 0; 1850 1851 /* 1852 * If this is the first target, set 'pii_targets_are_routers' 1853 * The list of targets is either a list of hosts or list or 1854 * routers, but not a mix. 1855 */ 1856 if (pii->pii_targets == NULL) { 1857 assert(pii->pii_ntargets == 0); 1858 assert(pii->pii_target_next == NULL); 1859 assert(pii->pii_rtt_target_next == NULL); 1860 pii->pii_targets_are_routers = is_router ? 
1 : 0; 1861 } 1862 1863 if (pii->pii_ntargets == MAX_PROBE_TARGETS) { 1864 assert(pii->pii_targets_are_routers); 1865 assert(pii->pii_target_next != NULL); 1866 assert(pii->pii_rtt_target_next != NULL); 1867 tg->tg_status = TG_UNUSED; 1868 } else { 1869 if (pii->pii_ntargets == 0) { 1870 assert(pii->pii_target_next == NULL); 1871 pii->pii_target_next = tg; 1872 pii->pii_rtt_target_next = tg; 1873 } 1874 pii->pii_ntargets++; 1875 tg->tg_status = TG_ACTIVE; 1876 } 1877 1878 target_insert(pii, tg); 1879 1880 /* 1881 * Change to running state, if this phyint instance is capable of 1882 * sending and receiving probes. i.e if we know of at least 1 target, 1883 * and this phyint instance socket is bound to the IFF_NOFAILOVER 1884 * address. More details in phyint state diagram in probe.c. 1885 */ 1886 pi = pii->pii_phyint; 1887 if (pi->pi_state == PI_NOTARGETS && PROBE_CAPABLE(pii)) { 1888 if (pi->pi_flags & IFF_FAILED) 1889 phyint_chstate(pi, PI_FAILED); 1890 else 1891 phyint_chstate(pi, PI_RUNNING); 1892 } 1893 } 1894 1895 /* 1896 * Add the target address named by `addr' to phyint instance `pii' if it does 1897 * not already exist. If the target is a router, `is_router' should be set to 1898 * B_TRUE. 1899 */ 1900 void 1901 target_add(struct phyint_instance *pii, struct in6_addr addr, 1902 boolean_t is_router) 1903 { 1904 struct target *tg; 1905 1906 if (pii == NULL) 1907 return; 1908 1909 tg = target_lookup(pii, addr); 1910 1911 /* 1912 * If the target does not exist, create it; target_create() will set 1913 * tg_in_use to true. 
If it exists already, and it is a router 1914 * target, set tg_in_use to to true, so that init_router_targets() 1915 * won't delete it 1916 */ 1917 if (tg == NULL) 1918 target_create(pii, addr, is_router); 1919 else if (is_router) 1920 tg->tg_in_use = 1; 1921 } 1922 1923 /* 1924 * Insert target at head of linked list of targets for the associated 1925 * phyint instance 1926 */ 1927 static void 1928 target_insert(struct phyint_instance *pii, struct target *tg) 1929 { 1930 tg->tg_next = pii->pii_targets; 1931 tg->tg_prev = NULL; 1932 if (tg->tg_next != NULL) 1933 tg->tg_next->tg_prev = tg; 1934 pii->pii_targets = tg; 1935 } 1936 1937 /* 1938 * Delete a target (unlink and free). 1939 */ 1940 void 1941 target_delete(struct target *tg) 1942 { 1943 int af; 1944 struct phyint_instance *pii; 1945 struct phyint_instance *pii_other; 1946 1947 pii = tg->tg_phyint_inst; 1948 af = pii->pii_af; 1949 1950 if (debug & D_TARGET) { 1951 char abuf[INET6_ADDRSTRLEN]; 1952 1953 logdebug("target_delete(%s %s, %s)\n", 1954 AF_STR(af), pii->pii_name, 1955 pr_addr(af, tg->tg_address, abuf, sizeof (abuf))); 1956 } 1957 1958 /* 1959 * Target must be in the list of targets for this phyint 1960 * instance. 1961 */ 1962 assert(pii->pii_targets == tg || tg->tg_prev != NULL); 1963 1964 /* 1965 * Reset all references to 'tg' in the probe information 1966 * for this phyint. 1967 */ 1968 reset_pii_probes(pii, tg); 1969 1970 /* 1971 * Remove this target from the list of targets of this 1972 * phyint instance. 1973 */ 1974 if (tg->tg_prev == NULL) { 1975 pii->pii_targets = tg->tg_next; 1976 } else { 1977 tg->tg_prev->tg_next = tg->tg_next; 1978 } 1979 1980 if (tg->tg_next != NULL) 1981 tg->tg_next->tg_prev = tg->tg_prev; 1982 1983 tg->tg_next = NULL; 1984 tg->tg_prev = NULL; 1985 1986 if (tg->tg_status == TG_ACTIVE) 1987 pii->pii_ntargets--; 1988 1989 /* 1990 * Adjust the next target to probe, if it points to 1991 * to the currently deleted target. 
1992 */ 1993 if (pii->pii_target_next == tg) 1994 pii->pii_target_next = target_first(pii); 1995 1996 if (pii->pii_rtt_target_next == tg) 1997 pii->pii_rtt_target_next = target_first(pii); 1998 1999 free(tg); 2000 2001 /* 2002 * The number of active targets pii_ntargets == 0 iff 2003 * the next active target pii->pii_target_next == NULL 2004 */ 2005 if (pii->pii_ntargets != 0) { 2006 assert(pii->pii_target_next != NULL); 2007 assert(pii->pii_rtt_target_next != NULL); 2008 assert(pii->pii_target_next->tg_status == TG_ACTIVE); 2009 assert(pii->pii_rtt_target_next->tg_status == TG_ACTIVE); 2010 return; 2011 } 2012 2013 /* At this point, we don't have any active targets. */ 2014 assert(pii->pii_target_next == NULL); 2015 assert(pii->pii_rtt_target_next == NULL); 2016 2017 if (pii->pii_targets_are_routers) { 2018 /* 2019 * Activate any TG_SLOW or TG_DEAD router targets, 2020 * since we don't have any other targets 2021 */ 2022 target_activate_all(pii); 2023 2024 if (pii->pii_ntargets != 0) { 2025 assert(pii->pii_target_next != NULL); 2026 assert(pii->pii_rtt_target_next != NULL); 2027 assert(pii->pii_target_next->tg_status == TG_ACTIVE); 2028 assert(pii->pii_rtt_target_next->tg_status == 2029 TG_ACTIVE); 2030 return; 2031 } 2032 } 2033 2034 /* 2035 * If we still don't have any active targets, the list must 2036 * must be really empty. There aren't even TG_SLOW or TG_DEAD 2037 * targets. Zero out the probe stats since it will not be 2038 * relevant any longer. 2039 */ 2040 assert(pii->pii_targets == NULL); 2041 clear_pii_probe_stats(pii); 2042 pii_other = phyint_inst_other(pii); 2043 2044 /* 2045 * If there are no targets on both instances, 2046 * go back to PI_NOTARGETS state, since we cannot 2047 * probe this phyint any more. For more details, 2048 * please see phyint state diagram in mpd_probe.c. 
2049 */ 2050 if (!PROBE_CAPABLE(pii_other)) 2051 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 2052 } 2053 2054 /* 2055 * Flush the target list of every phyint in the group, if the list 2056 * is a host target list. This is called if group failure is suspected. 2057 * If all targets have failed, multicast will subsequently discover new 2058 * targets. Else it is a group failure. 2059 * Note: This function is a no-op if the list is a router target list. 2060 */ 2061 static void 2062 target_flush_hosts(struct phyint_group *pg) 2063 { 2064 struct phyint *pi; 2065 struct phyint_instance *pii; 2066 2067 if (debug & D_TARGET) 2068 logdebug("target_flush_hosts(%s)\n", pg->pg_name); 2069 2070 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 2071 pii = pi->pi_v4; 2072 if (pii != NULL && !pii->pii_targets_are_routers) { 2073 /* 2074 * Delete all the targets. When the list becomes 2075 * empty, target_delete() will set pii->pii_targets 2076 * to NULL. 2077 */ 2078 while (pii->pii_targets != NULL) 2079 target_delete(pii->pii_targets); 2080 } 2081 pii = pi->pi_v6; 2082 if (pii != NULL && !pii->pii_targets_are_routers) { 2083 /* 2084 * Delete all the targets. When the list becomes 2085 * empty, target_delete() will set pii->pii_targets 2086 * to NULL. 2087 */ 2088 while (pii->pii_targets != NULL) 2089 target_delete(pii->pii_targets); 2090 } 2091 } 2092 } 2093 2094 /* 2095 * Reset all references to 'target' in the probe info, as this target is 2096 * being deleted. The pr_target field is guaranteed to be non-null if 2097 * pr_status is PR_UNACKED. So we change the pr_status to PR_LOST, so that 2098 * pr_target will not be accessed unconditionally. 
2099 */ 2100 static void 2101 reset_pii_probes(struct phyint_instance *pii, struct target *tg) 2102 { 2103 int i; 2104 2105 for (i = 0; i < PROBE_STATS_COUNT; i++) { 2106 if (pii->pii_probes[i].pr_target == tg) { 2107 pii->pii_probes[i].pr_target = NULL; 2108 if (pii->pii_probes[i].pr_status == PR_UNACKED) 2109 pii->pii_probes[i].pr_status = PR_LOST; 2110 } 2111 } 2112 2113 } 2114 2115 /* 2116 * Clear the probe statistics array. 2117 */ 2118 void 2119 clear_pii_probe_stats(struct phyint_instance *pii) 2120 { 2121 bzero(pii->pii_probes, sizeof (struct probe_stats) * PROBE_STATS_COUNT); 2122 /* Reset the next probe index in the probe stats array */ 2123 pii->pii_probe_next = 0; 2124 } 2125 2126 static void 2127 target_print(struct target *tg) 2128 { 2129 char abuf[INET6_ADDRSTRLEN]; 2130 char buf[128]; 2131 char buf2[128]; 2132 int af; 2133 int i; 2134 2135 af = tg->tg_phyint_inst->pii_af; 2136 2137 logdebug("Target on %s %s addr %s\n" 2138 "status %d rtt_sa %d rtt_sd %d crtt %d tg_in_use %d\n", 2139 AF_STR(af), tg->tg_phyint_inst->pii_name, 2140 pr_addr(af, tg->tg_address, abuf, sizeof (abuf)), 2141 tg->tg_status, tg->tg_rtt_sa, tg->tg_rtt_sd, 2142 tg->tg_crtt, tg->tg_in_use); 2143 2144 buf[0] = '\0'; 2145 for (i = 0; i < tg->tg_num_deferred; i++) { 2146 (void) snprintf(buf2, sizeof (buf2), " %dms", 2147 tg->tg_deferred[i]); 2148 (void) strlcat(buf, buf2, sizeof (buf)); 2149 } 2150 logdebug("deferred rtts:%s\n", buf); 2151 } 2152 2153 void 2154 phyint_inst_print_all(void) 2155 { 2156 struct phyint_instance *pii; 2157 2158 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2159 phyint_inst_print(pii); 2160 } 2161 } 2162 2163 /* 2164 * Convert length for a mask to the mask. 
2165 */ 2166 static void 2167 ip_index_to_mask_v6(uint_t masklen, struct in6_addr *bitmask) 2168 { 2169 int j; 2170 2171 assert(masklen <= IPV6_ABITS); 2172 bzero((char *)bitmask, sizeof (*bitmask)); 2173 2174 /* Make the 'masklen' leftmost bits one */ 2175 for (j = 0; masklen > 8; masklen -= 8, j++) 2176 bitmask->s6_addr[j] = 0xff; 2177 2178 bitmask->s6_addr[j] = 0xff << (8 - masklen); 2179 2180 } 2181 2182 /* 2183 * Compare two prefixes that have the same prefix length. 2184 * Fails if the prefix length is unreasonable. 2185 */ 2186 static boolean_t 2187 prefix_equal(struct in6_addr p1, struct in6_addr p2, int prefix_len) 2188 { 2189 uchar_t mask; 2190 int j; 2191 2192 if (prefix_len < 0 || prefix_len > IPV6_ABITS) 2193 return (_B_FALSE); 2194 2195 for (j = 0; prefix_len > 8; prefix_len -= 8, j++) 2196 if (p1.s6_addr[j] != p2.s6_addr[j]) 2197 return (_B_FALSE); 2198 2199 /* Make the N leftmost bits one */ 2200 mask = 0xff << (8 - prefix_len); 2201 if ((p1.s6_addr[j] & mask) != (p2.s6_addr[j] & mask)) 2202 return (_B_FALSE); 2203 2204 return (_B_TRUE); 2205 } 2206 2207 /* 2208 * Get the number of UP logints (excluding IFF_NOFAILOVERs), on both 2209 * IPv4 and IPv6 put together. 
The phyint with the least such number 2210 * will be used as the failover destination, if no standby interface is 2211 * available 2212 */ 2213 int 2214 logint_upcount(struct phyint *pi) 2215 { 2216 struct logint *li; 2217 struct phyint_instance *pii; 2218 int count = 0; 2219 2220 pii = pi->pi_v4; 2221 if (pii != NULL) { 2222 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 2223 if ((li->li_flags & 2224 (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { 2225 count++; 2226 } 2227 } 2228 } 2229 2230 pii = pi->pi_v6; 2231 if (pii != NULL) { 2232 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 2233 if ((li->li_flags & 2234 (IFF_UP | IFF_NOFAILOVER)) == IFF_UP) { 2235 count++; 2236 } 2237 } 2238 } 2239 2240 return (count); 2241 } 2242 2243 /* 2244 * Get the phyint instance with the other (IPv4 / IPv6) protocol 2245 */ 2246 struct phyint_instance * 2247 phyint_inst_other(struct phyint_instance *pii) 2248 { 2249 if (pii->pii_af == AF_INET) 2250 return (pii->pii_phyint->pi_v6); 2251 else 2252 return (pii->pii_phyint->pi_v4); 2253 } 2254 2255 /* 2256 * Post an EC_IPMP sysevent of subclass `subclass' and attributes `nvl'. 2257 * Before sending the event, it prepends the current version of the IPMP 2258 * sysevent API. Returns 0 on success, -1 on failure (in either case, 2259 * `nvl' is freed). 
2260 */ 2261 static int 2262 post_event(const char *subclass, nvlist_t *nvl) 2263 { 2264 sysevent_id_t eid; 2265 2266 errno = nvlist_add_uint32(nvl, IPMP_EVENT_VERSION, 2267 IPMP_EVENT_CUR_VERSION); 2268 if (errno != 0) { 2269 logerr("cannot create `%s' event: %s", subclass, 2270 strerror(errno)); 2271 goto failed; 2272 } 2273 2274 if (sysevent_post_event(EC_IPMP, (char *)subclass, SUNW_VENDOR, 2275 "in.mpathd", nvl, &eid) == -1) { 2276 logerr("cannot send `%s' event: %s\n", subclass, 2277 strerror(errno)); 2278 goto failed; 2279 } 2280 2281 nvlist_free(nvl); 2282 return (0); 2283 failed: 2284 nvlist_free(nvl); 2285 return (-1); 2286 } 2287 2288 /* 2289 * Return the external IPMP state associated with phyint `pi'. 2290 */ 2291 static ipmp_if_state_t 2292 ifstate(struct phyint *pi) 2293 { 2294 switch (pi->pi_state) { 2295 case PI_NOTARGETS: 2296 return (IPMP_IF_UNKNOWN); 2297 2298 case PI_OFFLINE: 2299 return (IPMP_IF_OFFLINE); 2300 2301 case PI_FAILED: 2302 return (IPMP_IF_FAILED); 2303 2304 case PI_RUNNING: 2305 return (IPMP_IF_OK); 2306 } 2307 2308 logerr("ifstate: unknown state %d; aborting\n", pi->pi_state); 2309 abort(); 2310 /* NOTREACHED */ 2311 } 2312 2313 /* 2314 * Return the external IPMP interface type associated with phyint `pi'. 2315 */ 2316 static ipmp_if_type_t 2317 iftype(struct phyint *pi) 2318 { 2319 if (pi->pi_flags & IFF_STANDBY) 2320 return (IPMP_IF_STANDBY); 2321 else 2322 return (IPMP_IF_NORMAL); 2323 } 2324 2325 /* 2326 * Return the external IPMP group state associated with phyint group `pg'. 2327 */ 2328 static ipmp_group_state_t 2329 groupstate(struct phyint_group *pg) 2330 { 2331 return (GROUP_FAILED(pg) ? IPMP_GROUP_FAILED : IPMP_GROUP_OK); 2332 } 2333 2334 /* 2335 * Generate an ESC_IPMP_GROUP_STATE sysevent for phyint group `pg'. 2336 * Returns 0 on success, -1 on failure. 
2337 */ 2338 static int 2339 phyint_group_state_event(struct phyint_group *pg) 2340 { 2341 nvlist_t *nvl; 2342 2343 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2344 if (errno != 0) { 2345 logperror("cannot create `group state change' event"); 2346 return (-1); 2347 } 2348 2349 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2350 if (errno != 0) 2351 goto failed; 2352 2353 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2354 if (errno != 0) 2355 goto failed; 2356 2357 errno = nvlist_add_uint32(nvl, IPMP_GROUP_STATE, groupstate(pg)); 2358 if (errno != 0) 2359 goto failed; 2360 2361 return (post_event(ESC_IPMP_GROUP_STATE, nvl)); 2362 failed: 2363 logperror("cannot create `group state change' event"); 2364 nvlist_free(nvl); 2365 return (-1); 2366 } 2367 2368 /* 2369 * Generate an ESC_IPMP_GROUP_CHANGE sysevent of type `op' for phyint group 2370 * `pg'. Returns 0 on success, -1 on failure. 2371 */ 2372 static int 2373 phyint_group_change_event(struct phyint_group *pg, ipmp_group_op_t op) 2374 { 2375 nvlist_t *nvl; 2376 2377 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2378 if (errno != 0) { 2379 logperror("cannot create `group change' event"); 2380 return (-1); 2381 } 2382 2383 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2384 if (errno != 0) 2385 goto failed; 2386 2387 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2388 if (errno != 0) 2389 goto failed; 2390 2391 errno = nvlist_add_uint64(nvl, IPMP_GROUPLIST_SIGNATURE, 2392 phyint_grouplistsig); 2393 if (errno != 0) 2394 goto failed; 2395 2396 errno = nvlist_add_uint32(nvl, IPMP_GROUP_OPERATION, op); 2397 if (errno != 0) 2398 goto failed; 2399 2400 return (post_event(ESC_IPMP_GROUP_CHANGE, nvl)); 2401 failed: 2402 logperror("cannot create `group change' event"); 2403 nvlist_free(nvl); 2404 return (-1); 2405 } 2406 2407 /* 2408 * Generate an ESC_IPMP_GROUP_MEMBER_CHANGE sysevent for phyint `pi' in 2409 * group `pg'. 
* Returns 0 on success, -1 on failure.
 */
static int
phyint_group_member_event(struct phyint_group *pg, struct phyint *pi,
    ipmp_if_op_t op)
{
	nvlist_t	*nvl;

	if ((errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0)) != 0) {
		logperror("cannot create `group member change' event");
		return (-1);
	}

	/*
	 * Add the attributes in order; the chain stops at the first one
	 * that fails, leaving its error code in errno for logperror().
	 */
	if ((errno = nvlist_add_string(nvl, IPMP_GROUP_NAME,
	    pg->pg_name)) != 0 ||
	    (errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE,
	    pg->pg_sig)) != 0 ||
	    (errno = nvlist_add_uint32(nvl, IPMP_IF_OPERATION, op)) != 0 ||
	    (errno = nvlist_add_string(nvl, IPMP_IF_NAME,
	    pi->pi_name)) != 0 ||
	    (errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE,
	    iftype(pi))) != 0 ||
	    (errno = nvlist_add_uint32(nvl, IPMP_IF_STATE,
	    ifstate(pi))) != 0) {
		logperror("cannot create `group member change' event");
		nvlist_free(nvl);
		return (-1);
	}

	return (post_event(ESC_IPMP_GROUP_MEMBER_CHANGE, nvl));
}

/*
 * Generate an ESC_IPMP_IF_CHANGE sysevent for phyint `pi' in group `pg'.
 * Returns 0 on success, -1 on failure.
2458 */ 2459 static int 2460 phyint_state_event(struct phyint_group *pg, struct phyint *pi) 2461 { 2462 nvlist_t *nvl; 2463 2464 errno = nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0); 2465 if (errno != 0) { 2466 logperror("cannot create `interface change' event"); 2467 return (-1); 2468 } 2469 2470 errno = nvlist_add_string(nvl, IPMP_GROUP_NAME, pg->pg_name); 2471 if (errno != 0) 2472 goto failed; 2473 2474 errno = nvlist_add_uint64(nvl, IPMP_GROUP_SIGNATURE, pg->pg_sig); 2475 if (errno != 0) 2476 goto failed; 2477 2478 errno = nvlist_add_string(nvl, IPMP_IF_NAME, pi->pi_name); 2479 if (errno != 0) 2480 goto failed; 2481 2482 errno = nvlist_add_uint32(nvl, IPMP_IF_TYPE, iftype(pi)); 2483 if (errno != 0) 2484 goto failed; 2485 2486 errno = nvlist_add_uint32(nvl, IPMP_IF_STATE, ifstate(pi)); 2487 if (errno != 0) 2488 goto failed; 2489 2490 return (post_event(ESC_IPMP_IF_CHANGE, nvl)); 2491 failed: 2492 logperror("cannot create `interface change' event"); 2493 nvlist_free(nvl); 2494 return (-1); 2495 2496 } 2497 2498 /* 2499 * Generate a signature for use. The signature is conceptually divided 2500 * into two pieces: a random 16-bit "generation number" and a 48-bit 2501 * monotonically increasing integer. The generation number protects 2502 * against stale updates to entities (e.g., IPMP groups) that have been 2503 * deleted and since recreated. 2504 */ 2505 static uint64_t 2506 gensig(void) 2507 { 2508 static int seeded = 0; 2509 2510 if (seeded == 0) { 2511 srand48((long)gethrtime()); 2512 seeded++; 2513 } 2514 2515 return ((uint64_t)lrand48() << 48 | 1); 2516 } 2517 2518 /* 2519 * Store the information associated with group `grname' into a dynamically 2520 * allocated structure pointed to by `*grinfopp'. Returns an IPMP error code. 
2521 */ 2522 unsigned int 2523 getgroupinfo(const char *grname, ipmp_groupinfo_t **grinfopp) 2524 { 2525 struct phyint_group *pg; 2526 struct phyint *pi; 2527 char (*ifs)[LIFNAMSIZ]; 2528 unsigned int nif, i; 2529 2530 pg = phyint_group_lookup(grname); 2531 if (pg == NULL) 2532 return (IPMP_EUNKGROUP); 2533 2534 /* 2535 * Tally up the number of interfaces, allocate an array to hold them, 2536 * and insert their names into the array. 2537 */ 2538 for (nif = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) 2539 nif++; 2540 2541 ifs = alloca(nif * sizeof (*ifs)); 2542 for (i = 0, pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext, i++) { 2543 assert(i < nif); 2544 (void) strlcpy(ifs[i], pi->pi_name, LIFNAMSIZ); 2545 } 2546 assert(i == nif); 2547 2548 *grinfopp = ipmp_groupinfo_create(pg->pg_name, pg->pg_sig, 2549 groupstate(pg), nif, ifs); 2550 return (*grinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2551 } 2552 2553 /* 2554 * Store the information associated with interface `ifname' into a dynamically 2555 * allocated structure pointed to by `*ifinfopp'. Returns an IPMP error code. 2556 */ 2557 unsigned int 2558 getifinfo(const char *ifname, ipmp_ifinfo_t **ifinfopp) 2559 { 2560 struct phyint *pi; 2561 2562 pi = phyint_lookup(ifname); 2563 if (pi == NULL) 2564 return (IPMP_EUNKIF); 2565 2566 *ifinfopp = ipmp_ifinfo_create(pi->pi_name, pi->pi_group->pg_name, 2567 ifstate(pi), iftype(pi)); 2568 return (*ifinfopp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2569 } 2570 2571 /* 2572 * Store the current list of IPMP groups into a dynamically allocated 2573 * structure pointed to by `*grlistpp'. Returns an IPMP error code. 2574 */ 2575 unsigned int 2576 getgrouplist(ipmp_grouplist_t **grlistpp) 2577 { 2578 struct phyint_group *pg; 2579 char (*groups)[LIFGRNAMSIZ]; 2580 unsigned int i, ngroup; 2581 2582 /* 2583 * Tally up the number of groups, allocate an array to hold them, and 2584 * insert their names into the array. 
2585 */ 2586 for (ngroup = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next) 2587 ngroup++; 2588 2589 groups = alloca(ngroup * sizeof (*groups)); 2590 for (i = 0, pg = phyint_groups; pg != NULL; pg = pg->pg_next, i++) { 2591 assert(i < ngroup); 2592 (void) strlcpy(groups[i], pg->pg_name, LIFGRNAMSIZ); 2593 } 2594 assert(i == ngroup); 2595 2596 *grlistpp = ipmp_grouplist_create(phyint_grouplistsig, ngroup, groups); 2597 return (*grlistpp == NULL ? IPMP_ENOMEM : IPMP_SUCCESS); 2598 } 2599 2600 /* 2601 * Store a snapshot of the IPMP subsystem into a dynamically allocated 2602 * structure pointed to by `*snapp'. Returns an IPMP error code. 2603 */ 2604 unsigned int 2605 getsnap(ipmp_snap_t **snapp) 2606 { 2607 ipmp_grouplist_t *grlistp; 2608 ipmp_groupinfo_t *grinfop; 2609 ipmp_ifinfo_t *ifinfop; 2610 ipmp_snap_t *snap; 2611 struct phyint *pi; 2612 unsigned int i; 2613 int retval; 2614 2615 snap = ipmp_snap_create(); 2616 if (snap == NULL) 2617 return (IPMP_ENOMEM); 2618 2619 /* 2620 * Add group list. 2621 */ 2622 retval = getgrouplist(&snap->sn_grlistp); 2623 if (retval != IPMP_SUCCESS) { 2624 ipmp_snap_free(snap); 2625 return (retval); 2626 } 2627 2628 /* 2629 * Add information for each group in the list. 2630 */ 2631 grlistp = snap->sn_grlistp; 2632 for (i = 0; i < grlistp->gl_ngroup; i++) { 2633 retval = getgroupinfo(grlistp->gl_groups[i], &grinfop); 2634 if (retval != IPMP_SUCCESS) { 2635 ipmp_snap_free(snap); 2636 return (retval); 2637 } 2638 retval = ipmp_snap_addgroupinfo(snap, grinfop); 2639 if (retval != IPMP_SUCCESS) { 2640 ipmp_freegroupinfo(grinfop); 2641 ipmp_snap_free(snap); 2642 return (retval); 2643 } 2644 } 2645 2646 /* 2647 * Add information for each configured phyint. 
2648 */ 2649 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2650 retval = getifinfo(pi->pi_name, &ifinfop); 2651 if (retval != IPMP_SUCCESS) { 2652 ipmp_snap_free(snap); 2653 return (retval); 2654 } 2655 retval = ipmp_snap_addifinfo(snap, ifinfop); 2656 if (retval != IPMP_SUCCESS) { 2657 ipmp_freeifinfo(ifinfop); 2658 ipmp_snap_free(snap); 2659 return (retval); 2660 } 2661 } 2662 2663 *snapp = snap; 2664 return (IPMP_SUCCESS); 2665 } 2666