1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "mpd_defs.h" 29 #include "mpd_tables.h" 30 31 int debug = 0; /* Debug flag */ 32 static int pollfd_num = 0; /* Num. 
of poll descriptors */ 33 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 34 35 /* All times below in ms */ 36 int user_failure_detection_time; /* user specified failure detection */ 37 /* time (fdt) */ 38 int user_probe_interval; /* derived from user specified fdt */ 39 40 static int rtsock_v4; /* AF_INET routing socket */ 41 static int rtsock_v6; /* AF_INET6 routing socket */ 42 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 43 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 44 static int lsock_v4; /* Listen socket to detect mpathd */ 45 static int lsock_v6; /* Listen socket to detect mpathd */ 46 static int mibfd = -1; /* fd to get mib info */ 47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 48 49 boolean_t full_scan_required = _B_FALSE; 50 static uint_t last_initifs_time; /* Time when initifs was last run */ 51 static char **argv0; /* Saved for re-exec on SIGHUP */ 52 boolean_t handle_link_notifications = _B_TRUE; 53 54 static void initlog(void); 55 static void run_timeouts(void); 56 static void initifs(void); 57 static void check_if_removed(struct phyint_instance *pii); 58 static void select_test_ifs(void); 59 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 60 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 61 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 62 struct in_addr nexthop_v4); 63 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 64 struct in6_addr nexthop_v6); 65 static void router_add_common(int af, char *ifname, 66 struct in6_addr nexthop); 67 static void init_router_targets(); 68 static void cleanup(void); 69 static int setup_listener(int af); 70 static void check_config(void); 71 static void check_addr_unique(int af, char *name); 72 static void init_host_targets(void); 73 static void dup_host_targets(struct phyint_instance *desired_pii); 74 static void loopback_cmd(int sock, int family); 75 static int poll_remove(int fd); 76 static boolean_t 
daemonize(void); 77 static int closefunc(void *, int); 78 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 79 static unsigned int process_query(int fd, mi_query_t *miq); 80 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 81 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 82 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 83 static unsigned int send_result(int fd, unsigned int error, int syserror); 84 85 struct local_addr *laddr_list = NULL; 86 87 /* 88 * Return the current time in milliseconds (from an arbitrary reference) 89 * truncated to fit into an int. Truncation is ok since we are interested 90 * only in differences and not the absolute values. 91 */ 92 uint_t 93 getcurrenttime(void) 94 { 95 uint_t cur_time; /* In ms */ 96 97 /* 98 * Use of a non-user-adjustable source of time is 99 * required. However millisecond precision is sufficient. 100 * divide by 10^6 101 */ 102 cur_time = (uint_t)(gethrtime() / 1000000LL); 103 return (cur_time); 104 } 105 106 /* 107 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 
108 */ 109 int 110 poll_add(int fd) 111 { 112 int i; 113 int new_num; 114 struct pollfd *newfds; 115 retry: 116 /* Check if already present */ 117 for (i = 0; i < pollfd_num; i++) { 118 if (pollfds[i].fd == fd) 119 return (0); 120 } 121 /* Check for empty spot already present */ 122 for (i = 0; i < pollfd_num; i++) { 123 if (pollfds[i].fd == -1) { 124 pollfds[i].fd = fd; 125 return (0); 126 } 127 } 128 129 /* Allocate space for 32 more fds and initialize to -1 */ 130 new_num = pollfd_num + 32; 131 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 132 if (newfds == NULL) { 133 logperror("poll_add: realloc"); 134 return (-1); 135 } 136 for (i = pollfd_num; i < new_num; i++) { 137 newfds[i].fd = -1; 138 newfds[i].events = POLLIN; 139 } 140 pollfd_num = new_num; 141 pollfds = newfds; 142 goto retry; 143 } 144 145 /* 146 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 147 */ 148 static int 149 poll_remove(int fd) 150 { 151 int i; 152 153 /* Check if already present */ 154 for (i = 0; i < pollfd_num; i++) { 155 if (pollfds[i].fd == fd) { 156 pollfds[i].fd = -1; 157 return (0); 158 } 159 } 160 return (-1); 161 } 162 163 /* 164 * Extract information about the phyint instance. If the phyint instance still 165 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 166 * will use it to detect phyint instances that don't exist any longer and 167 * remove them, from our database of phyint instances. 
168 * Return value: 169 * returns true if the phyint instance exists in the kernel, 170 * returns false otherwise 171 */ 172 static boolean_t 173 pii_process(int af, char *name, struct phyint_instance **pii_p) 174 { 175 int err; 176 struct phyint_instance *pii; 177 struct phyint_instance *pii_other; 178 179 if (debug & D_PHYINT) 180 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 181 182 pii = phyint_inst_lookup(af, name); 183 if (pii == NULL) { 184 /* 185 * Phyint instance does not exist in our tables, 186 * create new phyint instance 187 */ 188 pii = phyint_inst_init_from_k(af, name); 189 } else { 190 /* Phyint exists in our tables */ 191 err = phyint_inst_update_from_k(pii); 192 193 switch (err) { 194 case PI_IOCTL_ERROR: 195 /* Some ioctl error. don't change anything */ 196 pii->pii_in_use = 1; 197 break; 198 199 case PI_GROUP_CHANGED: 200 /* 201 * The phyint has changed group. 202 */ 203 restore_phyint(pii->pii_phyint); 204 /* FALLTHRU */ 205 206 case PI_IFINDEX_CHANGED: 207 /* 208 * Interface index has changed. Delete and 209 * recreate the phyint as it is quite likely 210 * the interface has been unplumbed and replumbed. 211 */ 212 pii_other = phyint_inst_other(pii); 213 if (pii_other != NULL) 214 phyint_inst_delete(pii_other); 215 phyint_inst_delete(pii); 216 pii = phyint_inst_init_from_k(af, name); 217 break; 218 219 case PI_DELETED: 220 /* Phyint instance has disappeared from kernel */ 221 pii->pii_in_use = 0; 222 break; 223 224 case PI_OK: 225 /* Phyint instance exists and is fine */ 226 pii->pii_in_use = 1; 227 break; 228 229 default: 230 /* Unknown status */ 231 logerr("pii_process: Unknown status %d\n", err); 232 break; 233 } 234 } 235 236 *pii_p = pii; 237 if (pii != NULL) 238 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 239 else 240 return (_B_FALSE); 241 } 242 243 /* 244 * This phyint is leaving the group. Try to restore the phyint to its 245 * initial state. 
Return the addresses that belong to other group members, 246 * to the group, and take back any addresses owned by this phyint 247 */ 248 void 249 restore_phyint(struct phyint *pi) 250 { 251 if (pi->pi_group == phyint_anongroup) 252 return; 253 254 /* 255 * Move everthing to some other member in the group. 256 * The phyint has changed group in the kernel. But we 257 * have yet to do it in our tables. 258 */ 259 if (!pi->pi_empty) 260 (void) try_failover(pi, FAILOVER_TO_ANY); 261 /* 262 * Move all addresses owned by 'pi' back to pi, from each 263 * of the other members of the group 264 */ 265 (void) try_failback(pi, _B_FALSE); 266 } 267 268 /* 269 * Scan all interfaces to detect changes as well as new and deleted interfaces 270 */ 271 static void 272 initifs() 273 { 274 int n; 275 int af; 276 char *cp; 277 char *buf; 278 int numifs; 279 struct lifnum lifn; 280 struct lifconf lifc; 281 struct lifreq *lifr; 282 struct logint *li; 283 struct phyint_instance *pii; 284 struct phyint_instance *next_pii; 285 char pi_name[LIFNAMSIZ + 1]; 286 boolean_t exists; 287 struct phyint *pi; 288 struct local_addr *next; 289 290 if (debug & D_PHYINT) 291 logdebug("initifs: Scanning interfaces\n"); 292 293 last_initifs_time = getcurrenttime(); 294 295 /* 296 * Free the laddr_list before collecting the local addresses. 297 */ 298 while (laddr_list != NULL) { 299 next = laddr_list->next; 300 free(laddr_list); 301 laddr_list = next; 302 } 303 304 /* 305 * Mark the interfaces so that we can find phyints and logints 306 * which have disappeared from the kernel. pii_process() and 307 * logint_init_from_k() will set {pii,li}_in_use when they find 308 * the interface in the kernel. Also, clear dupaddr bit on probe 309 * logint. check_addr_unique() will set the dupaddr bit on the 310 * probe logint, if the testaddress is not unique. 
311 */ 312 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 313 pii->pii_in_use = 0; 314 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 315 li->li_in_use = 0; 316 if (pii->pii_probe_logint == li) 317 li->li_dupaddr = 0; 318 } 319 } 320 321 lifn.lifn_family = AF_UNSPEC; 322 lifn.lifn_flags = LIFC_ALLZONES; 323 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 324 logperror("initifs: ioctl (get interface numbers)"); 325 return; 326 } 327 numifs = lifn.lifn_count; 328 329 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 330 if (buf == NULL) { 331 logperror("initifs: calloc"); 332 return; 333 } 334 335 lifc.lifc_family = AF_UNSPEC; 336 lifc.lifc_flags = LIFC_ALLZONES; 337 lifc.lifc_len = numifs * sizeof (struct lifreq); 338 lifc.lifc_buf = buf; 339 340 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 341 /* 342 * EINVAL is commonly encountered, when things change 343 * underneath us rapidly, (eg. at boot, when new interfaces 344 * are plumbed successively) and the kernel finds the buffer 345 * size we passed as too small. We will retry again 346 * when we see the next routing socket msg, or at worst after 347 * IF_SCAN_INTERVAL ms. 348 */ 349 if (errno != EINVAL) { 350 logperror("initifs: ioctl" 351 " (get interface configuration)"); 352 } 353 free(buf); 354 return; 355 } 356 357 lifr = (struct lifreq *)lifc.lifc_req; 358 359 /* 360 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 361 * and get the state of the corresponding phyint_instance. If it is 362 * successful, then call logint_init_from_k() to get the state of the 363 * logint. 364 */ 365 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 366 int sockfd; 367 struct local_addr *taddr; 368 struct sockaddr_in *sin; 369 struct sockaddr_in6 *sin6; 370 struct lifreq lifreq; 371 372 af = lifr->lifr_addr.ss_family; 373 374 /* 375 * Collect all local addresses. 376 */ 377 sockfd = (af == AF_INET) ? 
ifsock_v4 : ifsock_v6; 378 (void) memset(&lifreq, 0, sizeof (lifreq)); 379 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, 380 sizeof (lifreq.lifr_name)); 381 382 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 383 if (errno != ENXIO) 384 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 385 continue; 386 } 387 388 /* 389 * Add the interface address to laddr_list. 390 * Another node might have the same IP address which is up. 391 * In that case, it is appropriate to use the address as a 392 * target, even though it is also configured (but not up) on 393 * the local system. 394 * Hence,the interface address is not added to laddr_list 395 * unless it is IFF_UP. 396 */ 397 if (lifreq.lifr_flags & IFF_UP) { 398 taddr = malloc(sizeof (struct local_addr)); 399 if (taddr == NULL) { 400 logperror("initifs: malloc"); 401 continue; 402 } 403 if (af == AF_INET) { 404 sin = (struct sockaddr_in *)&lifr->lifr_addr; 405 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 406 &taddr->addr); 407 } else { 408 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; 409 taddr->addr = sin6->sin6_addr; 410 } 411 taddr->next = laddr_list; 412 laddr_list = taddr; 413 } 414 415 /* 416 * Need to pass a phyint name to pii_process. Insert the 417 * null where the ':' IF_SEPARATOR is found in the logical 418 * name. 419 */ 420 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 421 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 422 *cp = '\0'; 423 424 exists = pii_process(af, pi_name, &pii); 425 if (exists) { 426 /* The phyint is fine. 
So process the logint */ 427 logint_init_from_k(pii, lifr->lifr_name); 428 } 429 check_addr_unique(af, lifr->lifr_name); 430 } 431 432 free(buf); 433 434 /* 435 * If the test address is now unique, and if it was not unique 436 * previously, clear the li_dupaddrmsg_printed flag and log a 437 * recovery message 438 */ 439 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 440 struct logint *li; 441 char abuf[INET6_ADDRSTRLEN]; 442 443 li = pii->pii_probe_logint; 444 if ((li != NULL) && !li->li_dupaddr && 445 li->li_dupaddrmsg_printed) { 446 logerr("Test address %s is unique; enabling probe-" 447 "based failure detection\n", 448 pr_addr(pii->pii_af, li->li_addr, abuf, 449 sizeof (abuf))); 450 li->li_dupaddrmsg_printed = 0; 451 } 452 } 453 454 /* 455 * Scan for phyints and logints that have disappeared from the 456 * kernel, and delete them. 457 */ 458 pii = phyint_instances; 459 460 while (pii != NULL) { 461 next_pii = pii->pii_next; 462 check_if_removed(pii); 463 pii = next_pii; 464 } 465 466 /* 467 * Select a test address for sending probes on each phyint instance 468 */ 469 select_test_ifs(); 470 471 /* 472 * Handle link up/down notifications from the NICs. 473 */ 474 process_link_state_changes(); 475 476 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 477 /* 478 * If this is a case of group failure, we don't have much 479 * to do until the group recovers again. 480 */ 481 if (GROUP_FAILED(pi->pi_group)) 482 continue; 483 484 /* 485 * Try/Retry any pending failovers / failbacks, that did not 486 * not complete, or that could not be initiated previously. 
487 * This implements the 3 invariants described in the big block 488 * comment at the beginning of probe.c 489 */ 490 if (pi->pi_flags & IFF_INACTIVE) { 491 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 492 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 493 } else { 494 struct phyint_instance *pii; 495 496 /* 497 * Skip interfaces which are not capable of probing, 498 * and interfaces that have downed links (as we will 499 * not get any response). 500 */ 501 if (LINK_DOWN(pi)) 502 continue; 503 504 pii = pi->pi_v4; 505 if (!PROBE_CAPABLE(pii)) { 506 pii = pi->pi_v6; 507 if (!PROBE_CAPABLE(pii)) 508 continue; 509 } 510 511 /* 512 * It is possible that the phyint has started 513 * receiving packets, after it has been marked 514 * PI_FAILED. Don't initiate failover, if the 515 * phyint has started recovering. failure_state() 516 * captures this check. A similar logic is used 517 * for failback/repair case. 518 */ 519 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 520 (failure_state(pii) == PHYINT_FAILURE)) { 521 (void) try_failover(pi, FAILOVER_NORMAL); 522 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 523 if (try_failback(pi, _B_FALSE) != 524 IPMP_FAILURE) { 525 (void) change_lif_flags(pi, IFF_FAILED, 526 _B_FALSE); 527 /* Per state diagram */ 528 pi->pi_empty = 0; 529 } 530 } 531 } 532 } 533 } 534 535 /* 536 * Check that test/probe addresses are always unique. link-locals and 537 * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER) 538 * address can produce unexpected results. Log an error and alert the user. 539 */ 540 static void 541 check_addr_unique(int af, char *name) 542 { 543 struct lifreq lifr; 544 struct phyint *pi; 545 struct in6_addr addr; 546 struct phyint_instance *pii; 547 struct sockaddr_in *sin; 548 struct sockaddr_in6 *sin6; 549 int ifsock; 550 char abuf[INET6_ADDRSTRLEN]; 551 552 /* Get the socket for doing ioctls */ 553 ifsock = (af == AF_INET) ? 
ifsock_v4 : ifsock_v6; 554 555 (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); 556 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 557 /* 558 * Get the address corresponding to 'name'. We cannot 559 * do a logint lookup in our tables, because, not all logints 560 * in the system are tracked by mpathd. (eg. things not in a group) 561 */ 562 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 563 if (errno == ENXIO) { 564 /* Interface has vanished */ 565 return; 566 } else { 567 logperror("ioctl (get addr)"); 568 return; 569 } 570 } 571 572 if (af == AF_INET) { 573 sin = (struct sockaddr_in *)&lifr.lifr_addr; 574 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 575 } else { 576 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 577 addr = sin6->sin6_addr; 578 } 579 580 /* 581 * Does the address 'addr' match any known test address ? If so 582 * it is a duplicate, unless we are looking at the same logint 583 */ 584 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 585 pii = PHYINT_INSTANCE(pi, af); 586 if (pii == NULL || pii->pii_probe_logint == NULL) 587 continue; 588 589 if (!IN6_ARE_ADDR_EQUAL(&addr, 590 &pii->pii_probe_logint->li_addr)) { 591 continue; 592 } 593 594 if (strncmp(pii->pii_probe_logint->li_name, name, 595 sizeof (pii->pii_probe_logint->li_name)) == 0) { 596 continue; 597 } 598 599 /* 600 * This test address is not unique. Set the dupaddr bit 601 */ 602 pii->pii_probe_logint->li_dupaddr = 1; 603 604 /* 605 * Log an error message if not already logged 606 */ 607 if (pii->pii_probe_logint->li_dupaddrmsg_printed) 608 continue; 609 610 logerr("Test address %s is not unique; disabling " 611 "probe-based failure detection\n", 612 pr_addr(af, addr, abuf, sizeof (abuf))); 613 614 pii->pii_probe_logint->li_dupaddrmsg_printed = 1; 615 } 616 } 617 618 /* 619 * Stop probing an interface. Called when an interface is offlined. 620 * The probe socket is closed on each interface instance, and the 621 * interface state set to PI_OFFLINE. 
622 */ 623 static void 624 stop_probing(struct phyint *pi) 625 { 626 struct phyint_instance *pii; 627 628 pii = pi->pi_v4; 629 if (pii != NULL) { 630 if (pii->pii_probe_sock != -1) 631 close_probe_socket(pii, _B_TRUE); 632 pii->pii_probe_logint = NULL; 633 } 634 635 pii = pi->pi_v6; 636 if (pii != NULL) { 637 if (pii->pii_probe_sock != -1) 638 close_probe_socket(pii, _B_TRUE); 639 pii->pii_probe_logint = NULL; 640 } 641 642 phyint_chstate(pi, PI_OFFLINE); 643 } 644 645 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 646 647 /* 648 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 649 * IFF_UP must also be set so that the associated address can be used as a 650 * source address. Further, we must be able to exchange packets with local 651 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 652 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 653 */ 654 static int 655 rate_testflags(uint64_t flags) 656 { 657 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 658 return (BAD_TESTFLAGS); 659 660 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 661 return (BAD_TESTFLAGS); 662 663 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 664 return (BEST_TESTFLAGS); 665 666 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 667 return (BEST_TESTFLAGS); 668 669 return (OK_TESTFLAGS); 670 } 671 672 /* 673 * Attempt to select a test address for each phyint instance. 674 * Call phyint_inst_sockinit() to complete the initializations. 
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint		*probe_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;
	int			rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * The successor is fetched up front because the instance may be
	 * deleted further down (when phyint_inst_sockinit() fails).
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			/* pii must not be touched after this point */
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface the target is the
		 * destination address of the new test logint.  Any existing
		 * target is dropped first; the asserts require the target
		 * list to be empty before the new one is created.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, probe_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}

/*
 * Check phyint group configuration, to detect any inconsistencies,
 * and log an error message. This is called from run_timeouts() each time
 * it performs the periodic initifs() rescan. The error message is
 * displayed only once. If the inconsistency is resolved by the admin, a
 * recovery message is displayed once.
 */
static void
check_config(void)
{
	struct phyint_group *pg;
	struct phyint *pi;
	boolean_t v4_in_group;
	boolean_t v6_in_group;

	/*
	 * All phyints of a group must be homogeneous to ensure that
	 * failover or failback can be done. If any phyint in a group
	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
	 * Do a similar check for IPv6.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		/* The anonymous group is exempt from this check */
		if (pg == phyint_anongroup)
			continue;

		v4_in_group = _B_FALSE;
		v6_in_group = _B_FALSE;
		/*
		 * 1st pass. Determine if at least 1 phyint in the group
		 * has IPv4 plumbed and if so set v4_in_group to true.
		 * Repeat similarly for IPv6.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_v4 != NULL)
				v4_in_group = _B_TRUE;
			if (pi->pi_v6 != NULL)
				v6_in_group = _B_TRUE;
		}

		/*
		 * 2nd pass. If v4_in_group is true, check that phyint
		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
		 * out a message the 1st time only.  Offline phyints are
		 * skipped.  A phyint missing both families is reported
		 * for IPv4 only, since the branches are exclusive.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_flags & IFF_OFFLINE)
				continue;

			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv4 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else if (v6_in_group == _B_TRUE &&
			    pi->pi_v6 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv6 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else {
				/*
				 * The phyint matches the group configuration,
				 * if we have reached this point. If it was
				 * improperly configured earlier, log an
				 * error recovery message
				 */
				if (pi->pi_cfgmsg_printed) {
					logerr("NIC %s is now consistent with "
					    "group %s and failover capability "
					    "is restored\n", pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 0;
				}
			}

		}
	}

	/*
	 * In order to perform probe-based failure detection, a phyint must
	 * have at least 1 test/probe address for sending and receiving probes
	 * (either on IPv4 or IPv6 instance or both). If no test address has
	 * been configured, notify the administrator, but continue on since we
	 * can still perform load spreading, along with "link up/down" based
	 * failure detection.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_flags & IFF_OFFLINE)
			continue;

		if ((pi->pi_v4 == NULL ||
		    pi->pi_v4->pii_probe_logint == NULL) &&
		    (pi->pi_v6 == NULL ||
		    pi->pi_v6->pii_probe_logint == NULL)) {
			/* Neither instance has a test address */
			if (!pi->pi_taddrmsg_printed) {
				logerr("No test address configured on "
				    "interface %s; disabling probe-based "
				    "failure detection on it\n", pi->pi_name);
				pi->pi_taddrmsg_printed = 1;
			}
		} else if (pi->pi_taddrmsg_printed) {
			/* A test address has appeared since the last report */
			logerr("Test address now configured on interface %s; "
			    "enabling probe-based failure detection on it\n",
			    pi->pi_name);
			pi->pi_taddrmsg_printed = 0;
		}

	}
}

/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule()
 */
static uint_t timer_next;	/* Currently scheduled timeout */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */

/*
 * Initialize the timer state and arm the first timeout.
 */
static void
timer_init(void)
{
	timer_next = getcurrenttime() + TIMER_INFINITY;
	/*
	 * The call to run_timeouts() will get the timer started
	 * Since there are no phyints at this point, the timer will
	 * be set for IF_SCAN_INTERVAL ms.
	 */
	run_timeouts();
}

/*
 * Make sure the next SIGALRM occurs delay milliseconds from the current
 * time if not earlier.  We are interested only in time differences.
1024 */ 1025 void 1026 timer_schedule(uint_t delay) 1027 { 1028 uint_t now; 1029 struct itimerval itimerval; 1030 1031 if (debug & D_TIMER) 1032 logdebug("timer_schedule(%u)\n", delay); 1033 1034 assert(delay <= TIMER_INFINITY); 1035 1036 now = getcurrenttime(); 1037 if (delay == 0) { 1038 /* Minimum allowed delay */ 1039 delay = 1; 1040 } 1041 /* Will this timer occur before the currently scheduled SIGALRM? */ 1042 if (timer_active && TIME_GE(now + delay, timer_next)) { 1043 if (debug & D_TIMER) { 1044 logdebug("timer_schedule(%u) - no action: " 1045 "now %u next %u\n", delay, now, timer_next); 1046 } 1047 return; 1048 } 1049 timer_next = now + delay; 1050 1051 itimerval.it_value.tv_sec = delay / 1000; 1052 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1053 itimerval.it_interval.tv_sec = 0; 1054 itimerval.it_interval.tv_usec = 0; 1055 if (debug & D_TIMER) { 1056 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1057 delay, itimerval.it_value.tv_sec, 1058 itimerval.it_value.tv_usec); 1059 } 1060 timer_active = _B_TRUE; 1061 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1062 logperror("timer_schedule: setitimer"); 1063 exit(2); 1064 } 1065 } 1066 1067 /* 1068 * Timer has fired. Determine when the next timer event will occur by asking 1069 * all the timer routines. Should not be called from a timer routine. 1070 */ 1071 static void 1072 run_timeouts(void) 1073 { 1074 uint_t next; 1075 uint_t next_event_time; 1076 struct phyint_instance *pii; 1077 struct phyint_instance *next_pii; 1078 static boolean_t timeout_running; 1079 1080 /* assert that recursive timeouts don't happen. 
*/ 1081 assert(!timeout_running); 1082 1083 timeout_running = _B_TRUE; 1084 1085 if (debug & D_TIMER) 1086 logdebug("run_timeouts()\n"); 1087 1088 next = TIMER_INFINITY; 1089 1090 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1091 next_pii = pii->pii_next; 1092 next_event_time = phyint_inst_timer(pii); 1093 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1094 next = next_event_time; 1095 1096 if (debug & D_TIMER) { 1097 logdebug("run_timeouts(%s %s): next scheduled for" 1098 " this phyint inst %u, next scheduled global" 1099 " %u ms\n", 1100 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1101 next_event_time, next); 1102 } 1103 } 1104 1105 /* 1106 * Make sure initifs() is called at least once every 1107 * IF_SCAN_INTERVAL, to make sure that we are in sync 1108 * with the kernel, in case we have missed any routing 1109 * socket messages. 1110 */ 1111 if (next > IF_SCAN_INTERVAL) 1112 next = IF_SCAN_INTERVAL; 1113 1114 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1115 initifs(); 1116 check_config(); 1117 } 1118 1119 if (debug & D_TIMER) 1120 logdebug("run_timeouts: %u ms\n", next); 1121 1122 timer_schedule(next); 1123 timeout_running = _B_FALSE; 1124 } 1125 1126 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1127 static int eventpipe_write = -1; 1128 static boolean_t cleanup_started = _B_FALSE; 1129 /* Don't write to eventpipe if in cleanup */ 1130 /* 1131 * Ensure that signals are processed synchronously with the rest of 1132 * the code by just writing a one character signal number on the pipe. 1133 * The poll loop will pick this up and process the signal event. 1134 */ 1135 static void 1136 sig_handler(int signo) 1137 { 1138 uchar_t buf = (uchar_t)signo; 1139 1140 /* 1141 * Don't write to pipe if cleanup has already begun. 
cleanup() 1142 * might have closed the pipe already 1143 */ 1144 if (cleanup_started) 1145 return; 1146 1147 if (eventpipe_write == -1) { 1148 logerr("sig_handler: no pipe found\n"); 1149 return; 1150 } 1151 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1152 logperror("sig_handler: write"); 1153 } 1154 1155 extern struct probes_missed probes_missed; 1156 1157 /* 1158 * Pick up a signal "byte" from the pipe and process it. 1159 */ 1160 static void 1161 in_signal(int fd) 1162 { 1163 uchar_t buf; 1164 uint64_t sent, acked, lost, unacked, unknown; 1165 struct phyint_instance *pii; 1166 int pr_ndx; 1167 1168 switch (read(fd, &buf, sizeof (buf))) { 1169 case -1: 1170 logperror("in_signal: read"); 1171 exit(1); 1172 /* NOTREACHED */ 1173 case 1: 1174 break; 1175 case 0: 1176 logerr("in_signal: read end of file\n"); 1177 exit(1); 1178 /* NOTREACHED */ 1179 default: 1180 logerr("in_signal: read > 1\n"); 1181 exit(1); 1182 } 1183 1184 if (debug & D_TIMER) 1185 logdebug("in_signal() got %d\n", buf); 1186 1187 switch (buf) { 1188 case SIGALRM: 1189 if (debug & D_TIMER) { 1190 uint_t now = getcurrenttime(); 1191 1192 logdebug("in_signal(SIGALRM) delta %u\n", 1193 now - timer_next); 1194 } 1195 timer_active = _B_FALSE; 1196 run_timeouts(); 1197 break; 1198 case SIGUSR1: 1199 logdebug("Printing configuration:\n"); 1200 /* Print out the internal tables */ 1201 phyint_inst_print_all(); 1202 1203 /* 1204 * Print out the accumulated statistics about missed 1205 * probes (happens due to scheduling delay). 1206 */ 1207 logerr("Missed sending total of %d probes spread over" 1208 " %d occurrences\n", probes_missed.pm_nprobes, 1209 probes_missed.pm_ntimes); 1210 1211 /* 1212 * Print out the accumulated statistics about probes 1213 * that were sent. 
1214 */ 1215 for (pii = phyint_instances; pii != NULL; 1216 pii = pii->pii_next) { 1217 unacked = 0; 1218 acked = pii->pii_cum_stats.acked; 1219 lost = pii->pii_cum_stats.lost; 1220 sent = pii->pii_cum_stats.sent; 1221 unknown = pii->pii_cum_stats.unknown; 1222 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1223 switch (pii->pii_probes[pr_ndx].pr_status) { 1224 case PR_ACKED: 1225 acked++; 1226 break; 1227 case PR_LOST: 1228 lost++; 1229 break; 1230 case PR_UNACKED: 1231 unacked++; 1232 break; 1233 } 1234 } 1235 logerr("\nProbe stats on (%s %s)\n" 1236 "Number of probes sent %lld\n" 1237 "Number of probe acks received %lld\n" 1238 "Number of probes/acks lost %lld\n" 1239 "Number of valid unacknowled probes %lld\n" 1240 "Number of ambiguous probe acks received %lld\n", 1241 AF_STR(pii->pii_af), pii->pii_name, 1242 sent, acked, lost, unacked, unknown); 1243 } 1244 break; 1245 case SIGHUP: 1246 logerr("SIGHUP: restart and reread config file\n"); 1247 cleanup(); 1248 (void) execv(argv0[0], argv0); 1249 _exit(0177); 1250 /* NOTREACHED */ 1251 case SIGINT: 1252 case SIGTERM: 1253 case SIGQUIT: 1254 cleanup(); 1255 exit(0); 1256 /* NOTREACHED */ 1257 default: 1258 logerr("in_signal: unknown signal: %d\n", buf); 1259 } 1260 } 1261 1262 static void 1263 cleanup(void) 1264 { 1265 struct phyint_instance *pii; 1266 struct phyint_instance *next_pii; 1267 1268 /* 1269 * Make sure that we don't write to eventpipe in 1270 * sig_handler() if any signal notably SIGALRM, 1271 * occurs after we close the eventpipe descriptor below 1272 */ 1273 cleanup_started = _B_TRUE; 1274 1275 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1276 next_pii = pii->pii_next; 1277 phyint_inst_delete(pii); 1278 } 1279 1280 (void) close(ifsock_v4); 1281 (void) close(ifsock_v6); 1282 (void) close(rtsock_v4); 1283 (void) close(rtsock_v6); 1284 (void) close(lsock_v4); 1285 (void) close(lsock_v6); 1286 (void) close(0); 1287 (void) close(1); 1288 (void) close(2); 1289 (void) close(mibfd); 
1290 (void) close(eventpipe_read); 1291 (void) close(eventpipe_write); 1292 } 1293 1294 /* 1295 * Create pipe for signal delivery and set up signal handlers. 1296 */ 1297 static void 1298 setup_eventpipe(void) 1299 { 1300 int fds[2]; 1301 struct sigaction act; 1302 1303 if ((pipe(fds)) < 0) { 1304 logperror("setup_eventpipe: pipe"); 1305 exit(1); 1306 } 1307 eventpipe_read = fds[0]; 1308 eventpipe_write = fds[1]; 1309 if (poll_add(eventpipe_read) == -1) { 1310 exit(1); 1311 } 1312 1313 act.sa_handler = sig_handler; 1314 act.sa_flags = SA_RESTART; 1315 (void) sigaction(SIGALRM, &act, NULL); 1316 1317 (void) sigset(SIGHUP, sig_handler); 1318 (void) sigset(SIGUSR1, sig_handler); 1319 (void) sigset(SIGTERM, sig_handler); 1320 (void) sigset(SIGINT, sig_handler); 1321 (void) sigset(SIGQUIT, sig_handler); 1322 } 1323 1324 /* 1325 * Create a routing socket for receiving RTM_IFINFO messages. 1326 */ 1327 static int 1328 setup_rtsock(int af) 1329 { 1330 int s; 1331 int flags; 1332 1333 s = socket(PF_ROUTE, SOCK_RAW, af); 1334 if (s == -1) { 1335 logperror("setup_rtsock: socket PF_ROUTE"); 1336 exit(1); 1337 } 1338 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1339 logperror("setup_rtsock: fcntl F_GETFL"); 1340 (void) close(s); 1341 exit(1); 1342 } 1343 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1344 logperror("setup_rtsock: fcntl F_SETFL"); 1345 (void) close(s); 1346 exit(1); 1347 } 1348 if (poll_add(s) == -1) { 1349 (void) close(s); 1350 exit(1); 1351 } 1352 return (s); 1353 } 1354 1355 /* 1356 * Process an RTM_IFINFO message received on a routing socket. 1357 * The return value indicates whether a full interface scan is required. 1358 * Link up/down notifications from the NICs are reflected in the 1359 * IFF_RUNNING flag. 1360 * If just the state of the IFF_RUNNING interface flag has changed, a 1361 * a full interface scan isn't required. 
1362 */ 1363 static boolean_t 1364 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1365 { 1366 struct sockaddr_dl *sdl; 1367 struct phyint *pi; 1368 uint64_t old_flags; 1369 struct phyint_instance *pii; 1370 1371 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1372 1373 /* 1374 * Although the sockaddr_dl structure is directly after the 1375 * if_msghdr_t structure. At the time of writing, the size of the 1376 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1377 * to the presence of a timeval structure, which contains longs, 1378 * in the if_data structure. Anyway, we know where the message ends, 1379 * so we work backwards to get the start of the sockaddr_dl structure. 1380 */ 1381 /*LINTED*/ 1382 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1383 sizeof (struct sockaddr_dl)); 1384 1385 assert(sdl->sdl_family == AF_LINK); 1386 1387 /* 1388 * The interface name is in sdl_data. 1389 * RTM_IFINFO messages are only generated for logical interface 1390 * zero, so there is no colon and logical interface number to 1391 * strip from the name. The name is not null terminated, but 1392 * there should be enough space in sdl_data to add the null. 1393 */ 1394 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1395 if (debug & D_LINKNOTE) 1396 logdebug("process_rtm_ifinfo: " 1397 "phyint name too long\n"); 1398 return (_B_TRUE); 1399 } 1400 sdl->sdl_data[sdl->sdl_nlen] = 0; 1401 1402 pi = phyint_lookup(sdl->sdl_data); 1403 if (pi == NULL) { 1404 if (debug & D_LINKNOTE) 1405 logdebug("process_rtm_ifinfo: phyint lookup failed" 1406 " for %s\n", sdl->sdl_data); 1407 return (_B_TRUE); 1408 } 1409 1410 /* 1411 * We want to try and avoid doing a full interface scan for 1412 * link state notifications from the NICs, as indicated 1413 * by the state of the IFF_RUNNING flag. If just the 1414 * IFF_RUNNING flag has changed state, the link state changes 1415 * are processed without a full scan. 
1416 * If there is both an IPv4 and IPv6 instance associated with 1417 * the physical interface, we will get an RTM_IFINFO message 1418 * for each instance. If we just maintained a single copy of 1419 * the physical interface flags, it would appear that no flags 1420 * had changed when the second message is processed, leading us 1421 * to believe that the message wasn't generated by a flags change, 1422 * and that a full interface scan is required. 1423 * To get around this problem, two additional copies of the flags 1424 * are kept, one copy for each instance. These are only used in 1425 * this routine. At any one time, all three copies of the flags 1426 * should be identical except for the IFF_RUNNING flag. The 1427 * copy of the flags in the "phyint" structure is always up to 1428 * date. 1429 */ 1430 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1431 if (pii == NULL) { 1432 if (debug & D_LINKNOTE) 1433 logdebug("process_rtm_ifinfo: no instance of address " 1434 "family %s for %s\n", AF_STR(type), pi->pi_name); 1435 return (_B_TRUE); 1436 } 1437 1438 old_flags = pii->pii_flags; 1439 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1440 pi->pi_flags = pii->pii_flags; 1441 1442 if (debug & D_LINKNOTE) { 1443 logdebug("process_rtm_ifinfo: %s address family: %s, " 1444 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1445 AF_STR(type), old_flags, pi->pi_flags); 1446 } 1447 1448 /* 1449 * If IFF_STANDBY has changed, indicate that the interface has changed 1450 * types. 1451 */ 1452 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1453 phyint_newtype(pi); 1454 1455 /* 1456 * If IFF_INACTIVE has been set, then no data addresses should be 1457 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1458 * move previously failed-over addresses back to it, provided it is 1459 * not failed. For details, see the state diagram in mpd_probe.c. 
1460 */ 1461 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1462 if (pii->pii_flags & IFF_INACTIVE) { 1463 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 1464 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1465 } else { 1466 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1467 pi->pi_empty = 0; 1468 (void) try_failback(pi, _B_FALSE); 1469 } 1470 } 1471 } 1472 1473 /* Has just the IFF_RUNNING flag changed state ? */ 1474 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1475 struct phyint_instance *pii_other; 1476 /* 1477 * It wasn't just a link state change. Update 1478 * the other instance's copy of the flags. 1479 */ 1480 pii_other = phyint_inst_other(pii); 1481 if (pii_other != NULL) 1482 pii_other->pii_flags = pii->pii_flags; 1483 return (_B_TRUE); 1484 } 1485 1486 return (_B_FALSE); 1487 } 1488 1489 /* 1490 * Retrieve as many routing socket messages as possible, and try to 1491 * empty the routing sockets. Initiate full scan of targets or interfaces 1492 * as needed. 1493 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1494 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1495 */ 1496 static void 1497 process_rtsock(int rtsock_v4, int rtsock_v6) 1498 { 1499 int nbytes; 1500 int64_t msg[2048 / 8]; 1501 struct rt_msghdr *rtm; 1502 boolean_t need_if_scan = _B_FALSE; 1503 boolean_t need_rt_scan = _B_FALSE; 1504 boolean_t rtm_ifinfo_seen = _B_FALSE; 1505 int type; 1506 1507 /* Read as many messages as possible and try to empty the sockets */ 1508 for (type = AF_INET; ; type = AF_INET6) { 1509 for (;;) { 1510 nbytes = read((type == AF_INET) ? 
rtsock_v4 : 1511 rtsock_v6, msg, sizeof (msg)); 1512 if (nbytes <= 0) { 1513 /* No more messages */ 1514 break; 1515 } 1516 rtm = (struct rt_msghdr *)msg; 1517 if (rtm->rtm_version != RTM_VERSION) { 1518 logerr("process_rtsock: version %d " 1519 "not understood\n", rtm->rtm_version); 1520 break; 1521 } 1522 1523 if (debug & D_PHYINT) { 1524 logdebug("process_rtsock: message %d\n", 1525 rtm->rtm_type); 1526 } 1527 1528 switch (rtm->rtm_type) { 1529 case RTM_NEWADDR: 1530 case RTM_DELADDR: 1531 /* 1532 * Some logical interface has changed, 1533 * have to scan everything to determine 1534 * what actually changed. 1535 */ 1536 need_if_scan = _B_TRUE; 1537 break; 1538 1539 case RTM_IFINFO: 1540 rtm_ifinfo_seen = _B_TRUE; 1541 need_if_scan |= 1542 process_rtm_ifinfo((if_msghdr_t *)rtm, 1543 type); 1544 break; 1545 1546 case RTM_ADD: 1547 case RTM_DELETE: 1548 case RTM_CHANGE: 1549 case RTM_OLDADD: 1550 case RTM_OLDDEL: 1551 need_rt_scan = _B_TRUE; 1552 break; 1553 1554 default: 1555 /* Not interesting */ 1556 break; 1557 } 1558 } 1559 if (type == AF_INET6) 1560 break; 1561 } 1562 1563 if (need_if_scan) { 1564 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1565 logdebug("process_rtsock: synchronizing with kernel\n"); 1566 initifs(); 1567 } else if (rtm_ifinfo_seen) { 1568 if (debug & D_LINKNOTE) 1569 logdebug("process_rtsock: " 1570 "link up/down notification(s) seen\n"); 1571 process_link_state_changes(); 1572 } 1573 1574 if (need_rt_scan) 1575 init_router_targets(); 1576 } 1577 1578 /* 1579 * Look if the phyint instance or one of its logints have been removed from 1580 * the kernel and take appropriate action. 1581 * Uses {pii,li}_in_use. 1582 */ 1583 static void 1584 check_if_removed(struct phyint_instance *pii) 1585 { 1586 struct logint *li; 1587 struct logint *next_li; 1588 1589 /* Detect phyints that have been removed from the kernel. 
*/ 1590 if (!pii->pii_in_use) { 1591 logtrace("%s %s has been removed from kernel\n", 1592 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1593 phyint_inst_delete(pii); 1594 } else { 1595 /* Detect logints that have been removed. */ 1596 for (li = pii->pii_logint; li != NULL; li = next_li) { 1597 next_li = li->li_next; 1598 if (!li->li_in_use) { 1599 logint_delete(li); 1600 } 1601 } 1602 } 1603 } 1604 1605 /* 1606 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1607 * tables defined by mib2.h. Parse the returned data and extract 1608 * the 'routing' information table. Process the 'routing' table 1609 * to get the list of known onlink routers, and update our database. 1610 * These onlink routers will serve as our probe targets. 1611 * Returns false, if any system calls resulted in errors, true otherwise. 1612 */ 1613 static boolean_t 1614 update_router_list(int fd) 1615 { 1616 union { 1617 char ubuf[1024]; 1618 union T_primitives uprim; 1619 } buf; 1620 1621 int flags; 1622 struct strbuf ctlbuf; 1623 struct strbuf databuf; 1624 struct T_optmgmt_req *tor; 1625 struct T_optmgmt_ack *toa; 1626 struct T_error_ack *tea; 1627 struct opthdr *optp; 1628 struct opthdr *req; 1629 int status; 1630 t_scalar_t prim; 1631 1632 tor = (struct T_optmgmt_req *)&buf; 1633 1634 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1635 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1636 tor->OPT_length = sizeof (struct opthdr); 1637 tor->MGMT_flags = T_CURRENT; 1638 1639 req = (struct opthdr *)&tor[1]; 1640 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1641 req->name = 0; 1642 req->len = 0; 1643 1644 ctlbuf.buf = (char *)&buf; 1645 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1646 ctlbuf.maxlen = sizeof (buf); 1647 flags = 0; 1648 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1649 logperror("update_router_list: putmsg(ctl)"); 1650 return (_B_FALSE); 1651 } 1652 1653 /* 1654 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1655 * each table defined in 
mib2.h. Each T_OPTMGMT_ACK msg contains 1656 * a control and data part. The control part contains a struct 1657 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1658 * the level, name and length of the data in the data part. The 1659 * data part contains the actual table data. The last message 1660 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1661 * single option with zero optlen. 1662 */ 1663 1664 for (;;) { 1665 /* 1666 * Go around this loop once for each table. Ignore 1667 * all tables except the routing information table. 1668 */ 1669 flags = 0; 1670 status = getmsg(fd, &ctlbuf, NULL, &flags); 1671 if (status < 0) { 1672 if (errno == EINTR) 1673 continue; 1674 logperror("update_router_list: getmsg(ctl)"); 1675 return (_B_FALSE); 1676 } 1677 if (ctlbuf.len < sizeof (t_scalar_t)) { 1678 logerr("update_router_list: ctlbuf.len %d\n", 1679 ctlbuf.len); 1680 return (_B_FALSE); 1681 } 1682 1683 prim = buf.uprim.type; 1684 1685 switch (prim) { 1686 1687 case T_ERROR_ACK: 1688 tea = &buf.uprim.error_ack; 1689 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1690 logerr("update_router_list: T_ERROR_ACK" 1691 " ctlbuf.len %d\n", ctlbuf.len); 1692 return (_B_FALSE); 1693 } 1694 logerr("update_router_list: T_ERROR_ACK:" 1695 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1696 tea->TLI_error, tea->UNIX_error); 1697 return (_B_FALSE); 1698 1699 case T_OPTMGMT_ACK: 1700 toa = &buf.uprim.optmgmt_ack; 1701 optp = (struct opthdr *)&toa[1]; 1702 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1703 logerr("update_router_list: ctlbuf.len %d\n", 1704 ctlbuf.len); 1705 return (_B_FALSE); 1706 } 1707 if (toa->MGMT_flags != T_SUCCESS) { 1708 logerr("update_router_list: MGMT_flags 0x%lx\n", 1709 toa->MGMT_flags); 1710 return (_B_FALSE); 1711 } 1712 break; 1713 1714 default: 1715 logerr("update_router_list: unknown primitive %ld\n", 1716 prim); 1717 return (_B_FALSE); 1718 } 1719 1720 /* Process the T_OPGMGMT_ACK below */ 1721 assert(prim == 
T_OPTMGMT_ACK); 1722 1723 switch (status) { 1724 case 0: 1725 /* 1726 * We have reached the end of this T_OPTMGMT_ACK 1727 * message. If this is the last message i.e EOD, 1728 * return, else process the next T_OPTMGMT_ACK msg. 1729 */ 1730 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1731 sizeof (struct opthdr)) && optp->len == 0 && 1732 optp->name == 0 && optp->level == 0) { 1733 /* 1734 * This is the EOD message. Return 1735 */ 1736 return (_B_TRUE); 1737 } 1738 continue; 1739 1740 case MORECTL: 1741 case MORECTL | MOREDATA: 1742 /* 1743 * This should not happen. We should be able to read 1744 * the control portion in a single getmsg. 1745 */ 1746 logerr("update_router_list: MORECTL\n"); 1747 return (_B_FALSE); 1748 1749 case MOREDATA: 1750 databuf.maxlen = optp->len; 1751 /* malloc of 0 bytes is ok */ 1752 databuf.buf = malloc((size_t)optp->len); 1753 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1754 logperror("update_router_list: malloc"); 1755 return (_B_FALSE); 1756 } 1757 databuf.len = 0; 1758 flags = 0; 1759 for (;;) { 1760 status = getmsg(fd, NULL, &databuf, &flags); 1761 if (status >= 0) { 1762 break; 1763 } else if (errno == EINTR) { 1764 continue; 1765 } else { 1766 logperror("update_router_list:" 1767 " getmsg(data)"); 1768 free(databuf.buf); 1769 return (_B_FALSE); 1770 } 1771 } 1772 1773 if (optp->level == MIB2_IP && 1774 optp->name == MIB2_IP_ROUTE) { 1775 /* LINTED */ 1776 ire_process_v4((mib2_ipRouteEntry_t *) 1777 databuf.buf, databuf.len); 1778 } else if (optp->level == MIB2_IP6 && 1779 optp->name == MIB2_IP6_ROUTE) { 1780 /* LINTED */ 1781 ire_process_v6((mib2_ipv6RouteEntry_t *) 1782 databuf.buf, databuf.len); 1783 } 1784 free(databuf.buf); 1785 } 1786 } 1787 /* NOTREACHED */ 1788 } 1789 1790 /* 1791 * Examine the IPv4 routing table, for default routers. 
For each default 1792 * router, populate the list of targets of each phyint that is on the same 1793 * link as the default router 1794 */ 1795 static void 1796 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1797 { 1798 mib2_ipRouteEntry_t *rp; 1799 mib2_ipRouteEntry_t *rp1; 1800 struct in_addr nexthop_v4; 1801 mib2_ipRouteEntry_t *endp; 1802 1803 if (len == 0) 1804 return; 1805 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1806 1807 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1808 1809 /* 1810 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1811 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1812 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1813 * This is a potential target for probing, which we try to add 1814 * to the list of probe targets. 1815 */ 1816 for (rp = buf; rp < endp; rp++) { 1817 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1818 continue; 1819 1820 /* Get the nexthop address. */ 1821 nexthop_v4.s_addr = rp->ipRouteNextHop; 1822 1823 /* 1824 * Get the nexthop address. Then determine the outgoing 1825 * interface, by examining all interface IREs, and picking the 1826 * match. We don't look at the interface specified in the route 1827 * because we need to add the router target on all matching 1828 * interfaces anyway; the goal is to avoid falling back to 1829 * multicast when some interfaces are in the same subnet but 1830 * not in the same group. 1831 */ 1832 for (rp1 = buf; rp1 < endp; rp1++) { 1833 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1834 continue; 1835 } 1836 1837 /* 1838 * Determine the interface IRE that matches the nexthop. 1839 * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) 1840 */ 1841 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1842 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1843 /* 1844 * We found the interface ire 1845 */ 1846 router_add_v4(rp1, nexthop_v4); 1847 } 1848 } 1849 } 1850 } 1851 1852 void 1853 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1854 { 1855 char *cp; 1856 char ifname[LIFNAMSIZ + 1]; 1857 struct in6_addr nexthop; 1858 int len; 1859 1860 if (debug & D_TARGET) 1861 logdebug("router_add_v4()\n"); 1862 1863 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1864 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1865 ifname[len] = '\0'; 1866 1867 if (ifname[0] == '\0') 1868 return; 1869 1870 cp = strchr(ifname, IF_SEPARATOR); 1871 if (cp != NULL) 1872 *cp = '\0'; 1873 1874 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1875 router_add_common(AF_INET, ifname, nexthop); 1876 } 1877 1878 void 1879 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1880 { 1881 struct phyint_instance *pii; 1882 struct phyint *pi; 1883 1884 if (debug & D_TARGET) 1885 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1886 1887 /* 1888 * Retrieve the phyint instance; bail if it's not known to us yet. 1889 */ 1890 pii = phyint_inst_lookup(af, ifname); 1891 if (pii == NULL) 1892 return; 1893 1894 /* 1895 * Don't use our own addresses as targets. 1896 */ 1897 if (own_address(nexthop)) 1898 return; 1899 1900 /* 1901 * If the phyint is part a named group, then add the address to all 1902 * members of the group; note that this is suboptimal in the IPv4 case 1903 * as it has already been added to all matching interfaces in 1904 * ire_process_v4(). Otherwise, add the address only to the phyint 1905 * itself, since other phyints in the anongroup may not be on the same 1906 * subnet. 
1907 */ 1908 pi = pii->pii_phyint; 1909 if (pi->pi_group == phyint_anongroup) { 1910 target_add(pii, nexthop, _B_TRUE); 1911 } else { 1912 pi = pi->pi_group->pg_phyint; 1913 for (; pi != NULL; pi = pi->pi_pgnext) 1914 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1915 } 1916 } 1917 1918 /* 1919 * Examine the IPv6 routing table, for default routers. For each default 1920 * router, populate the list of targets of each phyint that is on the same 1921 * link as the default router 1922 */ 1923 static void 1924 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1925 { 1926 mib2_ipv6RouteEntry_t *rp; 1927 mib2_ipv6RouteEntry_t *endp; 1928 struct in6_addr nexthop_v6; 1929 1930 if (debug & D_TARGET) 1931 logdebug("ire_process_v6(len %d)\n", len); 1932 1933 if (len == 0) 1934 return; 1935 1936 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1937 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1938 1939 /* 1940 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1941 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1942 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1943 * This is a potential target for probing, which we try to add 1944 * to the list of probe targets. 1945 */ 1946 for (rp = buf; rp < endp; rp++) { 1947 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1948 continue; 1949 1950 /* 1951 * We have the outgoing interface in ipv6RouteIfIndex 1952 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1953 * interface must be present for link-local addresses. Since 1954 * we use only link-local addreses for probing, we don't 1955 * consider the case when the outgoing interface is not 1956 * known and we need to scan interface ires 1957 */ 1958 nexthop_v6 = rp->ipv6RouteNextHop; 1959 if (rp->ipv6RouteIfIndex.o_length != 0) { 1960 /* 1961 * We already have the outgoing interface 1962 * in ipv6RouteIfIndex. 
1963 */ 1964 router_add_v6(rp, nexthop_v6); 1965 } 1966 } 1967 } 1968 1969 1970 void 1971 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1972 { 1973 char ifname[LIFNAMSIZ + 1]; 1974 char *cp; 1975 int len; 1976 1977 if (debug & D_TARGET) 1978 logdebug("router_add_v6()\n"); 1979 1980 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1981 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1982 ifname[len] = '\0'; 1983 1984 if (ifname[0] == '\0') 1985 return; 1986 1987 cp = strchr(ifname, IF_SEPARATOR); 1988 if (cp != NULL) 1989 *cp = '\0'; 1990 1991 router_add_common(AF_INET6, ifname, nexthop_v6); 1992 } 1993 1994 1995 1996 /* 1997 * Build a list of target routers, by scanning the routing tables. 1998 * It is assumed that interface routes exist, to reach the routers. 1999 */ 2000 static void 2001 init_router_targets(void) 2002 { 2003 struct target *tg; 2004 struct target *next_tg; 2005 struct phyint_instance *pii; 2006 struct phyint *pi; 2007 2008 if (force_mcast) 2009 return; 2010 2011 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2012 pi = pii->pii_phyint; 2013 /* 2014 * Exclude ptp and host targets. Set tg_in_use to false, 2015 * only for router targets. 
2016 */ 2017 if (!pii->pii_targets_are_routers || 2018 (pi->pi_flags & IFF_POINTOPOINT)) 2019 continue; 2020 2021 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 2022 tg->tg_in_use = 0; 2023 } 2024 2025 if (mibfd < 0) { 2026 mibfd = open("/dev/ip", O_RDWR); 2027 if (mibfd < 0) { 2028 logperror("mibopen: ip open"); 2029 exit(1); 2030 } 2031 } 2032 2033 if (!update_router_list(mibfd)) { 2034 (void) close(mibfd); 2035 mibfd = -1; 2036 } 2037 2038 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2039 if (!pii->pii_targets_are_routers || 2040 (pi->pi_flags & IFF_POINTOPOINT)) 2041 continue; 2042 2043 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 2044 next_tg = tg->tg_next; 2045 if (!tg->tg_in_use) { 2046 target_delete(tg); 2047 } 2048 } 2049 } 2050 } 2051 2052 /* 2053 * Attempt to assign host targets to any interfaces that do not currently 2054 * have probe targets by sharing targets with other interfaces in the group. 2055 */ 2056 static void 2057 init_host_targets(void) 2058 { 2059 struct phyint_instance *pii; 2060 struct phyint_group *pg; 2061 2062 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2063 pg = pii->pii_phyint->pi_group; 2064 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2065 dup_host_targets(pii); 2066 } 2067 } 2068 2069 /* 2070 * Duplicate host targets from other phyints of the group to 2071 * the phyint instance 'desired_pii'. 2072 */ 2073 static void 2074 dup_host_targets(struct phyint_instance *desired_pii) 2075 { 2076 int af; 2077 struct phyint *pi; 2078 struct phyint_instance *pii; 2079 struct target *tg; 2080 2081 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2082 2083 af = desired_pii->pii_af; 2084 2085 /* 2086 * For every phyint in the same group as desired_pii, check if 2087 * it has any host targets. If so add them to desired_pii. 
2088 */ 2089 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2090 pii = PHYINT_INSTANCE(pi, af); 2091 /* 2092 * We know that we don't have targets on this phyint instance 2093 * since we have been called. But we still check for 2094 * pii_targets_are_routers because another phyint instance 2095 * could have router targets, since IFF_NOFAILOVER addresses 2096 * on different phyint instances may belong to different 2097 * subnets. 2098 */ 2099 if ((pii == NULL) || (pii == desired_pii) || 2100 pii->pii_targets_are_routers) 2101 continue; 2102 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2103 target_create(desired_pii, tg->tg_address, _B_FALSE); 2104 } 2105 } 2106 } 2107 2108 static void 2109 usage(char *cmd) 2110 { 2111 (void) fprintf(stderr, "usage: %s\n", cmd); 2112 } 2113 2114 2115 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2116 2117 /* Get an option from the /etc/default/mpathd file */ 2118 static char * 2119 getdefault(char *name) 2120 { 2121 char namebuf[BUFSIZ]; 2122 char *value = NULL; 2123 2124 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2125 char *cp; 2126 int flags; 2127 2128 /* 2129 * ignore case 2130 */ 2131 flags = defcntl(DC_GETFLAGS, 0); 2132 TURNOFF(flags, DC_CASE); 2133 (void) defcntl(DC_SETFLAGS, flags); 2134 2135 /* Add "=" to the name */ 2136 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2137 (void) strncat(namebuf, "=", 2); 2138 2139 if ((cp = defread(namebuf)) != NULL) 2140 value = strdup(cp); 2141 2142 /* close */ 2143 (void) defopen((char *)NULL); 2144 } 2145 return (value); 2146 } 2147 2148 2149 /* 2150 * Command line options below 2151 */ 2152 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2153 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2154 static boolean_t adopt = _B_FALSE; 2155 static boolean_t foreground = _B_FALSE; 2156 2157 int 2158 main(int argc, char *argv[]) 2159 { 2160 int i; 2161 int c; 2162 struct phyint_instance *pii; 2163 char 
*value; 2164 2165 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2166 srandom(gethostid()); /* Initialize the random number generator */ 2167 2168 /* 2169 * NOTE: The messages output by in.mpathd are not suitable for 2170 * translation, so we do not call textdomain(). 2171 */ 2172 (void) setlocale(LC_ALL, ""); 2173 2174 /* 2175 * Get the user specified value of 'failure detection time' 2176 * from /etc/default/mpathd 2177 */ 2178 value = getdefault("FAILURE_DETECTION_TIME"); 2179 if (value != NULL) { 2180 user_failure_detection_time = 2181 (int)strtol((char *)value, NULL, 0); 2182 2183 if (user_failure_detection_time <= 0) { 2184 user_failure_detection_time = FAILURE_DETECTION_TIME; 2185 logerr("Invalid failure detection time %s, assuming " 2186 "default %d\n", value, user_failure_detection_time); 2187 2188 } else if (user_failure_detection_time < 2189 MIN_FAILURE_DETECTION_TIME) { 2190 user_failure_detection_time = 2191 MIN_FAILURE_DETECTION_TIME; 2192 logerr("Too small failure detection time of %s, " 2193 "assuming minimum %d\n", value, 2194 user_failure_detection_time); 2195 } 2196 free(value); 2197 } else { 2198 /* User has not specified the parameter, Use default value */ 2199 user_failure_detection_time = FAILURE_DETECTION_TIME; 2200 } 2201 2202 /* 2203 * This gives the frequency at which probes will be sent. 2204 * When fdt ms elapses, we should be able to determine 2205 * whether 5 consecutive probes have failed or not. 2206 * 1 probe will be sent in every user_probe_interval ms, 2207 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2208 * user_probe_interval. Thus when we send out probe 'n' we 2209 * can be sure that probe 'n - 2' is lost, if we have not 2210 * got the ack. (since the probe interval is > crtt). But 2211 * probe 'n - 1' may be a valid unacked probe, since the 2212 * time between 2 successive probes could be as small as 2213 * 0.5 * user_probe_interval. 
Hence the NUM_PROBE_FAILS + 2 2214 */ 2215 user_probe_interval = user_failure_detection_time / 2216 (NUM_PROBE_FAILS + 2); 2217 2218 /* 2219 * Get the user specified value of failback_enabled from 2220 * /etc/default/mpathd 2221 */ 2222 value = getdefault("FAILBACK"); 2223 if (value != NULL) { 2224 if (strncasecmp(value, "yes", 3) == 0) 2225 failback_enabled = _B_TRUE; 2226 else if (strncasecmp(value, "no", 2) == 0) 2227 failback_enabled = _B_FALSE; 2228 else 2229 logerr("Invalid value for FAILBACK %s\n", value); 2230 free(value); 2231 } else { 2232 failback_enabled = _B_TRUE; 2233 } 2234 2235 /* 2236 * Get the user specified value of track_all_phyints from 2237 * /etc/default/mpathd. The sense is reversed in 2238 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2239 */ 2240 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2241 if (value != NULL) { 2242 if (strncasecmp(value, "yes", 3) == 0) 2243 track_all_phyints = _B_FALSE; 2244 else if (strncasecmp(value, "no", 2) == 0) 2245 track_all_phyints = _B_TRUE; 2246 else 2247 logerr("Invalid value for " 2248 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2249 free(value); 2250 } else { 2251 track_all_phyints = _B_FALSE; 2252 } 2253 2254 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2255 switch (c) { 2256 case 'a': 2257 adopt = _B_TRUE; 2258 break; 2259 case 'm': 2260 force_mcast = _B_TRUE; 2261 break; 2262 case 'd': 2263 debug = D_ALL; 2264 foreground = _B_TRUE; 2265 break; 2266 case 'D': 2267 i = (int)strtol(optarg, NULL, 0); 2268 if (i == 0) { 2269 (void) fprintf(stderr, "Bad debug flags: %s\n", 2270 optarg); 2271 exit(1); 2272 } 2273 debug |= i; 2274 foreground = _B_TRUE; 2275 break; 2276 case 'l': 2277 /* 2278 * Turn off link state notification handling. 2279 * Undocumented command line flag, for debugging 2280 * purposes. 
2281 */ 2282 handle_link_notifications = _B_FALSE; 2283 break; 2284 default: 2285 usage(argv[0]); 2286 exit(1); 2287 } 2288 } 2289 2290 /* 2291 * The sockets for the loopback command interface should be listening 2292 * before we fork and exit in daemonize(). This way, whoever started us 2293 * can use the loopback interface as soon as they get a zero exit 2294 * status. 2295 */ 2296 lsock_v4 = setup_listener(AF_INET); 2297 lsock_v6 = setup_listener(AF_INET6); 2298 2299 if (lsock_v4 < 0 && lsock_v6 < 0) { 2300 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2301 exit(1); 2302 } 2303 2304 if (!foreground) { 2305 if (!daemonize()) { 2306 logerr("cannot daemonize\n"); 2307 exit(EXIT_FAILURE); 2308 } 2309 initlog(); 2310 } 2311 2312 /* 2313 * Initializations: 2314 * 1. Create ifsock* sockets. These are used for performing SIOC* 2315 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2316 * 2. Initialize a pipe for handling/recording signal events. 2317 * 3. Create the routing sockets, used for listening 2318 * to routing / interface changes. 2319 * 4. phyint_init() - Initialize physical interface state 2320 * (in mpd_tables.c). Must be done before creating interfaces, 2321 * which timer_init() does indirectly. 2322 * 5. timer_init() - Initialize timer related stuff 2323 * 6. initifs() - Initialize our database of all known interfaces 2324 * 7. init_router_targets() - Initialize our database of all known 2325 * router targets. 
2326 */ 2327 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2328 if (ifsock_v4 < 0) { 2329 logperror("main: IPv4 socket open"); 2330 exit(1); 2331 } 2332 2333 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2334 if (ifsock_v6 < 0) { 2335 logperror("main: IPv6 socket open"); 2336 exit(1); 2337 } 2338 2339 setup_eventpipe(); 2340 2341 rtsock_v4 = setup_rtsock(AF_INET); 2342 rtsock_v6 = setup_rtsock(AF_INET6); 2343 2344 if (phyint_init() == -1) { 2345 logerr("cannot initialize physical interface structures"); 2346 exit(1); 2347 } 2348 2349 timer_init(); 2350 2351 initifs(); 2352 2353 /* Inform kernel whether failback is enabled or disabled */ 2354 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2355 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2356 exit(1); 2357 } 2358 2359 /* 2360 * If we're operating in "adopt" mode and no interfaces need to be 2361 * tracked, shut down (ifconfig(1M) will restart us on demand if 2362 * interfaces are subsequently put into multipathing groups). 2363 */ 2364 if (adopt && phyint_instances == NULL) 2365 exit(0); 2366 2367 /* 2368 * Main body. Keep listening for activity on any of the sockets 2369 * that we are monitoring and take appropriate action as necessary. 2370 * signals are also handled synchronously. 
2371 */ 2372 for (;;) { 2373 if (poll(pollfds, pollfd_num, -1) < 0) { 2374 if (errno == EINTR) 2375 continue; 2376 logperror("main: poll"); 2377 exit(1); 2378 } 2379 for (i = 0; i < pollfd_num; i++) { 2380 if ((pollfds[i].fd == -1) || 2381 !(pollfds[i].revents & POLLIN)) 2382 continue; 2383 if (pollfds[i].fd == eventpipe_read) { 2384 in_signal(eventpipe_read); 2385 break; 2386 } 2387 if (pollfds[i].fd == rtsock_v4 || 2388 pollfds[i].fd == rtsock_v6) { 2389 process_rtsock(rtsock_v4, rtsock_v6); 2390 break; 2391 } 2392 for (pii = phyint_instances; pii != NULL; 2393 pii = pii->pii_next) { 2394 if (pollfds[i].fd == pii->pii_probe_sock) { 2395 if (pii->pii_af == AF_INET) 2396 in_data(pii); 2397 else 2398 in6_data(pii); 2399 break; 2400 } 2401 } 2402 if (pollfds[i].fd == lsock_v4) 2403 loopback_cmd(lsock_v4, AF_INET); 2404 else if (pollfds[i].fd == lsock_v6) 2405 loopback_cmd(lsock_v6, AF_INET6); 2406 } 2407 if (full_scan_required) { 2408 initifs(); 2409 full_scan_required = _B_FALSE; 2410 } 2411 } 2412 /* NOTREACHED */ 2413 return (EXIT_SUCCESS); 2414 } 2415 2416 static int 2417 setup_listener(int af) 2418 { 2419 int sock; 2420 int on; 2421 int len; 2422 int ret; 2423 struct sockaddr_storage laddr; 2424 struct sockaddr_in *sin; 2425 struct sockaddr_in6 *sin6; 2426 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2427 2428 assert(af == AF_INET || af == AF_INET6); 2429 2430 sock = socket(af, SOCK_STREAM, 0); 2431 if (sock < 0) { 2432 logperror("setup_listener: socket"); 2433 exit(1); 2434 } 2435 2436 on = 1; 2437 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2438 sizeof (on)) < 0) { 2439 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2440 exit(1); 2441 } 2442 2443 bzero(&laddr, sizeof (laddr)); 2444 laddr.ss_family = af; 2445 2446 if (af == AF_INET) { 2447 sin = (struct sockaddr_in *)&laddr; 2448 sin->sin_port = htons(MPATHD_PORT); 2449 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2450 len = sizeof (struct sockaddr_in); 2451 } else { 2452 
sin6 = (struct sockaddr_in6 *)&laddr; 2453 sin6->sin6_port = htons(MPATHD_PORT); 2454 sin6->sin6_addr = loopback_addr; 2455 len = sizeof (struct sockaddr_in6); 2456 } 2457 2458 ret = bind(sock, (struct sockaddr *)&laddr, len); 2459 if (ret < 0) { 2460 if (errno == EADDRINUSE) { 2461 /* 2462 * Another instance of mpathd may be already active. 2463 */ 2464 logerr("main: is another instance of in.mpathd " 2465 "already active?\n"); 2466 exit(1); 2467 } else { 2468 (void) close(sock); 2469 return (-1); 2470 } 2471 } 2472 if (listen(sock, 30) < 0) { 2473 logperror("main: listen"); 2474 exit(1); 2475 } 2476 if (poll_add(sock) == -1) { 2477 (void) close(sock); 2478 exit(1); 2479 } 2480 2481 return (sock); 2482 } 2483 2484 /* 2485 * Table of commands and their expected size; used by loopback_cmd(). 2486 */ 2487 static struct { 2488 const char *name; 2489 unsigned int size; 2490 } commands[] = { 2491 { "MI_PING", sizeof (uint32_t) }, 2492 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2493 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2494 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2495 { "MI_QUERY", sizeof (mi_query_t) } 2496 }; 2497 2498 /* 2499 * Commands received over the loopback interface come here. Currently 2500 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2501 * module. ifconfig only makes a connection, and closes it to check if 2502 * in.mpathd is running. 2503 * if_mpadm sends commands in the format specified by the mpathd_interface 2504 * structure. 
2505 */ 2506 static void 2507 loopback_cmd(int sock, int family) 2508 { 2509 int newfd; 2510 ssize_t len; 2511 struct sockaddr_storage peer; 2512 struct sockaddr_in *peer_sin; 2513 struct sockaddr_in6 *peer_sin6; 2514 socklen_t peerlen; 2515 union mi_commands mpi; 2516 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2517 char abuf[INET6_ADDRSTRLEN]; 2518 uint_t cmd; 2519 int retval; 2520 2521 peerlen = sizeof (peer); 2522 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2523 if (newfd < 0) { 2524 logperror("loopback_cmd: accept"); 2525 return; 2526 } 2527 2528 switch (family) { 2529 case AF_INET: 2530 /* 2531 * Validate the address and port to make sure that 2532 * non privileged processes don't connect and start 2533 * talking to us. 2534 */ 2535 if (peerlen != sizeof (struct sockaddr_in)) { 2536 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2537 (void) close(newfd); 2538 return; 2539 } 2540 peer_sin = (struct sockaddr_in *)&peer; 2541 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2542 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2543 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2544 abuf, sizeof (abuf)); 2545 logerr("Attempt to connect from addr %s port %d\n", 2546 abuf, ntohs(peer_sin->sin_port)); 2547 (void) close(newfd); 2548 return; 2549 } 2550 break; 2551 2552 case AF_INET6: 2553 if (peerlen != sizeof (struct sockaddr_in6)) { 2554 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2555 (void) close(newfd); 2556 return; 2557 } 2558 /* 2559 * Validate the address and port to make sure that 2560 * non privileged processes don't connect and start 2561 * talking to us. 
2562 */ 2563 peer_sin6 = (struct sockaddr_in6 *)&peer; 2564 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2565 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2566 &loopback_addr))) { 2567 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2568 sizeof (abuf)); 2569 logerr("Attempt to connect from addr %s port %d\n", 2570 abuf, ntohs(peer_sin6->sin6_port)); 2571 (void) close(newfd); 2572 return; 2573 } 2574 2575 default: 2576 logdebug("loopback_cmd: family %d\n", family); 2577 (void) close(newfd); 2578 return; 2579 } 2580 2581 /* 2582 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2583 * all supported commands 2584 */ 2585 len = read(newfd, &mpi, sizeof (mpi)); 2586 2587 /* 2588 * ifconfig does not send any data. Just tests to see if mpathd 2589 * is already running. 2590 */ 2591 if (len <= 0) { 2592 (void) close(newfd); 2593 return; 2594 } 2595 2596 /* 2597 * In theory, we can receive any sized message for a stream socket, 2598 * but we don't expect that to happen for a small message over a 2599 * loopback connection. 
2600 */ 2601 if (len < sizeof (uint32_t)) { 2602 logerr("loopback_cmd: bad command format or read returns " 2603 "partial data %d\n", len); 2604 } 2605 2606 cmd = mpi.mi_command; 2607 if (cmd >= MI_NCMD) { 2608 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2609 (void) close(newfd); 2610 return; 2611 } 2612 2613 if (len < commands[cmd].size) { 2614 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2615 commands[cmd].name, commands[cmd].size, len); 2616 (void) close(newfd); 2617 return; 2618 } 2619 2620 retval = process_cmd(newfd, &mpi); 2621 if (retval != IPMP_SUCCESS) { 2622 logerr("failed processing %s: %s\n", commands[cmd].name, 2623 ipmp_errmsg(retval)); 2624 } 2625 (void) close(newfd); 2626 } 2627 2628 extern int global_errno; /* set by failover() or failback() */ 2629 2630 /* 2631 * Process the offline, undo offline and set original index commands, 2632 * received from if_mpadm(1M) 2633 */ 2634 static unsigned int 2635 process_cmd(int newfd, union mi_commands *mpi) 2636 { 2637 uint_t nif = 0; 2638 uint32_t cmd; 2639 struct phyint *pi; 2640 struct phyint *pi2; 2641 struct phyint_group *pg; 2642 boolean_t success; 2643 int error; 2644 struct mi_offline *mio; 2645 struct mi_undo_offline *miu; 2646 struct lifreq lifr; 2647 int ifsock; 2648 struct mi_setoindex *mis; 2649 2650 cmd = mpi->mi_command; 2651 2652 switch (cmd) { 2653 case MI_OFFLINE: 2654 mio = &mpi->mi_ocmd; 2655 /* 2656 * Lookup the interface that needs to be offlined. 2657 * If it does not exist, return a suitable error. 2658 */ 2659 pi = phyint_lookup(mio->mio_ifname); 2660 if (pi == NULL) 2661 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2662 2663 /* 2664 * Verify that the minimum redundancy requirements are met. 2665 * The multipathing group must have at least the specified 2666 * number of functional interfaces after offlining the 2667 * requested interface. Otherwise return a suitable error. 
2668 */ 2669 pg = pi->pi_group; 2670 nif = 0; 2671 if (pg != phyint_anongroup) { 2672 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2673 pi2 = pi2->pi_pgnext) { 2674 if ((pi2->pi_state == PI_RUNNING) || 2675 (pg->pg_groupfailed && 2676 !(pi2->pi_flags & IFF_OFFLINE))) 2677 nif++; 2678 } 2679 } 2680 if (nif < mio->mio_min_redundancy) 2681 return (send_result(newfd, IPMP_EMINRED, 0)); 2682 2683 /* 2684 * The order of operation is to set IFF_OFFLINE, followed by 2685 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2686 * can be created. Subsequent failover moves everything on 2687 * the OFFLINE interface to some other functional interface. 2688 */ 2689 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2690 if (success) { 2691 if (!pi->pi_empty) { 2692 error = try_failover(pi, FAILOVER_NORMAL); 2693 if (error != 0) { 2694 if (!change_lif_flags(pi, IFF_OFFLINE, 2695 _B_FALSE)) { 2696 logerr("process_cmd: couldn't" 2697 " clear OFFLINE flag on" 2698 " %s\n", pi->pi_name); 2699 /* 2700 * Offline interfaces should 2701 * not be probed. 2702 */ 2703 stop_probing(pi); 2704 } 2705 return (send_result(newfd, error, 2706 global_errno)); 2707 } 2708 } 2709 } else { 2710 return (send_result(newfd, IPMP_FAILURE, errno)); 2711 } 2712 2713 /* 2714 * The interface is now Offline, so stop probing it. 2715 * Note that if_mpadm(1M) will down the test addresses, 2716 * after receiving a success reply from us. The routing 2717 * socket message will then make us close the socket used 2718 * for sending probes. But it is more logical that an 2719 * offlined interface must not be probed, even if it has 2720 * test addresses. 2721 */ 2722 stop_probing(pi); 2723 return (send_result(newfd, IPMP_SUCCESS, 0)); 2724 2725 case MI_UNDO_OFFLINE: 2726 miu = &mpi->mi_ucmd; 2727 /* 2728 * Undo the offline command. As usual lookup the interface. 2729 * Send an error if it does not exist. 
2730 */ 2731 pi = phyint_lookup(miu->miu_ifname); 2732 if (pi == NULL) 2733 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2734 2735 /* 2736 * Inverse of the offline operation. Do a failback, and then 2737 * clear the IFF_OFFLINE flag. 2738 */ 2739 error = do_failback(pi, _B_TRUE); 2740 if (error == IPMP_EFBPARTIAL) 2741 return (send_result(newfd, IPMP_EFBPARTIAL, 0)); 2742 error = do_failback(pi, _B_FALSE); 2743 2744 switch (error) { 2745 case IPMP_SUCCESS: 2746 if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) { 2747 logdebug("undo error %X\n", global_errno); 2748 error = IPMP_FAILURE; 2749 break; 2750 } 2751 /* FALLTHROUGH */ 2752 2753 case IPMP_EFBPARTIAL: 2754 /* 2755 * Reset the state of the interface based on the 2756 * current link state; if this phyint subsequently 2757 * acquires a test address, the state will be changed 2758 * again later as a result of the probes. 2759 */ 2760 if (LINK_UP(pi)) 2761 phyint_chstate(pi, PI_RUNNING); 2762 else 2763 phyint_chstate(pi, PI_FAILED); 2764 break; 2765 2766 case IPMP_FAILURE: 2767 break; 2768 2769 default: 2770 logdebug("do_failback: unexpected return value\n"); 2771 break; 2772 } 2773 return (send_result(newfd, error, global_errno)); 2774 2775 case MI_SETOINDEX: 2776 mis = &mpi->mi_scmd; 2777 2778 /* Get the socket for doing ioctls */ 2779 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2780 2781 /* 2782 * Get index of new original interface. 2783 * The index is returned in lifr.lifr_index. 2784 */ 2785 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2786 sizeof (lifr.lifr_name)); 2787 2788 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2789 return (send_result(newfd, IPMP_FAILURE, errno)); 2790 2791 /* 2792 * Set new original interface index. 2793 * The new index was put into lifr.lifr_index by the 2794 * SIOCGLIFINDEX ioctl. 
2795 */ 2796 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2797 sizeof (lifr.lifr_name)); 2798 2799 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2800 return (send_result(newfd, IPMP_FAILURE, errno)); 2801 2802 return (send_result(newfd, IPMP_SUCCESS, 0)); 2803 2804 case MI_QUERY: 2805 return (process_query(newfd, &mpi->mi_qcmd)); 2806 2807 default: 2808 break; 2809 } 2810 2811 return (send_result(newfd, IPMP_EPROTO, 0)); 2812 } 2813 2814 /* 2815 * Process the query request pointed to by `miq' and send a reply on file 2816 * descriptor `fd'. Returns an IPMP error code. 2817 */ 2818 static unsigned int 2819 process_query(int fd, mi_query_t *miq) 2820 { 2821 ipmp_groupinfo_t *grinfop; 2822 ipmp_groupinfolist_t *grlp; 2823 ipmp_grouplist_t *grlistp; 2824 ipmp_ifinfo_t *ifinfop; 2825 ipmp_ifinfolist_t *iflp; 2826 ipmp_snap_t *snap; 2827 unsigned int retval; 2828 2829 switch (miq->miq_inforeq) { 2830 case IPMP_GROUPLIST: 2831 retval = getgrouplist(&grlistp); 2832 if (retval != IPMP_SUCCESS) 2833 return (send_result(fd, retval, errno)); 2834 2835 retval = send_result(fd, IPMP_SUCCESS, 0); 2836 if (retval == IPMP_SUCCESS) 2837 retval = send_grouplist(fd, grlistp); 2838 2839 ipmp_freegrouplist(grlistp); 2840 return (retval); 2841 2842 case IPMP_GROUPINFO: 2843 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2844 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2845 if (retval != IPMP_SUCCESS) 2846 return (send_result(fd, retval, errno)); 2847 2848 retval = send_result(fd, IPMP_SUCCESS, 0); 2849 if (retval == IPMP_SUCCESS) 2850 retval = send_groupinfo(fd, grinfop); 2851 2852 ipmp_freegroupinfo(grinfop); 2853 return (retval); 2854 2855 case IPMP_IFINFO: 2856 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2857 retval = getifinfo(miq->miq_ifname, &ifinfop); 2858 if (retval != IPMP_SUCCESS) 2859 return (send_result(fd, retval, errno)); 2860 2861 retval = send_result(fd, IPMP_SUCCESS, 0); 2862 if (retval == IPMP_SUCCESS) 2863 retval = send_ifinfo(fd, ifinfop); 2864 2865 
ipmp_freeifinfo(ifinfop); 2866 return (retval); 2867 2868 case IPMP_SNAP: 2869 retval = getsnap(&snap); 2870 if (retval != IPMP_SUCCESS) 2871 return (send_result(fd, retval, errno)); 2872 2873 retval = send_result(fd, IPMP_SUCCESS, 0); 2874 if (retval != IPMP_SUCCESS) 2875 goto out; 2876 2877 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2878 if (retval != IPMP_SUCCESS) 2879 goto out; 2880 2881 retval = send_grouplist(fd, snap->sn_grlistp); 2882 if (retval != IPMP_SUCCESS) 2883 goto out; 2884 2885 iflp = snap->sn_ifinfolistp; 2886 for (; iflp != NULL; iflp = iflp->ifl_next) { 2887 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2888 if (retval != IPMP_SUCCESS) 2889 goto out; 2890 } 2891 2892 grlp = snap->sn_grinfolistp; 2893 for (; grlp != NULL; grlp = grlp->grl_next) { 2894 retval = send_groupinfo(fd, grlp->grl_grinfop); 2895 if (retval != IPMP_SUCCESS) 2896 goto out; 2897 } 2898 out: 2899 ipmp_snap_free(snap); 2900 return (retval); 2901 2902 default: 2903 break; 2904 2905 } 2906 return (send_result(fd, IPMP_EPROTO, 0)); 2907 } 2908 2909 /* 2910 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2911 * Returns an IPMP error code. 2912 */ 2913 static unsigned int 2914 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2915 { 2916 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2917 unsigned int retval; 2918 2919 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2920 if (retval != IPMP_SUCCESS) 2921 return (retval); 2922 2923 return (ipmp_writetlv(fd, IPMP_IFLIST, 2924 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2925 } 2926 2927 /* 2928 * Send the interface information pointed to by `ifinfop' on file descriptor 2929 * `fd'. Returns an IPMP error code. 2930 */ 2931 static unsigned int 2932 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2933 { 2934 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2935 } 2936 2937 /* 2938 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 
2939 * Returns an IPMP error code. 2940 */ 2941 static unsigned int 2942 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2943 { 2944 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2945 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2946 } 2947 2948 /* 2949 * Initialize an mi_result_t structure using `error' and `syserror' and 2950 * send it on file descriptor `fd'. Returns an IPMP error code. 2951 */ 2952 static unsigned int 2953 send_result(int fd, unsigned int error, int syserror) 2954 { 2955 mi_result_t me; 2956 2957 me.me_mpathd_error = error; 2958 if (error == IPMP_FAILURE) 2959 me.me_sys_error = syserror; 2960 else 2961 me.me_sys_error = 0; 2962 2963 return (ipmp_write(fd, &me, sizeof (me))); 2964 } 2965 2966 /* 2967 * Daemonize the process. 2968 */ 2969 static boolean_t 2970 daemonize(void) 2971 { 2972 switch (fork()) { 2973 case -1: 2974 return (_B_FALSE); 2975 2976 case 0: 2977 /* 2978 * Lose our controlling terminal, and become both a session 2979 * leader and a process group leader. 2980 */ 2981 if (setsid() == -1) 2982 return (_B_FALSE); 2983 2984 /* 2985 * Under POSIX, a session leader can accidentally (through 2986 * open(2)) acquire a controlling terminal if it does not 2987 * have one. Just to be safe, fork() again so we are not a 2988 * session leader. 2989 */ 2990 switch (fork()) { 2991 case -1: 2992 return (_B_FALSE); 2993 2994 case 0: 2995 (void) chdir("/"); 2996 (void) umask(022); 2997 (void) fdwalk(closefunc, NULL); 2998 break; 2999 3000 default: 3001 _exit(EXIT_SUCCESS); 3002 } 3003 break; 3004 3005 default: 3006 _exit(EXIT_SUCCESS); 3007 } 3008 3009 return (_B_TRUE); 3010 } 3011 3012 /* 3013 * The parent has created some fds before forking on purpose, keep them open. 3014 */ 3015 static int 3016 closefunc(void *not_used, int fd) 3017 /* ARGSUSED */ 3018 { 3019 if (fd != lsock_v4 && fd != lsock_v6) 3020 (void) close(fd); 3021 return (0); 3022 } 3023 3024 /* LOGGER */ 3025 3026 #include <syslog.h> 3027 3028 /* 3029 * Logging routines. 
All routines log to syslog, unless the daemon is 3030 * running in the foreground, in which case the logging goes to stderr. 3031 * 3032 * The following routines are available: 3033 * 3034 * logdebug(): A printf-like function for outputting debug messages 3035 * (messages at LOG_DEBUG) that are only of use to developers. 3036 * 3037 * logtrace(): A printf-like function for outputting tracing messages 3038 * (messages at LOG_INFO) from the daemon. This is typically used 3039 * to log the receipt of interesting network-related conditions. 3040 * 3041 * logerr(): A printf-like function for outputting error messages 3042 * (messages at LOG_ERR) from the daemon. 3043 * 3044 * logperror*(): A set of functions used to output error messages 3045 * (messages at LOG_ERR); these automatically append strerror(errno) 3046 * and a newline to the message passed to them. 3047 * 3048 * NOTE: since the logging functions write to syslog, the messages passed 3049 * to them are not eligible for localization. Thus, gettext() must 3050 * *not* be used. 3051 */ 3052 3053 static int logging = 0; 3054 3055 static void 3056 initlog(void) 3057 { 3058 logging++; 3059 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3060 } 3061 3062 /* PRINTFLIKE1 */ 3063 void 3064 logerr(char *fmt, ...) 3065 { 3066 va_list ap; 3067 3068 va_start(ap, fmt); 3069 3070 if (logging) 3071 vsyslog(LOG_ERR, fmt, ap); 3072 else 3073 (void) vfprintf(stderr, fmt, ap); 3074 va_end(ap); 3075 } 3076 3077 /* PRINTFLIKE1 */ 3078 void 3079 logtrace(char *fmt, ...) 3080 { 3081 va_list ap; 3082 3083 va_start(ap, fmt); 3084 3085 if (logging) 3086 vsyslog(LOG_INFO, fmt, ap); 3087 else 3088 (void) vfprintf(stderr, fmt, ap); 3089 va_end(ap); 3090 } 3091 3092 /* PRINTFLIKE1 */ 3093 void 3094 logdebug(char *fmt, ...) 
3095 { 3096 va_list ap; 3097 3098 va_start(ap, fmt); 3099 3100 if (logging) 3101 vsyslog(LOG_DEBUG, fmt, ap); 3102 else 3103 (void) vfprintf(stderr, fmt, ap); 3104 va_end(ap); 3105 } 3106 3107 /* PRINTFLIKE1 */ 3108 void 3109 logperror(char *str) 3110 { 3111 if (logging) 3112 syslog(LOG_ERR, "%s: %m\n", str); 3113 else 3114 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3115 } 3116 3117 void 3118 logperror_pii(struct phyint_instance *pii, char *str) 3119 { 3120 if (logging) { 3121 syslog(LOG_ERR, "%s (%s %s): %m\n", 3122 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3123 } else { 3124 (void) fprintf(stderr, "%s (%s %s): %s\n", 3125 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3126 strerror(errno)); 3127 } 3128 } 3129 3130 void 3131 logperror_li(struct logint *li, char *str) 3132 { 3133 struct phyint_instance *pii = li->li_phyint_inst; 3134 3135 if (logging) { 3136 syslog(LOG_ERR, "%s (%s %s): %m\n", 3137 str, AF_STR(pii->pii_af), li->li_name); 3138 } else { 3139 (void) fprintf(stderr, "%s (%s %s): %s\n", 3140 str, AF_STR(pii->pii_af), li->li_name, 3141 strerror(errno)); 3142 } 3143 } 3144 3145 void 3146 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3147 { 3148 if (polled) 3149 (void) poll_remove(pii->pii_probe_sock); 3150 (void) close(pii->pii_probe_sock); 3151 pii->pii_probe_sock = -1; 3152 pii->pii_basetime_inited = 0; 3153 } 3154