1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "mpd_defs.h" 29 #include "mpd_tables.h" 30 31 int debug = 0; /* Debug flag */ 32 static int pollfd_num = 0; /* Num. of poll descriptors */ 33 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 34 35 /* All times below in ms */ 36 int user_failure_detection_time; /* user specified failure detection */ 37 /* time (fdt) */ 38 int user_probe_interval; /* derived from user specified fdt */ 39 40 static int rtsock_v4; /* AF_INET routing socket */ 41 static int rtsock_v6; /* AF_INET6 routing socket */ 42 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 43 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 44 static int lsock_v4; /* Listen socket to detect mpathd */ 45 static int lsock_v6; /* Listen socket to detect mpathd */ 46 static int mibfd = -1; /* fd to get mib info */ 47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 48 49 boolean_t full_scan_required = _B_FALSE; 50 static uint_t last_initifs_time; /* Time when initifs was last run */ 51 static char **argv0; /* Saved for re-exec on SIGHUP */ 52 boolean_t handle_link_notifications = _B_TRUE; 53 54 static void initlog(void); 55 static void run_timeouts(void); 56 static void initifs(void); 57 static void check_if_removed(struct phyint_instance *pii); 58 static void select_test_ifs(void); 59 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 60 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 61 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 62 struct in_addr nexthop_v4); 63 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 64 struct in6_addr nexthop_v6); 65 static void router_add_common(int af, char *ifname, 66 struct in6_addr nexthop); 67 static void init_router_targets(); 68 static void cleanup(void); 69 static int setup_listener(int af); 70 static void check_config(void); 71 static void check_addr_unique(int af, char *name); 72 static void init_host_targets(void); 73 static void dup_host_targets(struct phyint_instance *desired_pii); 74 static void loopback_cmd(int sock, int family); 75 static int poll_remove(int fd); 76 static boolean_t daemonize(void); 77 static int closefunc(void *, int); 78 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 79 static unsigned int process_query(int fd, mi_query_t *miq); 80 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 81 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 82 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 83 static unsigned int send_result(int fd, unsigned int error, int syserror); 84 85 /* 86 * Return the current time in milliseconds (from an arbitrary reference) 87 * truncated to fit into an int. Truncation is ok since we are interested 88 * only in differences and not the absolute values. 89 */ 90 uint_t 91 getcurrenttime(void) 92 { 93 uint_t cur_time; /* In ms */ 94 95 /* 96 * Use of a non-user-adjustable source of time is 97 * required. However millisecond precision is sufficient. 98 * divide by 10^6 99 */ 100 cur_time = (uint_t)(gethrtime() / 1000000LL); 101 return (cur_time); 102 } 103 104 /* 105 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 106 */ 107 int 108 poll_add(int fd) 109 { 110 int i; 111 int new_num; 112 struct pollfd *newfds; 113 retry: 114 /* Check if already present */ 115 for (i = 0; i < pollfd_num; i++) { 116 if (pollfds[i].fd == fd) 117 return (0); 118 } 119 /* Check for empty spot already present */ 120 for (i = 0; i < pollfd_num; i++) { 121 if (pollfds[i].fd == -1) { 122 pollfds[i].fd = fd; 123 return (0); 124 } 125 } 126 127 /* Allocate space for 32 more fds and initialize to -1 */ 128 new_num = pollfd_num + 32; 129 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 130 if (newfds == NULL) { 131 logperror("poll_add: realloc"); 132 return (-1); 133 } 134 for (i = pollfd_num; i < new_num; i++) { 135 newfds[i].fd = -1; 136 newfds[i].events = POLLIN; 137 } 138 pollfd_num = new_num; 139 pollfds = newfds; 140 goto retry; 141 } 142 143 /* 144 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 145 */ 146 static int 147 poll_remove(int fd) 148 { 149 int i; 150 151 /* Check if already present */ 152 for (i = 0; i < pollfd_num; i++) { 153 if (pollfds[i].fd == fd) { 154 pollfds[i].fd = -1; 155 return (0); 156 } 157 } 158 return (-1); 159 } 160 161 /* 162 * Extract information about the phyint instance. If the phyint instance still 163 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 164 * will use it to detect phyint instances that don't exist any longer and 165 * remove them, from our database of phyint instances. 166 * Return value: 167 * returns true if the phyint instance exists in the kernel, 168 * returns false otherwise 169 */ 170 static boolean_t 171 pii_process(int af, char *name, struct phyint_instance **pii_p) 172 { 173 int err; 174 struct phyint_instance *pii; 175 struct phyint_instance *pii_other; 176 177 if (debug & D_PHYINT) 178 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 179 180 pii = phyint_inst_lookup(af, name); 181 if (pii == NULL) { 182 /* 183 * Phyint instance does not exist in our tables, 184 * create new phyint instance 185 */ 186 pii = phyint_inst_init_from_k(af, name); 187 } else { 188 /* Phyint exists in our tables */ 189 err = phyint_inst_update_from_k(pii); 190 191 switch (err) { 192 case PI_IOCTL_ERROR: 193 /* Some ioctl error. don't change anything */ 194 pii->pii_in_use = 1; 195 break; 196 197 case PI_GROUP_CHANGED: 198 /* 199 * The phyint has changed group. 200 */ 201 restore_phyint(pii->pii_phyint); 202 /* FALLTHRU */ 203 204 case PI_IFINDEX_CHANGED: 205 /* 206 * Interface index has changed. Delete and 207 * recreate the phyint as it is quite likely 208 * the interface has been unplumbed and replumbed. 209 */ 210 pii_other = phyint_inst_other(pii); 211 if (pii_other != NULL) 212 phyint_inst_delete(pii_other); 213 phyint_inst_delete(pii); 214 pii = phyint_inst_init_from_k(af, name); 215 break; 216 217 case PI_DELETED: 218 /* Phyint instance has disappeared from kernel */ 219 pii->pii_in_use = 0; 220 break; 221 222 case PI_OK: 223 /* Phyint instance exists and is fine */ 224 pii->pii_in_use = 1; 225 break; 226 227 default: 228 /* Unknown status */ 229 logerr("pii_process: Unknown status %d\n", err); 230 break; 231 } 232 } 233 234 *pii_p = pii; 235 if (pii != NULL) 236 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 237 else 238 return (_B_FALSE); 239 } 240 241 /* 242 * This phyint is leaving the group. Try to restore the phyint to its 243 * initial state. Return the addresses that belong to other group members, 244 * to the group, and take back any addresses owned by this phyint 245 */ 246 void 247 restore_phyint(struct phyint *pi) 248 { 249 if (pi->pi_group == phyint_anongroup) 250 return; 251 252 /* 253 * Move everthing to some other member in the group. 254 * The phyint has changed group in the kernel. But we 255 * have yet to do it in our tables. 256 */ 257 if (!pi->pi_empty) 258 (void) try_failover(pi, FAILOVER_TO_ANY); 259 /* 260 * Move all addresses owned by 'pi' back to pi, from each 261 * of the other members of the group 262 */ 263 (void) try_failback(pi, _B_FALSE); 264 } 265 266 /* 267 * Scan all interfaces to detect changes as well as new and deleted interfaces 268 */ 269 static void 270 initifs() 271 { 272 int n; 273 int af; 274 char *cp; 275 char *buf; 276 int numifs; 277 struct lifnum lifn; 278 struct lifconf lifc; 279 struct lifreq *lifr; 280 struct logint *li; 281 struct phyint_instance *pii; 282 struct phyint_instance *next_pii; 283 char pi_name[LIFNAMSIZ + 1]; 284 boolean_t exists; 285 struct phyint *pi; 286 287 if (debug & D_PHYINT) 288 logdebug("initifs: Scanning interfaces\n"); 289 290 last_initifs_time = getcurrenttime(); 291 292 /* 293 * Mark the interfaces so that we can find phyints and logints 294 * which have disappeared from the kernel. pii_process() and 295 * logint_init_from_k() will set {pii,li}_in_use when they find 296 * the interface in the kernel. Also, clear dupaddr bit on probe 297 * logint. check_addr_unique() will set the dupaddr bit on the 298 * probe logint, if the testaddress is not unique. 299 */ 300 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 301 pii->pii_in_use = 0; 302 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 303 li->li_in_use = 0; 304 if (pii->pii_probe_logint == li) 305 li->li_dupaddr = 0; 306 } 307 } 308 309 lifn.lifn_family = AF_UNSPEC; 310 lifn.lifn_flags = 0; 311 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 312 logperror("initifs: ioctl (get interface numbers)"); 313 return; 314 } 315 numifs = lifn.lifn_count; 316 317 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 318 if (buf == NULL) { 319 logperror("initifs: calloc"); 320 return; 321 } 322 323 lifc.lifc_family = AF_UNSPEC; 324 lifc.lifc_flags = 0; 325 lifc.lifc_len = numifs * sizeof (struct lifreq); 326 lifc.lifc_buf = buf; 327 328 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 329 /* 330 * EINVAL is commonly encountered, when things change 331 * underneath us rapidly, (eg. at boot, when new interfaces 332 * are plumbed successively) and the kernel finds the buffer 333 * size we passed as too small. We will retry again 334 * when we see the next routing socket msg, or at worst after 335 * IF_SCAN_INTERVAL ms. 336 */ 337 if (errno != EINVAL) { 338 logperror("initifs: ioctl" 339 " (get interface configuration)"); 340 } 341 free(buf); 342 return; 343 } 344 345 lifr = (struct lifreq *)lifc.lifc_req; 346 347 /* 348 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 349 * and get the state of the corresponding phyint_instance. If it is 350 * successful, then call logint_init_from_k() to get the state of the 351 * logint. 352 */ 353 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 354 af = lifr->lifr_addr.ss_family; 355 356 /* 357 * Need to pass a phyint name to pii_process. Insert the 358 * null where the ':' IF_SEPARATOR is found in the logical 359 * name. 360 */ 361 (void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 362 pi_name[sizeof (pi_name) - 1] = '\0'; 363 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 364 *cp = '\0'; 365 366 exists = pii_process(af, pi_name, &pii); 367 if (exists) { 368 /* The phyint is fine. So process the logint */ 369 logint_init_from_k(pii, lifr->lifr_name); 370 } 371 check_addr_unique(af, lifr->lifr_name); 372 } 373 374 free(buf); 375 376 /* 377 * If the test address is now unique, and if it was not unique 378 * previously, clear the li_dupaddrmsg_printed flag and log a 379 * recovery message 380 */ 381 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 382 struct logint *li; 383 char abuf[INET6_ADDRSTRLEN]; 384 385 li = pii->pii_probe_logint; 386 if ((li != NULL) && !li->li_dupaddr && 387 li->li_dupaddrmsg_printed) { 388 logerr("Test address %s is unique; enabling probe-" 389 "based failure detection\n", 390 pr_addr(pii->pii_af, li->li_addr, abuf, 391 sizeof (abuf))); 392 li->li_dupaddrmsg_printed = 0; 393 } 394 } 395 396 /* 397 * Scan for phyints and logints that have disappeared from the 398 * kernel, and delete them. 399 */ 400 pii = phyint_instances; 401 402 while (pii != NULL) { 403 next_pii = pii->pii_next; 404 check_if_removed(pii); 405 pii = next_pii; 406 } 407 408 /* 409 * Select a test address for sending probes on each phyint instance 410 */ 411 select_test_ifs(); 412 413 /* 414 * Handle link up/down notifications from the NICs. 415 */ 416 process_link_state_changes(); 417 418 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 419 /* 420 * If this is a case of group failure, we don't have much 421 * to do until the group recovers again. 422 */ 423 if (GROUP_FAILED(pi->pi_group)) 424 continue; 425 426 /* 427 * Try/Retry any pending failovers / failbacks, that did not 428 * not complete, or that could not be initiated previously. 429 * This implements the 3 invariants described in the big block 430 * comment at the beginning of probe.c 431 */ 432 if (pi->pi_flags & IFF_INACTIVE) { 433 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 434 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 435 } else { 436 struct phyint_instance *pii; 437 438 pii = pi->pi_v4; 439 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 440 pii = pi->pi_v6; 441 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 442 continue; 443 /* 444 * It is possible that the phyint has started 445 * receiving packets, after it has been marked 446 * PI_FAILED. Don't initiate failover, if the 447 * phyint has started recovering. failure_state() 448 * captures this check. A similar logic is used 449 * for failback/repair case. 450 */ 451 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 452 (failure_state(pii) == PHYINT_FAILURE)) { 453 (void) try_failover(pi, FAILOVER_NORMAL); 454 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 455 if (try_failback(pi, _B_FALSE) != 456 IPMP_FAILURE) { 457 (void) change_lif_flags(pi, IFF_FAILED, 458 _B_FALSE); 459 /* Per state diagram */ 460 pi->pi_empty = 0; 461 } 462 } 463 } 464 } 465 } 466 467 /* 468 * Check that test/probe addresses are always unique. link-locals and 469 * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER) 470 * address can produce unexpected results. Log an error and alert the user. 471 */ 472 static void 473 check_addr_unique(int af, char *name) 474 { 475 struct lifreq lifr; 476 struct phyint *pi; 477 struct in6_addr addr; 478 struct phyint_instance *pii; 479 struct sockaddr_in *sin; 480 struct sockaddr_in6 *sin6; 481 int ifsock; 482 char abuf[INET6_ADDRSTRLEN]; 483 484 /* Get the socket for doing ioctls */ 485 ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 486 487 (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); 488 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 489 /* 490 * Get the address corresponding to 'name'. We cannot 491 * do a logint lookup in our tables, because, not all logints 492 * in the system are tracked by mpathd. (eg. things not in a group) 493 */ 494 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 495 if (errno == ENXIO) { 496 /* Interface has vanished */ 497 return; 498 } else { 499 logperror("ioctl (get addr)"); 500 return; 501 } 502 } 503 504 if (af == AF_INET) { 505 sin = (struct sockaddr_in *)&lifr.lifr_addr; 506 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 507 } else { 508 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 509 addr = sin6->sin6_addr; 510 } 511 512 /* 513 * Does the address 'addr' match any known test address ? If so 514 * it is a duplicate, unless we are looking at the same logint 515 */ 516 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 517 pii = PHYINT_INSTANCE(pi, af); 518 if (pii == NULL || pii->pii_probe_logint == NULL) 519 continue; 520 521 if (!IN6_ARE_ADDR_EQUAL(&addr, 522 &pii->pii_probe_logint->li_addr)) { 523 continue; 524 } 525 526 if (strncmp(pii->pii_probe_logint->li_name, name, 527 sizeof (pii->pii_probe_logint->li_name)) == 0) { 528 continue; 529 } 530 531 /* 532 * This test address is not unique. Set the dupaddr bit 533 */ 534 pii->pii_probe_logint->li_dupaddr = 1; 535 536 /* 537 * Log an error message if not already logged 538 */ 539 if (pii->pii_probe_logint->li_dupaddrmsg_printed) 540 continue; 541 542 logerr("Test address %s is not unique; disabling " 543 "probe-based failure detection\n", 544 pr_addr(af, addr, abuf, sizeof (abuf))); 545 546 pii->pii_probe_logint->li_dupaddrmsg_printed = 1; 547 } 548 } 549 550 /* 551 * Stop probing an interface. Called when an interface is offlined. 552 * The probe socket is closed on each interface instance, and the 553 * interface state set to PI_OFFLINE. 554 */ 555 static void 556 stop_probing(struct phyint *pi) 557 { 558 struct phyint_instance *pii; 559 560 pii = pi->pi_v4; 561 if (pii != NULL) { 562 if (pii->pii_probe_sock != -1) 563 close_probe_socket(pii, _B_TRUE); 564 pii->pii_probe_logint = NULL; 565 } 566 567 pii = pi->pi_v6; 568 if (pii != NULL) { 569 if (pii->pii_probe_sock != -1) 570 close_probe_socket(pii, _B_TRUE); 571 pii->pii_probe_logint = NULL; 572 } 573 574 phyint_chstate(pi, PI_OFFLINE); 575 } 576 577 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 578 579 /* 580 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 581 * IFF_UP must also be set so that the associated address can be used as a 582 * source address. Further, we must be able to exchange packets with local 583 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 584 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 585 */ 586 static int 587 rate_testflags(uint64_t flags) 588 { 589 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 590 return (BAD_TESTFLAGS); 591 592 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 593 return (BAD_TESTFLAGS); 594 595 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 596 return (BEST_TESTFLAGS); 597 598 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 599 return (BEST_TESTFLAGS); 600 601 return (OK_TESTFLAGS); 602 } 603 604 /* 605 * Attempt to select a test address for each phyint instance. 606 * Call phyint_inst_sockinit() to complete the initializations. 607 */ 608 static void 609 select_test_ifs(void) 610 { 611 struct phyint *pi; 612 struct phyint_instance *pii; 613 struct phyint_instance *next_pii; 614 struct logint *li; 615 struct logint *probe_logint; 616 boolean_t target_scan_reqd = _B_FALSE; 617 struct target *tg; 618 int rating; 619 620 if (debug & D_PHYINT) 621 logdebug("select_test_ifs\n"); 622 623 /* 624 * For each phyint instance, do the test address selection 625 */ 626 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 627 next_pii = pii->pii_next; 628 probe_logint = NULL; 629 630 /* 631 * An interface that is offline, should not be probed. 632 * Offline interfaces should always in PI_OFFLINE state, 633 * unless some other entity has set the offline flag. 634 */ 635 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 636 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 637 logerr("shouldn't be probing offline" 638 " interface %s (state is: %u)." 639 " Stopping probes.\n", 640 pii->pii_phyint->pi_name, 641 pii->pii_phyint->pi_state); 642 stop_probing(pii->pii_phyint); 643 } 644 continue; 645 } 646 647 li = pii->pii_probe_logint; 648 if (li != NULL) { 649 /* 650 * We've already got a test address; only proceed 651 * if it's suboptimal. 652 */ 653 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 654 continue; 655 } 656 657 /* 658 * Walk the logints of this phyint instance, and select 659 * the best available test address 660 */ 661 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 662 /* 663 * Skip any IPv6 logints that are not link-local, 664 * since we should always have a link-local address 665 * anyway and in6_data() expects link-local replies. 666 */ 667 if (pii->pii_af == AF_INET6 && 668 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 669 continue; 670 671 /* 672 * Rate the testflags. If we've found an optimal 673 * match, then break out; otherwise, record the most 674 * recent OK one. 675 */ 676 rating = rate_testflags(li->li_flags); 677 if (rating == BAD_TESTFLAGS) 678 continue; 679 680 probe_logint = li; 681 if (rating == BEST_TESTFLAGS) 682 break; 683 } 684 685 /* 686 * If the probe logint has changed, ditch the old one. 687 */ 688 if (pii->pii_probe_logint != NULL && 689 pii->pii_probe_logint != probe_logint) { 690 if (pii->pii_probe_sock != -1) 691 close_probe_socket(pii, _B_TRUE); 692 pii->pii_probe_logint = NULL; 693 } 694 695 if (probe_logint == NULL) { 696 /* 697 * We don't have a test address. Don't print an 698 * error message immediately. check_config() will 699 * take care of it. Zero out the probe stats array 700 * since it is no longer relevant. Optimize by 701 * checking if it is already zeroed out. 702 */ 703 int pr_ndx; 704 705 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 706 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 707 clear_pii_probe_stats(pii); 708 reset_crtt_all(pii->pii_phyint); 709 } 710 continue; 711 } else if (probe_logint == pii->pii_probe_logint) { 712 /* 713 * If we didn't find any new test addr, go to the 714 * next phyint. 715 */ 716 continue; 717 } 718 719 /* 720 * The phyint is either being assigned a new testaddr 721 * or is being assigned a testaddr for the 1st time. 722 * Need to initialize the phyint socket 723 */ 724 pii->pii_probe_logint = probe_logint; 725 if (!phyint_inst_sockinit(pii)) { 726 if (debug & D_PHYINT) { 727 logdebug("select_test_ifs: " 728 "phyint_sockinit failed\n"); 729 } 730 phyint_inst_delete(pii); 731 continue; 732 } 733 734 /* 735 * This phyint instance is now enabled for probes; this 736 * impacts our state machine in two ways: 737 * 738 * 1. If we're probe *capable* as well (i.e., we have 739 * probe targets) and the interface is in PI_NOTARGETS, 740 * then transition to PI_RUNNING. 741 * 742 * 2. If we're not probe capable, and the other phyint 743 * instance is also not probe capable, and we were in 744 * PI_RUNNING, then transition to PI_NOTARGETS. 745 * 746 * Also see the state diagram in mpd_probe.c. 747 */ 748 if (PROBE_CAPABLE(pii)) { 749 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 750 phyint_chstate(pii->pii_phyint, PI_RUNNING); 751 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 752 if (pii->pii_phyint->pi_state == PI_RUNNING) 753 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 754 } 755 756 if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { 757 tg = pii->pii_targets; 758 if (tg != NULL) 759 target_delete(tg); 760 assert(pii->pii_targets == NULL); 761 assert(pii->pii_target_next == NULL); 762 assert(pii->pii_ntargets == 0); 763 target_create(pii, probe_logint->li_dstaddr, 764 _B_TRUE); 765 } 766 767 /* 768 * If no targets are currently known for this phyint 769 * we need to call init_router_targets. Since 770 * init_router_targets() initializes the list of targets 771 * for all phyints it is done below the loop. 772 */ 773 if (pii->pii_targets == NULL) 774 target_scan_reqd = _B_TRUE; 775 776 /* 777 * Start the probe timer for this instance. 778 */ 779 if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) { 780 start_timer(pii); 781 pii->pii_basetime_inited = 1; 782 } 783 } 784 785 /* 786 * Check the interface list for any interfaces that are marked 787 * PI_FAILED but no longer enabled to send probes, and call 788 * phyint_check_for_repair() to see if the link now indicates that the 789 * interface should be repaired. Also see the state diagram in 790 * mpd_probe.c. 791 */ 792 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 793 if (pi->pi_state == PI_FAILED && 794 !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 795 phyint_check_for_repair(pi); 796 } 797 } 798 799 /* 800 * Try to populate the target list. init_router_targets populates 801 * the target list from the routing table. If our target list is 802 * still empty, init_host_targets adds host targets based on the 803 * host target list of other phyints in the group. 804 */ 805 if (target_scan_reqd) { 806 init_router_targets(); 807 init_host_targets(); 808 } 809 } 810 811 /* 812 * Check phyint group configuration, to detect any inconsistencies, 813 * and log an error message. This is called from runtimeouts every 814 * 20 secs. But the error message is displayed once. If the 815 * consistency is resolved by the admin, a recovery message is displayed 816 * once. 817 */ 818 static void 819 check_config(void) 820 { 821 struct phyint_group *pg; 822 struct phyint *pi; 823 boolean_t v4_in_group; 824 boolean_t v6_in_group; 825 826 /* 827 * All phyints of a group must be homogenous to ensure that 828 * failover or failback can be done. If any phyint in a group 829 * has IPv4 plumbed, check that all phyints have IPv4 plumbed. 830 * Do a similar check for IPv6. 831 */ 832 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 833 if (pg == phyint_anongroup) 834 continue; 835 836 v4_in_group = _B_FALSE; 837 v6_in_group = _B_FALSE; 838 /* 839 * 1st pass. Determine if at least 1 phyint in the group 840 * has IPv4 plumbed and if so set v4_in_group to true. 841 * Repeat similarly for IPv6. 842 */ 843 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 844 if (pi->pi_v4 != NULL) 845 v4_in_group = _B_TRUE; 846 if (pi->pi_v6 != NULL) 847 v6_in_group = _B_TRUE; 848 } 849 850 /* 851 * 2nd pass. If v4_in_group is true, check that phyint 852 * has IPv4 plumbed. Repeat similarly for IPv6. Print 853 * out a message the 1st time only. 854 */ 855 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 856 if (pi->pi_flags & IFF_OFFLINE) 857 continue; 858 859 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 860 if (!pi->pi_cfgmsg_printed) { 861 logerr("NIC %s of group %s is" 862 " not plumbed for IPv4 and may" 863 " affect failover capability\n", 864 pi->pi_name, 865 pi->pi_group->pg_name); 866 pi->pi_cfgmsg_printed = 1; 867 } 868 } else if (v6_in_group == _B_TRUE && 869 pi->pi_v6 == NULL) { 870 if (!pi->pi_cfgmsg_printed) { 871 logerr("NIC %s of group %s is" 872 " not plumbed for IPv6 and may" 873 " affect failover capability\n", 874 pi->pi_name, 875 pi->pi_group->pg_name); 876 pi->pi_cfgmsg_printed = 1; 877 } 878 } else { 879 /* 880 * The phyint matches the group configuration, 881 * if we have reached this point. If it was 882 * improperly configured earlier, log an 883 * error recovery message 884 */ 885 if (pi->pi_cfgmsg_printed) { 886 logerr("NIC %s is now consistent with " 887 "group %s and failover capability " 888 "is restored\n", pi->pi_name, 889 pi->pi_group->pg_name); 890 pi->pi_cfgmsg_printed = 0; 891 } 892 } 893 894 } 895 } 896 897 /* 898 * In order to perform probe-based failure detection, a phyint must 899 * have at least 1 test/probe address for sending and receiving probes 900 * (either on IPv4 or IPv6 instance or both). If no test address has 901 * been configured, notify the administrator, but continue on since we 902 * can still perform load spreading, along with "link up/down" based 903 * failure detection. 904 */ 905 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 906 if (pi->pi_flags & IFF_OFFLINE) 907 continue; 908 909 if ((pi->pi_v4 == NULL || 910 pi->pi_v4->pii_probe_logint == NULL) && 911 (pi->pi_v6 == NULL || 912 pi->pi_v6->pii_probe_logint == NULL)) { 913 if (!pi->pi_taddrmsg_printed) { 914 logerr("No test address configured on " 915 "interface %s; disabling probe-based " 916 "failure detection on it\n", pi->pi_name); 917 pi->pi_taddrmsg_printed = 1; 918 } 919 } else if (pi->pi_taddrmsg_printed) { 920 logerr("Test address now configured on interface %s; " 921 "enabling probe-based failure detection on it\n", 922 pi->pi_name); 923 pi->pi_taddrmsg_printed = 0; 924 } 925 926 } 927 } 928 929 /* 930 * Timer mechanism using relative time (in milliseconds) from the 931 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 932 * will fire after TIMER_INFINITY milliseconds. 933 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 934 * time values. Hence 2 consecutive timer events cannot be spaced farther 935 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 936 * that can be passed for the delay parameter of timer_schedule() 937 */ 938 static uint_t timer_next; /* Currently scheduled timeout */ 939 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 940 941 static void 942 timer_init(void) 943 { 944 timer_next = getcurrenttime() + TIMER_INFINITY; 945 /* 946 * The call to run_timeouts() will get the timer started 947 * Since there are no phyints at this point, the timer will 948 * be set for IF_SCAN_INTERVAL ms. 949 */ 950 run_timeouts(); 951 } 952 953 /* 954 * Make sure the next SIGALRM occurs delay milliseconds from the current 955 * time if not earlier. We are interested only in time differences. 956 */ 957 void 958 timer_schedule(uint_t delay) 959 { 960 uint_t now; 961 struct itimerval itimerval; 962 963 if (debug & D_TIMER) 964 logdebug("timer_schedule(%u)\n", delay); 965 966 assert(delay <= TIMER_INFINITY); 967 968 now = getcurrenttime(); 969 if (delay == 0) { 970 /* Minimum allowed delay */ 971 delay = 1; 972 } 973 /* Will this timer occur before the currently scheduled SIGALRM? */ 974 if (timer_active && TIME_GE(now + delay, timer_next)) { 975 if (debug & D_TIMER) { 976 logdebug("timer_schedule(%u) - no action: " 977 "now %u next %u\n", delay, now, timer_next); 978 } 979 return; 980 } 981 timer_next = now + delay; 982 983 itimerval.it_value.tv_sec = delay / 1000; 984 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 985 itimerval.it_interval.tv_sec = 0; 986 itimerval.it_interval.tv_usec = 0; 987 if (debug & D_TIMER) { 988 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 989 delay, itimerval.it_value.tv_sec, 990 itimerval.it_value.tv_usec); 991 } 992 timer_active = _B_TRUE; 993 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 994 logperror("timer_schedule: setitimer"); 995 exit(2); 996 } 997 } 998 999 /* 1000 * Timer has fired. Determine when the next timer event will occur by asking 1001 * all the timer routines. Should not be called from a timer routine. 1002 */ 1003 static void 1004 run_timeouts(void) 1005 { 1006 uint_t next; 1007 uint_t next_event_time; 1008 struct phyint_instance *pii; 1009 struct phyint_instance *next_pii; 1010 static boolean_t timeout_running; 1011 1012 /* assert that recursive timeouts don't happen. */ 1013 assert(!timeout_running); 1014 1015 timeout_running = _B_TRUE; 1016 1017 if (debug & D_TIMER) 1018 logdebug("run_timeouts()\n"); 1019 1020 next = TIMER_INFINITY; 1021 1022 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1023 next_pii = pii->pii_next; 1024 next_event_time = phyint_inst_timer(pii); 1025 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1026 next = next_event_time; 1027 1028 if (debug & D_TIMER) { 1029 logdebug("run_timeouts(%s %s): next scheduled for" 1030 " this phyint inst %u, next scheduled global" 1031 " %u ms\n", 1032 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1033 next_event_time, next); 1034 } 1035 } 1036 1037 /* 1038 * Make sure initifs() is called at least once every 1039 * IF_SCAN_INTERVAL, to make sure that we are in sync 1040 * with the kernel, in case we have missed any routing 1041 * socket messages. 1042 */ 1043 if (next > IF_SCAN_INTERVAL) 1044 next = IF_SCAN_INTERVAL; 1045 1046 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1047 initifs(); 1048 check_config(); 1049 } 1050 1051 if (debug & D_TIMER) 1052 logdebug("run_timeouts: %u ms\n", next); 1053 1054 timer_schedule(next); 1055 timeout_running = _B_FALSE; 1056 } 1057 1058 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1059 static int eventpipe_write = -1; 1060 static boolean_t cleanup_started = _B_FALSE; 1061 /* Don't write to eventpipe if in cleanup */ 1062 /* 1063 * Ensure that signals are processed synchronously with the rest of 1064 * the code by just writing a one character signal number on the pipe. 1065 * The poll loop will pick this up and process the signal event. 1066 */ 1067 static void 1068 sig_handler(int signo) 1069 { 1070 uchar_t buf = (uchar_t)signo; 1071 1072 /* 1073 * Don't write to pipe if cleanup has already begun. cleanup() 1074 * might have closed the pipe already 1075 */ 1076 if (cleanup_started) 1077 return; 1078 1079 if (eventpipe_write == -1) { 1080 logerr("sig_handler: no pipe found\n"); 1081 return; 1082 } 1083 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1084 logperror("sig_handler: write"); 1085 } 1086 1087 extern struct probes_missed probes_missed; 1088 1089 /* 1090 * Pick up a signal "byte" from the pipe and process it. 1091 */ 1092 static void 1093 in_signal(int fd) 1094 { 1095 uchar_t buf; 1096 uint64_t sent, acked, lost, unacked, unknown; 1097 struct phyint_instance *pii; 1098 int pr_ndx; 1099 1100 switch (read(fd, &buf, sizeof (buf))) { 1101 case -1: 1102 logperror("in_signal: read"); 1103 exit(1); 1104 /* NOTREACHED */ 1105 case 1: 1106 break; 1107 case 0: 1108 logerr("in_signal: read end of file\n"); 1109 exit(1); 1110 /* NOTREACHED */ 1111 default: 1112 logerr("in_signal: read > 1\n"); 1113 exit(1); 1114 } 1115 1116 if (debug & D_TIMER) 1117 logdebug("in_signal() got %d\n", buf); 1118 1119 switch (buf) { 1120 case SIGALRM: 1121 if (debug & D_TIMER) { 1122 uint_t now = getcurrenttime(); 1123 1124 logdebug("in_signal(SIGALRM) delta %u\n", 1125 now - timer_next); 1126 } 1127 timer_active = _B_FALSE; 1128 run_timeouts(); 1129 break; 1130 case SIGUSR1: 1131 logdebug("Printing configuration:\n"); 1132 /* Print out the internal tables */ 1133 phyint_inst_print_all(); 1134 1135 /* 1136 * Print out the accumulated statistics about missed 1137 * probes (happens due to scheduling delay). 1138 */ 1139 logerr("Missed sending total of %d probes spread over" 1140 " %d occurrences\n", probes_missed.pm_nprobes, 1141 probes_missed.pm_ntimes); 1142 1143 /* 1144 * Print out the accumulated statistics about probes 1145 * that were sent. 1146 */ 1147 for (pii = phyint_instances; pii != NULL; 1148 pii = pii->pii_next) { 1149 unacked = 0; 1150 acked = pii->pii_cum_stats.acked; 1151 lost = pii->pii_cum_stats.lost; 1152 sent = pii->pii_cum_stats.sent; 1153 unknown = pii->pii_cum_stats.unknown; 1154 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1155 switch (pii->pii_probes[pr_ndx].pr_status) { 1156 case PR_ACKED: 1157 acked++; 1158 break; 1159 case PR_LOST: 1160 lost++; 1161 break; 1162 case PR_UNACKED: 1163 unacked++; 1164 break; 1165 } 1166 } 1167 logerr("\nProbe stats on (%s %s)\n" 1168 "Number of probes sent %lld\n" 1169 "Number of probe acks received %lld\n" 1170 "Number of probes/acks lost %lld\n" 1171 "Number of valid unacknowled probes %lld\n" 1172 "Number of ambiguous probe acks received %lld\n", 1173 AF_STR(pii->pii_af), pii->pii_name, 1174 sent, acked, lost, unacked, unknown); 1175 } 1176 break; 1177 case SIGHUP: 1178 logerr("SIGHUP: restart and reread config file\n"); 1179 cleanup(); 1180 (void) execv(argv0[0], argv0); 1181 _exit(0177); 1182 /* NOTREACHED */ 1183 case SIGINT: 1184 case SIGTERM: 1185 case SIGQUIT: 1186 cleanup(); 1187 exit(0); 1188 /* NOTREACHED */ 1189 default: 1190 logerr("in_signal: unknown signal: %d\n", buf); 1191 } 1192 } 1193 1194 static void 1195 cleanup(void) 1196 { 1197 struct phyint_instance *pii; 1198 struct phyint_instance *next_pii; 1199 1200 /* 1201 * Make sure that we don't write to eventpipe in 1202 * sig_handler() if any signal notably SIGALRM, 1203 * occurs after we close the eventpipe descriptor below 1204 */ 1205 cleanup_started = _B_TRUE; 1206 1207 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1208 next_pii = pii->pii_next; 1209 phyint_inst_delete(pii); 1210 } 1211 1212 (void) close(ifsock_v4); 1213 (void) close(ifsock_v6); 1214 (void) close(rtsock_v4); 1215 (void) close(rtsock_v6); 1216 (void) close(lsock_v4); 1217 (void) close(lsock_v6); 1218 (void) close(0); 1219 (void) close(1); 1220 (void) close(2); 1221 (void) close(mibfd); 1222 (void) close(eventpipe_read); 1223 (void) close(eventpipe_write); 1224 } 1225 1226 /* 1227 * Create pipe for signal delivery and set up signal handlers. 1228 */ 1229 static void 1230 setup_eventpipe(void) 1231 { 1232 int fds[2]; 1233 struct sigaction act; 1234 1235 if ((pipe(fds)) < 0) { 1236 logperror("setup_eventpipe: pipe"); 1237 exit(1); 1238 } 1239 eventpipe_read = fds[0]; 1240 eventpipe_write = fds[1]; 1241 if (poll_add(eventpipe_read) == -1) { 1242 exit(1); 1243 } 1244 1245 act.sa_handler = sig_handler; 1246 act.sa_flags = SA_RESTART; 1247 (void) sigaction(SIGALRM, &act, NULL); 1248 1249 (void) sigset(SIGHUP, sig_handler); 1250 (void) sigset(SIGUSR1, sig_handler); 1251 (void) sigset(SIGTERM, sig_handler); 1252 (void) sigset(SIGINT, sig_handler); 1253 (void) sigset(SIGQUIT, sig_handler); 1254 } 1255 1256 /* 1257 * Create a routing socket for receiving RTM_IFINFO messages. 1258 */ 1259 static int 1260 setup_rtsock(int af) 1261 { 1262 int s; 1263 int flags; 1264 1265 s = socket(PF_ROUTE, SOCK_RAW, af); 1266 if (s == -1) { 1267 logperror("setup_rtsock: socket PF_ROUTE"); 1268 exit(1); 1269 } 1270 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1271 logperror("setup_rtsock: fcntl F_GETFL"); 1272 (void) close(s); 1273 exit(1); 1274 } 1275 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1276 logperror("setup_rtsock: fcntl F_SETFL"); 1277 (void) close(s); 1278 exit(1); 1279 } 1280 if (poll_add(s) == -1) { 1281 (void) close(s); 1282 exit(1); 1283 } 1284 return (s); 1285 } 1286 1287 /* 1288 * Process an RTM_IFINFO message received on a routing socket. 1289 * The return value indicates whether a full interface scan is required. 1290 * Link up/down notifications from the NICs are reflected in the 1291 * IFF_RUNNING flag. 1292 * If just the state of the IFF_RUNNING interface flag has changed, a 1293 * a full interface scan isn't required. 1294 */ 1295 static boolean_t 1296 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1297 { 1298 struct sockaddr_dl *sdl; 1299 struct phyint *pi; 1300 uint64_t old_flags; 1301 struct phyint_instance *pii; 1302 1303 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1304 1305 /* 1306 * Although the sockaddr_dl structure is directly after the 1307 * if_msghdr_t structure. At the time of writing, the size of the 1308 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1309 * to the presence of a timeval structure, which contains longs, 1310 * in the if_data structure. Anyway, we know where the message ends, 1311 * so we work backwards to get the start of the sockaddr_dl structure. 1312 */ 1313 /*LINTED*/ 1314 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1315 sizeof (struct sockaddr_dl)); 1316 1317 assert(sdl->sdl_family == AF_LINK); 1318 1319 /* 1320 * The interface name is in sdl_data. 1321 * RTM_IFINFO messages are only generated for logical interface 1322 * zero, so there is no colon and logical interface number to 1323 * strip from the name. The name is not null terminated, but 1324 * there should be enough space in sdl_data to add the null. 1325 */ 1326 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1327 if (debug & D_LINKNOTE) 1328 logdebug("process_rtm_ifinfo: " 1329 "phyint name too long\n"); 1330 return (_B_TRUE); 1331 } 1332 sdl->sdl_data[sdl->sdl_nlen] = 0; 1333 1334 pi = phyint_lookup(sdl->sdl_data); 1335 if (pi == NULL) { 1336 if (debug & D_LINKNOTE) 1337 logdebug("process_rtm_ifinfo: phyint lookup failed" 1338 " for %s\n", sdl->sdl_data); 1339 return (_B_TRUE); 1340 } 1341 1342 /* 1343 * We want to try and avoid doing a full interface scan for 1344 * link state notifications from the NICs, as indicated 1345 * by the state of the IFF_RUNNING flag. If just the 1346 * IFF_RUNNING flag has changed state, the link state changes 1347 * are processed without a full scan. 1348 * If there is both an IPv4 and IPv6 instance associated with 1349 * the physical interface, we will get an RTM_IFINFO message 1350 * for each instance. If we just maintained a single copy of 1351 * the physical interface flags, it would appear that no flags 1352 * had changed when the second message is processed, leading us 1353 * to believe that the message wasn't generated by a flags change, 1354 * and that a full interface scan is required. 1355 * To get around this problem, two additional copies of the flags 1356 * are kept, one copy for each instance. These are only used in 1357 * this routine. At any one time, all three copies of the flags 1358 * should be identical except for the IFF_RUNNING flag. The 1359 * copy of the flags in the "phyint" structure is always up to 1360 * date. 1361 */ 1362 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1363 if (pii == NULL) { 1364 if (debug & D_LINKNOTE) 1365 logdebug("process_rtm_ifinfo: no instance of address " 1366 "family %s for %s\n", AF_STR(type), pi->pi_name); 1367 return (_B_TRUE); 1368 } 1369 1370 old_flags = pii->pii_flags; 1371 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1372 pi->pi_flags = pii->pii_flags; 1373 1374 if (debug & D_LINKNOTE) { 1375 logdebug("process_rtm_ifinfo: %s address family: %s, " 1376 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1377 AF_STR(type), old_flags, pi->pi_flags); 1378 } 1379 1380 /* 1381 * If IFF_STANDBY has changed, indicate that the interface has changed 1382 * types. 1383 */ 1384 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1385 phyint_newtype(pi); 1386 1387 /* 1388 * If IFF_INACTIVE has been set, then no data addresses should be 1389 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1390 * move previously failed-over addresses back to it, provided it is 1391 * not failed. For details, see the state diagram in mpd_probe.c. 1392 */ 1393 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1394 if (pii->pii_flags & IFF_INACTIVE) { 1395 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 1396 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1397 } else { 1398 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1399 pi->pi_empty = 0; 1400 (void) try_failback(pi, _B_FALSE); 1401 } 1402 } 1403 } 1404 1405 /* Has just the IFF_RUNNING flag changed state ? */ 1406 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1407 struct phyint_instance *pii_other; 1408 /* 1409 * It wasn't just a link state change. Update 1410 * the other instance's copy of the flags. 1411 */ 1412 pii_other = phyint_inst_other(pii); 1413 if (pii_other != NULL) 1414 pii_other->pii_flags = pii->pii_flags; 1415 return (_B_TRUE); 1416 } 1417 1418 return (_B_FALSE); 1419 } 1420 1421 /* 1422 * Retrieve as many routing socket messages as possible, and try to 1423 * empty the routing sockets. Initiate full scan of targets or interfaces 1424 * as needed. 1425 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1426 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1427 */ 1428 static void 1429 process_rtsock(int rtsock_v4, int rtsock_v6) 1430 { 1431 int nbytes; 1432 int64_t msg[2048 / 8]; 1433 struct rt_msghdr *rtm; 1434 boolean_t need_if_scan = _B_FALSE; 1435 boolean_t need_rt_scan = _B_FALSE; 1436 boolean_t rtm_ifinfo_seen = _B_FALSE; 1437 int type; 1438 1439 /* Read as many messages as possible and try to empty the sockets */ 1440 for (type = AF_INET; ; type = AF_INET6) { 1441 for (;;) { 1442 nbytes = read((type == AF_INET) ? rtsock_v4 : 1443 rtsock_v6, msg, sizeof (msg)); 1444 if (nbytes <= 0) { 1445 /* No more messages */ 1446 break; 1447 } 1448 rtm = (struct rt_msghdr *)msg; 1449 if (rtm->rtm_version != RTM_VERSION) { 1450 logerr("process_rtsock: version %d " 1451 "not understood\n", rtm->rtm_version); 1452 break; 1453 } 1454 1455 if (debug & D_PHYINT) { 1456 logdebug("process_rtsock: message %d\n", 1457 rtm->rtm_type); 1458 } 1459 1460 switch (rtm->rtm_type) { 1461 case RTM_NEWADDR: 1462 case RTM_DELADDR: 1463 /* 1464 * Some logical interface has changed, 1465 * have to scan everything to determine 1466 * what actually changed. 1467 */ 1468 need_if_scan = _B_TRUE; 1469 break; 1470 1471 case RTM_IFINFO: 1472 rtm_ifinfo_seen = _B_TRUE; 1473 need_if_scan |= 1474 process_rtm_ifinfo((if_msghdr_t *)rtm, 1475 type); 1476 break; 1477 1478 case RTM_ADD: 1479 case RTM_DELETE: 1480 case RTM_CHANGE: 1481 case RTM_OLDADD: 1482 case RTM_OLDDEL: 1483 need_rt_scan = _B_TRUE; 1484 break; 1485 1486 default: 1487 /* Not interesting */ 1488 break; 1489 } 1490 } 1491 if (type == AF_INET6) 1492 break; 1493 } 1494 1495 if (need_if_scan) { 1496 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1497 logdebug("process_rtsock: synchronizing with kernel\n"); 1498 initifs(); 1499 } else if (rtm_ifinfo_seen) { 1500 if (debug & D_LINKNOTE) 1501 logdebug("process_rtsock: " 1502 "link up/down notification(s) seen\n"); 1503 process_link_state_changes(); 1504 } 1505 1506 if (need_rt_scan) 1507 init_router_targets(); 1508 } 1509 1510 /* 1511 * Look if the phyint instance or one of its logints have been removed from 1512 * the kernel and take appropriate action. 1513 * Uses {pii,li}_in_use. 1514 */ 1515 static void 1516 check_if_removed(struct phyint_instance *pii) 1517 { 1518 struct logint *li; 1519 struct logint *next_li; 1520 1521 /* Detect phyints that have been removed from the kernel. */ 1522 if (!pii->pii_in_use) { 1523 logtrace("%s %s has been removed from kernel\n", 1524 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1525 phyint_inst_delete(pii); 1526 } else { 1527 /* Detect logints that have been removed. */ 1528 for (li = pii->pii_logint; li != NULL; li = next_li) { 1529 next_li = li->li_next; 1530 if (!li->li_in_use) { 1531 logint_delete(li); 1532 } 1533 } 1534 } 1535 } 1536 1537 /* 1538 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1539 * tables defined by mib2.h. Parse the returned data and extract 1540 * the 'routing' information table. Process the 'routing' table 1541 * to get the list of known onlink routers, and update our database. 1542 * These onlink routers will serve as our probe targets. 1543 * Returns false, if any system calls resulted in errors, true otherwise. 1544 */ 1545 static boolean_t 1546 update_router_list(int fd) 1547 { 1548 union { 1549 char ubuf[1024]; 1550 union T_primitives uprim; 1551 } buf; 1552 1553 int flags; 1554 struct strbuf ctlbuf; 1555 struct strbuf databuf; 1556 struct T_optmgmt_req *tor; 1557 struct T_optmgmt_ack *toa; 1558 struct T_error_ack *tea; 1559 struct opthdr *optp; 1560 struct opthdr *req; 1561 int status; 1562 t_scalar_t prim; 1563 1564 tor = (struct T_optmgmt_req *)&buf; 1565 1566 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1567 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1568 tor->OPT_length = sizeof (struct opthdr); 1569 tor->MGMT_flags = T_CURRENT; 1570 1571 req = (struct opthdr *)&tor[1]; 1572 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1573 req->name = 0; 1574 req->len = 0; 1575 1576 ctlbuf.buf = (char *)&buf; 1577 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1578 ctlbuf.maxlen = sizeof (buf); 1579 flags = 0; 1580 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1581 logperror("update_router_list: putmsg(ctl)"); 1582 return (_B_FALSE); 1583 } 1584 1585 /* 1586 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1587 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 1588 * a control and data part. The control part contains a struct 1589 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1590 * the level, name and length of the data in the data part. The 1591 * data part contains the actual table data. The last message 1592 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1593 * single option with zero optlen. 1594 */ 1595 1596 for (;;) { 1597 /* 1598 * Go around this loop once for each table. Ignore 1599 * all tables except the routing information table. 1600 */ 1601 flags = 0; 1602 status = getmsg(fd, &ctlbuf, NULL, &flags); 1603 if (status < 0) { 1604 if (errno == EINTR) 1605 continue; 1606 logperror("update_router_list: getmsg(ctl)"); 1607 return (_B_FALSE); 1608 } 1609 if (ctlbuf.len < sizeof (t_scalar_t)) { 1610 logerr("update_router_list: ctlbuf.len %d\n", 1611 ctlbuf.len); 1612 return (_B_FALSE); 1613 } 1614 1615 prim = buf.uprim.type; 1616 1617 switch (prim) { 1618 1619 case T_ERROR_ACK: 1620 tea = &buf.uprim.error_ack; 1621 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1622 logerr("update_router_list: T_ERROR_ACK" 1623 " ctlbuf.len %d\n", ctlbuf.len); 1624 return (_B_FALSE); 1625 } 1626 logerr("update_router_list: T_ERROR_ACK:" 1627 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1628 tea->TLI_error, tea->UNIX_error); 1629 return (_B_FALSE); 1630 1631 case T_OPTMGMT_ACK: 1632 toa = &buf.uprim.optmgmt_ack; 1633 optp = (struct opthdr *)&toa[1]; 1634 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1635 logerr("update_router_list: ctlbuf.len %d\n", 1636 ctlbuf.len); 1637 return (_B_FALSE); 1638 } 1639 if (toa->MGMT_flags != T_SUCCESS) { 1640 logerr("update_router_list: MGMT_flags 0x%lx\n", 1641 toa->MGMT_flags); 1642 return (_B_FALSE); 1643 } 1644 break; 1645 1646 default: 1647 logerr("update_router_list: unknown primitive %ld\n", 1648 prim); 1649 return (_B_FALSE); 1650 } 1651 1652 /* Process the T_OPGMGMT_ACK below */ 1653 assert(prim == T_OPTMGMT_ACK); 1654 1655 switch (status) { 1656 case 0: 1657 /* 1658 * We have reached the end of this T_OPTMGMT_ACK 1659 * message. If this is the last message i.e EOD, 1660 * return, else process the next T_OPTMGMT_ACK msg. 1661 */ 1662 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1663 sizeof (struct opthdr)) && optp->len == 0 && 1664 optp->name == 0 && optp->level == 0) { 1665 /* 1666 * This is the EOD message. Return 1667 */ 1668 return (_B_TRUE); 1669 } 1670 continue; 1671 1672 case MORECTL: 1673 case MORECTL | MOREDATA: 1674 /* 1675 * This should not happen. We should be able to read 1676 * the control portion in a single getmsg. 1677 */ 1678 logerr("update_router_list: MORECTL\n"); 1679 return (_B_FALSE); 1680 1681 case MOREDATA: 1682 databuf.maxlen = optp->len; 1683 /* malloc of 0 bytes is ok */ 1684 databuf.buf = malloc((size_t)optp->len); 1685 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1686 logperror("update_router_list: malloc"); 1687 return (_B_FALSE); 1688 } 1689 databuf.len = 0; 1690 flags = 0; 1691 for (;;) { 1692 status = getmsg(fd, NULL, &databuf, &flags); 1693 if (status >= 0) { 1694 break; 1695 } else if (errno == EINTR) { 1696 continue; 1697 } else { 1698 logperror("update_router_list:" 1699 " getmsg(data)"); 1700 free(databuf.buf); 1701 return (_B_FALSE); 1702 } 1703 } 1704 1705 if (optp->level == MIB2_IP && 1706 optp->name == MIB2_IP_ROUTE) { 1707 /* LINTED */ 1708 ire_process_v4((mib2_ipRouteEntry_t *) 1709 databuf.buf, databuf.len); 1710 } else if (optp->level == MIB2_IP6 && 1711 optp->name == MIB2_IP6_ROUTE) { 1712 /* LINTED */ 1713 ire_process_v6((mib2_ipv6RouteEntry_t *) 1714 databuf.buf, databuf.len); 1715 } 1716 free(databuf.buf); 1717 } 1718 } 1719 /* NOTREACHED */ 1720 } 1721 1722 /* 1723 * Examine the IPv4 routing table, for default routers. For each default 1724 * router, populate the list of targets of each phyint that is on the same 1725 * link as the default router 1726 */ 1727 static void 1728 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1729 { 1730 mib2_ipRouteEntry_t *rp; 1731 mib2_ipRouteEntry_t *rp1; 1732 struct in_addr nexthop_v4; 1733 mib2_ipRouteEntry_t *endp; 1734 1735 if (len == 0) 1736 return; 1737 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1738 1739 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1740 1741 /* 1742 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1743 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1744 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1745 * This is a potential target for probing, which we try to add 1746 * to the list of probe targets. 1747 */ 1748 for (rp = buf; rp < endp; rp++) { 1749 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1750 continue; 1751 1752 /* Get the nexthop address. */ 1753 nexthop_v4.s_addr = rp->ipRouteNextHop; 1754 1755 /* 1756 * Get the nexthop address. Then determine the outgoing 1757 * interface, by examining all interface IREs, and picking the 1758 * match. We don't look at the interface specified in the route 1759 * because we need to add the router target on all matching 1760 * interfaces anyway; the goal is to avoid falling back to 1761 * multicast when some interfaces are in the same subnet but 1762 * not in the same group. 1763 */ 1764 for (rp1 = buf; rp1 < endp; rp1++) { 1765 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1766 continue; 1767 } 1768 1769 /* 1770 * Determine the interface IRE that matches the nexthop. 1771 * i.e. (IRE addr & IRE mask) == (nexthop & IRE mask) 1772 */ 1773 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1774 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1775 /* 1776 * We found the interface ire 1777 */ 1778 router_add_v4(rp1, nexthop_v4); 1779 } 1780 } 1781 } 1782 } 1783 1784 void 1785 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1786 { 1787 char *cp; 1788 char ifname[LIFNAMSIZ + 1]; 1789 struct in6_addr nexthop; 1790 int len; 1791 1792 if (debug & D_TARGET) 1793 logdebug("router_add_v4()\n"); 1794 1795 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1796 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1797 ifname[len] = '\0'; 1798 1799 if (ifname[0] == '\0') 1800 return; 1801 1802 cp = strchr(ifname, IF_SEPARATOR); 1803 if (cp != NULL) 1804 *cp = '\0'; 1805 1806 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1807 router_add_common(AF_INET, ifname, nexthop); 1808 } 1809 1810 void 1811 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1812 { 1813 struct phyint_instance *pii; 1814 struct phyint *pi; 1815 1816 if (debug & D_TARGET) 1817 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1818 1819 /* 1820 * Retrieve the phyint instance; bail if it's not known to us yet. 1821 */ 1822 pii = phyint_inst_lookup(af, ifname); 1823 if (pii == NULL) 1824 return; 1825 1826 /* 1827 * Don't use our own addresses as targets. 1828 */ 1829 if (own_address(pii->pii_af, nexthop)) 1830 return; 1831 1832 /* 1833 * If the phyint is part a named group, then add the address to all 1834 * members of the group; note that this is suboptimal in the IPv4 case 1835 * as it has already been added to all matching interfaces in 1836 * ire_process_v4(). Otherwise, add the address only to the phyint 1837 * itself, since other phyints in the anongroup may not be on the same 1838 * subnet. 1839 */ 1840 pi = pii->pii_phyint; 1841 if (pi->pi_group == phyint_anongroup) { 1842 target_add(pii, nexthop, _B_TRUE); 1843 } else { 1844 pi = pi->pi_group->pg_phyint; 1845 for (; pi != NULL; pi = pi->pi_pgnext) 1846 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1847 } 1848 } 1849 1850 /* 1851 * Examine the IPv6 routing table, for default routers. For each default 1852 * router, populate the list of targets of each phyint that is on the same 1853 * link as the default router 1854 */ 1855 static void 1856 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1857 { 1858 mib2_ipv6RouteEntry_t *rp; 1859 mib2_ipv6RouteEntry_t *endp; 1860 struct in6_addr nexthop_v6; 1861 1862 if (debug & D_TARGET) 1863 logdebug("ire_process_v6(len %d)\n", len); 1864 1865 if (len == 0) 1866 return; 1867 1868 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1869 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1870 1871 /* 1872 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1873 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1874 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1875 * This is a potential target for probing, which we try to add 1876 * to the list of probe targets. 1877 */ 1878 for (rp = buf; rp < endp; rp++) { 1879 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1880 continue; 1881 1882 /* 1883 * We have the outgoing interface in ipv6RouteIfIndex 1884 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1885 * interface must be present for link-local addresses. Since 1886 * we use only link-local addreses for probing, we don't 1887 * consider the case when the outgoing interface is not 1888 * known and we need to scan interface ires 1889 */ 1890 nexthop_v6 = rp->ipv6RouteNextHop; 1891 if (rp->ipv6RouteIfIndex.o_length != 0) { 1892 /* 1893 * We already have the outgoing interface 1894 * in ipv6RouteIfIndex. 1895 */ 1896 router_add_v6(rp, nexthop_v6); 1897 } 1898 } 1899 } 1900 1901 1902 void 1903 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1904 { 1905 char ifname[LIFNAMSIZ + 1]; 1906 char *cp; 1907 int len; 1908 1909 if (debug & D_TARGET) 1910 logdebug("router_add_v6()\n"); 1911 1912 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1913 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1914 ifname[len] = '\0'; 1915 1916 if (ifname[0] == '\0') 1917 return; 1918 1919 cp = strchr(ifname, IF_SEPARATOR); 1920 if (cp != NULL) 1921 *cp = '\0'; 1922 1923 router_add_common(AF_INET6, ifname, nexthop_v6); 1924 } 1925 1926 1927 1928 /* 1929 * Build a list of target routers, by scanning the routing tables. 1930 * It is assumed that interface routes exist, to reach the routers. 1931 */ 1932 static void 1933 init_router_targets(void) 1934 { 1935 struct target *tg; 1936 struct target *next_tg; 1937 struct phyint_instance *pii; 1938 struct phyint *pi; 1939 1940 if (force_mcast) 1941 return; 1942 1943 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1944 pi = pii->pii_phyint; 1945 /* 1946 * Exclude ptp and host targets. Set tg_in_use to false, 1947 * only for router targets. 1948 */ 1949 if (!pii->pii_targets_are_routers || 1950 (pi->pi_flags & IFF_POINTOPOINT)) 1951 continue; 1952 1953 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1954 tg->tg_in_use = 0; 1955 } 1956 1957 if (mibfd < 0) { 1958 mibfd = open("/dev/ip", O_RDWR); 1959 if (mibfd < 0) { 1960 logperror("mibopen: ip open"); 1961 exit(1); 1962 } 1963 } 1964 1965 if (!update_router_list(mibfd)) { 1966 (void) close(mibfd); 1967 mibfd = -1; 1968 } 1969 1970 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1971 if (!pii->pii_targets_are_routers || 1972 (pi->pi_flags & IFF_POINTOPOINT)) 1973 continue; 1974 1975 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1976 next_tg = tg->tg_next; 1977 if (!tg->tg_in_use) { 1978 target_delete(tg); 1979 } 1980 } 1981 } 1982 } 1983 1984 /* 1985 * Attempt to assign host targets to any interfaces that do not currently 1986 * have probe targets by sharing targets with other interfaces in the group. 1987 */ 1988 static void 1989 init_host_targets(void) 1990 { 1991 struct phyint_instance *pii; 1992 struct phyint_group *pg; 1993 1994 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1995 pg = pii->pii_phyint->pi_group; 1996 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1997 dup_host_targets(pii); 1998 } 1999 } 2000 2001 /* 2002 * Duplicate host targets from other phyints of the group to 2003 * the phyint instance 'desired_pii'. 2004 */ 2005 static void 2006 dup_host_targets(struct phyint_instance *desired_pii) 2007 { 2008 int af; 2009 struct phyint *pi; 2010 struct phyint_instance *pii; 2011 struct target *tg; 2012 2013 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2014 2015 af = desired_pii->pii_af; 2016 2017 /* 2018 * For every phyint in the same group as desired_pii, check if 2019 * it has any host targets. If so add them to desired_pii. 2020 */ 2021 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2022 pii = PHYINT_INSTANCE(pi, af); 2023 /* 2024 * We know that we don't have targets on this phyint instance 2025 * since we have been called. But we still check for 2026 * pii_targets_are_routers because another phyint instance 2027 * could have router targets, since IFF_NOFAILOVER addresses 2028 * on different phyint instances may belong to different 2029 * subnets. 2030 */ 2031 if ((pii == NULL) || (pii == desired_pii) || 2032 pii->pii_targets_are_routers) 2033 continue; 2034 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2035 target_create(desired_pii, tg->tg_address, _B_FALSE); 2036 } 2037 } 2038 } 2039 2040 static void 2041 usage(char *cmd) 2042 { 2043 (void) fprintf(stderr, "usage: %s\n", cmd); 2044 } 2045 2046 2047 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2048 2049 /* Get an option from the /etc/default/mpathd file */ 2050 static char * 2051 getdefault(char *name) 2052 { 2053 char namebuf[BUFSIZ]; 2054 char *value = NULL; 2055 2056 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2057 char *cp; 2058 int flags; 2059 2060 /* 2061 * ignore case 2062 */ 2063 flags = defcntl(DC_GETFLAGS, 0); 2064 TURNOFF(flags, DC_CASE); 2065 (void) defcntl(DC_SETFLAGS, flags); 2066 2067 /* Add "=" to the name */ 2068 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2069 (void) strncat(namebuf, "=", 2); 2070 2071 if ((cp = defread(namebuf)) != NULL) 2072 value = strdup(cp); 2073 2074 /* close */ 2075 (void) defopen((char *)NULL); 2076 } 2077 return (value); 2078 } 2079 2080 2081 /* 2082 * Command line options below 2083 */ 2084 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2085 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2086 static boolean_t adopt = _B_FALSE; 2087 static boolean_t foreground = _B_FALSE; 2088 2089 int 2090 main(int argc, char *argv[]) 2091 { 2092 int i; 2093 int c; 2094 struct phyint_instance *pii; 2095 char *value; 2096 2097 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2098 srandom(gethostid()); /* Initialize the random number generator */ 2099 2100 /* 2101 * NOTE: The messages output by in.mpathd are not suitable for 2102 * translation, so we do not call textdomain(). 2103 */ 2104 (void) setlocale(LC_ALL, ""); 2105 2106 /* 2107 * Get the user specified value of 'failure detection time' 2108 * from /etc/default/mpathd 2109 */ 2110 value = getdefault("FAILURE_DETECTION_TIME"); 2111 if (value != NULL) { 2112 user_failure_detection_time = 2113 (int)strtol((char *)value, NULL, 0); 2114 2115 if (user_failure_detection_time <= 0) { 2116 user_failure_detection_time = FAILURE_DETECTION_TIME; 2117 logerr("Invalid failure detection time %s, assuming " 2118 "default %d\n", value, user_failure_detection_time); 2119 2120 } else if (user_failure_detection_time < 2121 MIN_FAILURE_DETECTION_TIME) { 2122 user_failure_detection_time = 2123 MIN_FAILURE_DETECTION_TIME; 2124 logerr("Too small failure detection time of %s, " 2125 "assuming minimum %d\n", value, 2126 user_failure_detection_time); 2127 } 2128 free(value); 2129 } else { 2130 /* User has not specified the parameter, Use default value */ 2131 user_failure_detection_time = FAILURE_DETECTION_TIME; 2132 } 2133 2134 /* 2135 * This gives the frequency at which probes will be sent. 2136 * When fdt ms elapses, we should be able to determine 2137 * whether 5 consecutive probes have failed or not. 2138 * 1 probe will be sent in every user_probe_interval ms, 2139 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2140 * user_probe_interval. Thus when we send out probe 'n' we 2141 * can be sure that probe 'n - 2' is lost, if we have not 2142 * got the ack. (since the probe interval is > crtt). But 2143 * probe 'n - 1' may be a valid unacked probe, since the 2144 * time between 2 successive probes could be as small as 2145 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 2146 */ 2147 user_probe_interval = user_failure_detection_time / 2148 (NUM_PROBE_FAILS + 2); 2149 2150 /* 2151 * Get the user specified value of failback_enabled from 2152 * /etc/default/mpathd 2153 */ 2154 value = getdefault("FAILBACK"); 2155 if (value != NULL) { 2156 if (strncasecmp(value, "yes", 3) == 0) 2157 failback_enabled = _B_TRUE; 2158 else if (strncasecmp(value, "no", 2) == 0) 2159 failback_enabled = _B_FALSE; 2160 else 2161 logerr("Invalid value for FAILBACK %s\n", value); 2162 free(value); 2163 } else { 2164 failback_enabled = _B_TRUE; 2165 } 2166 2167 /* 2168 * Get the user specified value of track_all_phyints from 2169 * /etc/default/mpathd. The sense is reversed in 2170 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2171 */ 2172 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2173 if (value != NULL) { 2174 if (strncasecmp(value, "yes", 3) == 0) 2175 track_all_phyints = _B_FALSE; 2176 else if (strncasecmp(value, "no", 2) == 0) 2177 track_all_phyints = _B_TRUE; 2178 else 2179 logerr("Invalid value for " 2180 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2181 free(value); 2182 } else { 2183 track_all_phyints = _B_FALSE; 2184 } 2185 2186 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2187 switch (c) { 2188 case 'a': 2189 adopt = _B_TRUE; 2190 break; 2191 case 'm': 2192 force_mcast = _B_TRUE; 2193 break; 2194 case 'd': 2195 debug = D_ALL; 2196 foreground = _B_TRUE; 2197 break; 2198 case 'D': 2199 i = (int)strtol(optarg, NULL, 0); 2200 if (i == 0) { 2201 (void) fprintf(stderr, "Bad debug flags: %s\n", 2202 optarg); 2203 exit(1); 2204 } 2205 debug |= i; 2206 foreground = _B_TRUE; 2207 break; 2208 case 'l': 2209 /* 2210 * Turn off link state notification handling. 2211 * Undocumented command line flag, for debugging 2212 * purposes. 2213 */ 2214 handle_link_notifications = _B_FALSE; 2215 break; 2216 default: 2217 usage(argv[0]); 2218 exit(1); 2219 } 2220 } 2221 2222 /* 2223 * The sockets for the loopback command interface should be listening 2224 * before we fork and exit in daemonize(). This way, whoever started us 2225 * can use the loopback interface as soon as they get a zero exit 2226 * status. 2227 */ 2228 lsock_v4 = setup_listener(AF_INET); 2229 lsock_v6 = setup_listener(AF_INET6); 2230 2231 if (lsock_v4 < 0 && lsock_v6 < 0) { 2232 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2233 exit(1); 2234 } 2235 2236 if (!foreground) { 2237 if (!daemonize()) { 2238 logerr("cannot daemonize\n"); 2239 exit(EXIT_FAILURE); 2240 } 2241 initlog(); 2242 } 2243 2244 /* 2245 * Initializations: 2246 * 1. Create ifsock* sockets. These are used for performing SIOC* 2247 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2248 * 2. Initialize a pipe for handling/recording signal events. 2249 * 3. Create the routing sockets, used for listening 2250 * to routing / interface changes. 2251 * 4. phyint_init() - Initialize physical interface state 2252 * (in mpd_tables.c). Must be done before creating interfaces, 2253 * which timer_init() does indirectly. 2254 * 5. timer_init() - Initialize timer related stuff 2255 * 6. initifs() - Initialize our database of all known interfaces 2256 * 7. init_router_targets() - Initialize our database of all known 2257 * router targets. 2258 */ 2259 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2260 if (ifsock_v4 < 0) { 2261 logperror("main: IPv4 socket open"); 2262 exit(1); 2263 } 2264 2265 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2266 if (ifsock_v6 < 0) { 2267 logperror("main: IPv6 socket open"); 2268 exit(1); 2269 } 2270 2271 setup_eventpipe(); 2272 2273 rtsock_v4 = setup_rtsock(AF_INET); 2274 rtsock_v6 = setup_rtsock(AF_INET6); 2275 2276 if (phyint_init() == -1) { 2277 logerr("cannot initialize physical interface structures"); 2278 exit(1); 2279 } 2280 2281 timer_init(); 2282 2283 initifs(); 2284 2285 /* Inform kernel whether failback is enabled or disabled */ 2286 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2287 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2288 exit(1); 2289 } 2290 2291 /* 2292 * If we're operating in "adopt" mode and no interfaces need to be 2293 * tracked, shut down (ifconfig(1M) will restart us on demand if 2294 * interfaces are subsequently put into multipathing groups). 2295 */ 2296 if (adopt && phyint_instances == NULL) 2297 exit(0); 2298 2299 /* 2300 * Main body. Keep listening for activity on any of the sockets 2301 * that we are monitoring and take appropriate action as necessary. 2302 * signals are also handled synchronously. 2303 */ 2304 for (;;) { 2305 if (poll(pollfds, pollfd_num, -1) < 0) { 2306 if (errno == EINTR) 2307 continue; 2308 logperror("main: poll"); 2309 exit(1); 2310 } 2311 for (i = 0; i < pollfd_num; i++) { 2312 if ((pollfds[i].fd == -1) || 2313 !(pollfds[i].revents & POLLIN)) 2314 continue; 2315 if (pollfds[i].fd == eventpipe_read) { 2316 in_signal(eventpipe_read); 2317 break; 2318 } 2319 if (pollfds[i].fd == rtsock_v4 || 2320 pollfds[i].fd == rtsock_v6) { 2321 process_rtsock(rtsock_v4, rtsock_v6); 2322 break; 2323 } 2324 for (pii = phyint_instances; pii != NULL; 2325 pii = pii->pii_next) { 2326 if (pollfds[i].fd == pii->pii_probe_sock) { 2327 if (pii->pii_af == AF_INET) 2328 in_data(pii); 2329 else 2330 in6_data(pii); 2331 break; 2332 } 2333 } 2334 if (pollfds[i].fd == lsock_v4) 2335 loopback_cmd(lsock_v4, AF_INET); 2336 else if (pollfds[i].fd == lsock_v6) 2337 loopback_cmd(lsock_v6, AF_INET6); 2338 } 2339 if (full_scan_required) { 2340 initifs(); 2341 full_scan_required = _B_FALSE; 2342 } 2343 } 2344 /* NOTREACHED */ 2345 return (EXIT_SUCCESS); 2346 } 2347 2348 static int 2349 setup_listener(int af) 2350 { 2351 int sock; 2352 int on; 2353 int len; 2354 int ret; 2355 struct sockaddr_storage laddr; 2356 struct sockaddr_in *sin; 2357 struct sockaddr_in6 *sin6; 2358 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2359 2360 assert(af == AF_INET || af == AF_INET6); 2361 2362 sock = socket(af, SOCK_STREAM, 0); 2363 if (sock < 0) { 2364 logperror("setup_listener: socket"); 2365 exit(1); 2366 } 2367 2368 on = 1; 2369 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2370 sizeof (on)) < 0) { 2371 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2372 exit(1); 2373 } 2374 2375 bzero(&laddr, sizeof (laddr)); 2376 laddr.ss_family = af; 2377 2378 if (af == AF_INET) { 2379 sin = (struct sockaddr_in *)&laddr; 2380 sin->sin_port = htons(MPATHD_PORT); 2381 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2382 len = sizeof (struct sockaddr_in); 2383 } else { 2384 sin6 = (struct sockaddr_in6 *)&laddr; 2385 sin6->sin6_port = htons(MPATHD_PORT); 2386 sin6->sin6_addr = loopback_addr; 2387 len = sizeof (struct sockaddr_in6); 2388 } 2389 2390 ret = bind(sock, (struct sockaddr *)&laddr, len); 2391 if (ret < 0) { 2392 if (errno == EADDRINUSE) { 2393 /* 2394 * Another instance of mpathd may be already active. 2395 */ 2396 logerr("main: is another instance of in.mpathd " 2397 "already active?\n"); 2398 exit(1); 2399 } else { 2400 (void) close(sock); 2401 return (-1); 2402 } 2403 } 2404 if (listen(sock, 30) < 0) { 2405 logperror("main: listen"); 2406 exit(1); 2407 } 2408 if (poll_add(sock) == -1) { 2409 (void) close(sock); 2410 exit(1); 2411 } 2412 2413 return (sock); 2414 } 2415 2416 /* 2417 * Table of commands and their expected size; used by loopback_cmd(). 2418 */ 2419 static struct { 2420 const char *name; 2421 unsigned int size; 2422 } commands[] = { 2423 { "MI_PING", sizeof (uint32_t) }, 2424 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2425 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2426 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2427 { "MI_QUERY", sizeof (mi_query_t) } 2428 }; 2429 2430 /* 2431 * Commands received over the loopback interface come here. Currently 2432 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2433 * module. ifconfig only makes a connection, and closes it to check if 2434 * in.mpathd is running. 2435 * if_mpadm sends commands in the format specified by the mpathd_interface 2436 * structure. 2437 */ 2438 static void 2439 loopback_cmd(int sock, int family) 2440 { 2441 int newfd; 2442 ssize_t len; 2443 struct sockaddr_storage peer; 2444 struct sockaddr_in *peer_sin; 2445 struct sockaddr_in6 *peer_sin6; 2446 socklen_t peerlen; 2447 union mi_commands mpi; 2448 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2449 char abuf[INET6_ADDRSTRLEN]; 2450 uint_t cmd; 2451 int retval; 2452 2453 peerlen = sizeof (peer); 2454 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2455 if (newfd < 0) { 2456 logperror("loopback_cmd: accept"); 2457 return; 2458 } 2459 2460 switch (family) { 2461 case AF_INET: 2462 /* 2463 * Validate the address and port to make sure that 2464 * non privileged processes don't connect and start 2465 * talking to us. 2466 */ 2467 if (peerlen != sizeof (struct sockaddr_in)) { 2468 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2469 (void) close(newfd); 2470 return; 2471 } 2472 peer_sin = (struct sockaddr_in *)&peer; 2473 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2474 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2475 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2476 abuf, sizeof (abuf)); 2477 logerr("Attempt to connect from addr %s port %d\n", 2478 abuf, ntohs(peer_sin->sin_port)); 2479 (void) close(newfd); 2480 return; 2481 } 2482 break; 2483 2484 case AF_INET6: 2485 if (peerlen != sizeof (struct sockaddr_in6)) { 2486 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2487 (void) close(newfd); 2488 return; 2489 } 2490 /* 2491 * Validate the address and port to make sure that 2492 * non privileged processes don't connect and start 2493 * talking to us. 2494 */ 2495 peer_sin6 = (struct sockaddr_in6 *)&peer; 2496 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2497 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2498 &loopback_addr))) { 2499 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2500 sizeof (abuf)); 2501 logerr("Attempt to connect from addr %s port %d\n", 2502 abuf, ntohs(peer_sin6->sin6_port)); 2503 (void) close(newfd); 2504 return; 2505 } 2506 2507 default: 2508 logdebug("loopback_cmd: family %d\n", family); 2509 (void) close(newfd); 2510 return; 2511 } 2512 2513 /* 2514 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2515 * all supported commands 2516 */ 2517 len = read(newfd, &mpi, sizeof (mpi)); 2518 2519 /* 2520 * ifconfig does not send any data. Just tests to see if mpathd 2521 * is already running. 2522 */ 2523 if (len <= 0) { 2524 (void) close(newfd); 2525 return; 2526 } 2527 2528 /* 2529 * In theory, we can receive any sized message for a stream socket, 2530 * but we don't expect that to happen for a small message over a 2531 * loopback connection. 2532 */ 2533 if (len < sizeof (uint32_t)) { 2534 logerr("loopback_cmd: bad command format or read returns " 2535 "partial data %d\n", len); 2536 } 2537 2538 cmd = mpi.mi_command; 2539 if (cmd >= MI_NCMD) { 2540 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2541 (void) close(newfd); 2542 return; 2543 } 2544 2545 if (len < commands[cmd].size) { 2546 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2547 commands[cmd].name, commands[cmd].size, len); 2548 (void) close(newfd); 2549 return; 2550 } 2551 2552 retval = process_cmd(newfd, &mpi); 2553 if (retval != IPMP_SUCCESS) { 2554 logerr("failed processing %s: %s\n", commands[cmd].name, 2555 ipmp_errmsg(retval)); 2556 } 2557 (void) close(newfd); 2558 } 2559 2560 extern int global_errno; /* set by failover() or failback() */ 2561 2562 /* 2563 * Process the offline, undo offline and set original index commands, 2564 * received from if_mpadm(1M) 2565 */ 2566 static unsigned int 2567 process_cmd(int newfd, union mi_commands *mpi) 2568 { 2569 uint_t nif = 0; 2570 uint32_t cmd; 2571 struct phyint *pi; 2572 struct phyint *pi2; 2573 struct phyint_group *pg; 2574 boolean_t success; 2575 int error; 2576 struct mi_offline *mio; 2577 struct mi_undo_offline *miu; 2578 struct lifreq lifr; 2579 int ifsock; 2580 struct mi_setoindex *mis; 2581 2582 cmd = mpi->mi_command; 2583 2584 switch (cmd) { 2585 case MI_OFFLINE: 2586 mio = &mpi->mi_ocmd; 2587 /* 2588 * Lookup the interface that needs to be offlined. 2589 * If it does not exist, return a suitable error. 2590 */ 2591 pi = phyint_lookup(mio->mio_ifname); 2592 if (pi == NULL) 2593 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2594 2595 /* 2596 * Verify that the minimum redundancy requirements are met. 2597 * The multipathing group must have at least the specified 2598 * number of functional interfaces after offlining the 2599 * requested interface. Otherwise return a suitable error. 2600 */ 2601 pg = pi->pi_group; 2602 nif = 0; 2603 if (pg != phyint_anongroup) { 2604 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2605 pi2 = pi2->pi_pgnext) { 2606 if ((pi2->pi_state == PI_RUNNING) || 2607 (pg->pg_groupfailed && 2608 !(pi2->pi_flags & IFF_OFFLINE))) 2609 nif++; 2610 } 2611 } 2612 if (nif < mio->mio_min_redundancy) 2613 return (send_result(newfd, IPMP_EMINRED, 0)); 2614 2615 /* 2616 * The order of operation is to set IFF_OFFLINE, followed by 2617 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2618 * can be created. Subsequent failover moves everything on 2619 * the OFFLINE interface to some other functional interface. 2620 */ 2621 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2622 if (success) { 2623 if (!pi->pi_empty) { 2624 error = try_failover(pi, FAILOVER_NORMAL); 2625 if (error != 0) { 2626 if (!change_lif_flags(pi, IFF_OFFLINE, 2627 _B_FALSE)) { 2628 logerr("process_cmd: couldn't" 2629 " clear OFFLINE flag on" 2630 " %s\n", pi->pi_name); 2631 /* 2632 * Offline interfaces should 2633 * not be probed. 2634 */ 2635 stop_probing(pi); 2636 } 2637 return (send_result(newfd, error, 2638 global_errno)); 2639 } 2640 } 2641 } else { 2642 return (send_result(newfd, IPMP_FAILURE, errno)); 2643 } 2644 2645 /* 2646 * The interface is now Offline, so stop probing it. 2647 * Note that if_mpadm(1M) will down the test addresses, 2648 * after receiving a success reply from us. The routing 2649 * socket message will then make us close the socket used 2650 * for sending probes. But it is more logical that an 2651 * offlined interface must not be probed, even if it has 2652 * test addresses. 2653 */ 2654 stop_probing(pi); 2655 return (send_result(newfd, IPMP_SUCCESS, 0)); 2656 2657 case MI_UNDO_OFFLINE: 2658 miu = &mpi->mi_ucmd; 2659 /* 2660 * Undo the offline command. As usual lookup the interface. 2661 * Send an error if it does not exist. 2662 */ 2663 pi = phyint_lookup(miu->miu_ifname); 2664 if (pi == NULL) 2665 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2666 2667 /* 2668 * Inverse of the offline operation. Do a failback, and then 2669 * clear the IFF_OFFLINE flag. 2670 */ 2671 error = do_failback(pi, _B_TRUE); 2672 if (error == IPMP_EFBPARTIAL) 2673 return (send_result(newfd, IPMP_EFBPARTIAL, 0)); 2674 error = do_failback(pi, _B_FALSE); 2675 2676 switch (error) { 2677 case IPMP_SUCCESS: 2678 if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) { 2679 logdebug("undo error %X\n", global_errno); 2680 error = IPMP_FAILURE; 2681 break; 2682 } 2683 /* FALLTHROUGH */ 2684 2685 case IPMP_EFBPARTIAL: 2686 /* 2687 * Reset the state of the interface based on the 2688 * current link state; if this phyint subsequently 2689 * acquires a test address, the state will be changed 2690 * again later as a result of the probes. 2691 */ 2692 if (LINK_UP(pi)) 2693 phyint_chstate(pi, PI_RUNNING); 2694 else 2695 phyint_chstate(pi, PI_FAILED); 2696 break; 2697 2698 case IPMP_FAILURE: 2699 break; 2700 2701 default: 2702 logdebug("do_failback: unexpected return value\n"); 2703 break; 2704 } 2705 return (send_result(newfd, error, global_errno)); 2706 2707 case MI_SETOINDEX: 2708 mis = &mpi->mi_scmd; 2709 2710 /* Get the socket for doing ioctls */ 2711 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2712 2713 /* 2714 * Get index of new original interface. 2715 * The index is returned in lifr.lifr_index. 2716 */ 2717 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2718 sizeof (lifr.lifr_name)); 2719 2720 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2721 return (send_result(newfd, IPMP_FAILURE, errno)); 2722 2723 /* 2724 * Set new original interface index. 2725 * The new index was put into lifr.lifr_index by the 2726 * SIOCGLIFINDEX ioctl. 2727 */ 2728 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2729 sizeof (lifr.lifr_name)); 2730 2731 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2732 return (send_result(newfd, IPMP_FAILURE, errno)); 2733 2734 return (send_result(newfd, IPMP_SUCCESS, 0)); 2735 2736 case MI_QUERY: 2737 return (process_query(newfd, &mpi->mi_qcmd)); 2738 2739 default: 2740 break; 2741 } 2742 2743 return (send_result(newfd, IPMP_EPROTO, 0)); 2744 } 2745 2746 /* 2747 * Process the query request pointed to by `miq' and send a reply on file 2748 * descriptor `fd'. Returns an IPMP error code. 2749 */ 2750 static unsigned int 2751 process_query(int fd, mi_query_t *miq) 2752 { 2753 ipmp_groupinfo_t *grinfop; 2754 ipmp_groupinfolist_t *grlp; 2755 ipmp_grouplist_t *grlistp; 2756 ipmp_ifinfo_t *ifinfop; 2757 ipmp_ifinfolist_t *iflp; 2758 ipmp_snap_t *snap; 2759 unsigned int retval; 2760 2761 switch (miq->miq_inforeq) { 2762 case IPMP_GROUPLIST: 2763 retval = getgrouplist(&grlistp); 2764 if (retval != IPMP_SUCCESS) 2765 return (send_result(fd, retval, errno)); 2766 2767 retval = send_result(fd, IPMP_SUCCESS, 0); 2768 if (retval == IPMP_SUCCESS) 2769 retval = send_grouplist(fd, grlistp); 2770 2771 ipmp_freegrouplist(grlistp); 2772 return (retval); 2773 2774 case IPMP_GROUPINFO: 2775 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2776 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2777 if (retval != IPMP_SUCCESS) 2778 return (send_result(fd, retval, errno)); 2779 2780 retval = send_result(fd, IPMP_SUCCESS, 0); 2781 if (retval == IPMP_SUCCESS) 2782 retval = send_groupinfo(fd, grinfop); 2783 2784 ipmp_freegroupinfo(grinfop); 2785 return (retval); 2786 2787 case IPMP_IFINFO: 2788 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2789 retval = getifinfo(miq->miq_ifname, &ifinfop); 2790 if (retval != IPMP_SUCCESS) 2791 return (send_result(fd, retval, errno)); 2792 2793 retval = send_result(fd, IPMP_SUCCESS, 0); 2794 if (retval == IPMP_SUCCESS) 2795 retval = send_ifinfo(fd, ifinfop); 2796 2797 ipmp_freeifinfo(ifinfop); 2798 return (retval); 2799 2800 case IPMP_SNAP: 2801 retval = getsnap(&snap); 2802 if (retval != IPMP_SUCCESS) 2803 return (send_result(fd, retval, errno)); 2804 2805 retval = send_result(fd, IPMP_SUCCESS, 0); 2806 if (retval != IPMP_SUCCESS) 2807 goto out; 2808 2809 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2810 if (retval != IPMP_SUCCESS) 2811 goto out; 2812 2813 retval = send_grouplist(fd, snap->sn_grlistp); 2814 if (retval != IPMP_SUCCESS) 2815 goto out; 2816 2817 iflp = snap->sn_ifinfolistp; 2818 for (; iflp != NULL; iflp = iflp->ifl_next) { 2819 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2820 if (retval != IPMP_SUCCESS) 2821 goto out; 2822 } 2823 2824 grlp = snap->sn_grinfolistp; 2825 for (; grlp != NULL; grlp = grlp->grl_next) { 2826 retval = send_groupinfo(fd, grlp->grl_grinfop); 2827 if (retval != IPMP_SUCCESS) 2828 goto out; 2829 } 2830 out: 2831 ipmp_snap_free(snap); 2832 return (retval); 2833 2834 default: 2835 break; 2836 2837 } 2838 return (send_result(fd, IPMP_EPROTO, 0)); 2839 } 2840 2841 /* 2842 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2843 * Returns an IPMP error code. 2844 */ 2845 static unsigned int 2846 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2847 { 2848 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2849 unsigned int retval; 2850 2851 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2852 if (retval != IPMP_SUCCESS) 2853 return (retval); 2854 2855 return (ipmp_writetlv(fd, IPMP_IFLIST, 2856 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2857 } 2858 2859 /* 2860 * Send the interface information pointed to by `ifinfop' on file descriptor 2861 * `fd'. Returns an IPMP error code. 2862 */ 2863 static unsigned int 2864 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2865 { 2866 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2867 } 2868 2869 /* 2870 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2871 * Returns an IPMP error code. 2872 */ 2873 static unsigned int 2874 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2875 { 2876 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2877 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2878 } 2879 2880 /* 2881 * Initialize an mi_result_t structure using `error' and `syserror' and 2882 * send it on file descriptor `fd'. Returns an IPMP error code. 2883 */ 2884 static unsigned int 2885 send_result(int fd, unsigned int error, int syserror) 2886 { 2887 mi_result_t me; 2888 2889 me.me_mpathd_error = error; 2890 if (error == IPMP_FAILURE) 2891 me.me_sys_error = syserror; 2892 else 2893 me.me_sys_error = 0; 2894 2895 return (ipmp_write(fd, &me, sizeof (me))); 2896 } 2897 2898 /* 2899 * Daemonize the process. 2900 */ 2901 static boolean_t 2902 daemonize(void) 2903 { 2904 switch (fork()) { 2905 case -1: 2906 return (_B_FALSE); 2907 2908 case 0: 2909 /* 2910 * Lose our controlling terminal, and become both a session 2911 * leader and a process group leader. 2912 */ 2913 if (setsid() == -1) 2914 return (_B_FALSE); 2915 2916 /* 2917 * Under POSIX, a session leader can accidentally (through 2918 * open(2)) acquire a controlling terminal if it does not 2919 * have one. Just to be safe, fork() again so we are not a 2920 * session leader. 2921 */ 2922 switch (fork()) { 2923 case -1: 2924 return (_B_FALSE); 2925 2926 case 0: 2927 (void) chdir("/"); 2928 (void) umask(022); 2929 (void) fdwalk(closefunc, NULL); 2930 break; 2931 2932 default: 2933 _exit(EXIT_SUCCESS); 2934 } 2935 break; 2936 2937 default: 2938 _exit(EXIT_SUCCESS); 2939 } 2940 2941 return (_B_TRUE); 2942 } 2943 2944 /* 2945 * The parent has created some fds before forking on purpose, keep them open. 2946 */ 2947 static int 2948 closefunc(void *not_used, int fd) 2949 /* ARGSUSED */ 2950 { 2951 if (fd != lsock_v4 && fd != lsock_v6) 2952 (void) close(fd); 2953 return (0); 2954 } 2955 2956 /* LOGGER */ 2957 2958 #include <syslog.h> 2959 2960 /* 2961 * Logging routines. All routines log to syslog, unless the daemon is 2962 * running in the foreground, in which case the logging goes to stderr. 2963 * 2964 * The following routines are available: 2965 * 2966 * logdebug(): A printf-like function for outputting debug messages 2967 * (messages at LOG_DEBUG) that are only of use to developers. 2968 * 2969 * logtrace(): A printf-like function for outputting tracing messages 2970 * (messages at LOG_INFO) from the daemon. This is typically used 2971 * to log the receipt of interesting network-related conditions. 2972 * 2973 * logerr(): A printf-like function for outputting error messages 2974 * (messages at LOG_ERR) from the daemon. 2975 * 2976 * logperror*(): A set of functions used to output error messages 2977 * (messages at LOG_ERR); these automatically append strerror(errno) 2978 * and a newline to the message passed to them. 2979 * 2980 * NOTE: since the logging functions write to syslog, the messages passed 2981 * to them are not eligible for localization. Thus, gettext() must 2982 * *not* be used. 2983 */ 2984 2985 static int logging = 0; 2986 2987 static void 2988 initlog(void) 2989 { 2990 logging++; 2991 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 2992 } 2993 2994 /* PRINTFLIKE1 */ 2995 void 2996 logerr(char *fmt, ...) 2997 { 2998 va_list ap; 2999 3000 va_start(ap, fmt); 3001 3002 if (logging) 3003 vsyslog(LOG_ERR, fmt, ap); 3004 else 3005 (void) vfprintf(stderr, fmt, ap); 3006 va_end(ap); 3007 } 3008 3009 /* PRINTFLIKE1 */ 3010 void 3011 logtrace(char *fmt, ...) 3012 { 3013 va_list ap; 3014 3015 va_start(ap, fmt); 3016 3017 if (logging) 3018 vsyslog(LOG_INFO, fmt, ap); 3019 else 3020 (void) vfprintf(stderr, fmt, ap); 3021 va_end(ap); 3022 } 3023 3024 /* PRINTFLIKE1 */ 3025 void 3026 logdebug(char *fmt, ...) 3027 { 3028 va_list ap; 3029 3030 va_start(ap, fmt); 3031 3032 if (logging) 3033 vsyslog(LOG_DEBUG, fmt, ap); 3034 else 3035 (void) vfprintf(stderr, fmt, ap); 3036 va_end(ap); 3037 } 3038 3039 /* PRINTFLIKE1 */ 3040 void 3041 logperror(char *str) 3042 { 3043 if (logging) 3044 syslog(LOG_ERR, "%s: %m\n", str); 3045 else 3046 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3047 } 3048 3049 void 3050 logperror_pii(struct phyint_instance *pii, char *str) 3051 { 3052 if (logging) { 3053 syslog(LOG_ERR, "%s (%s %s): %m\n", 3054 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3055 } else { 3056 (void) fprintf(stderr, "%s (%s %s): %s\n", 3057 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3058 strerror(errno)); 3059 } 3060 } 3061 3062 void 3063 logperror_li(struct logint *li, char *str) 3064 { 3065 struct phyint_instance *pii = li->li_phyint_inst; 3066 3067 if (logging) { 3068 syslog(LOG_ERR, "%s (%s %s): %m\n", 3069 str, AF_STR(pii->pii_af), li->li_name); 3070 } else { 3071 (void) fprintf(stderr, "%s (%s %s): %s\n", 3072 str, AF_STR(pii->pii_af), li->li_name, 3073 strerror(errno)); 3074 } 3075 } 3076 3077 void 3078 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3079 { 3080 if (polled) 3081 (void) poll_remove(pii->pii_probe_sock); 3082 (void) close(pii->pii_probe_sock); 3083 pii->pii_probe_sock = -1; 3084 pii->pii_basetime_inited = 0; 3085 } 3086