1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "mpd_defs.h" 29 #include "mpd_tables.h" 30 31 int debug = 0; /* Debug flag */ 32 static int pollfd_num = 0; /* Num. 
of poll descriptors */ 33 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 34 35 /* All times below in ms */ 36 int user_failure_detection_time; /* user specified failure detection */ 37 /* time (fdt) */ 38 int user_probe_interval; /* derived from user specified fdt */ 39 40 static int rtsock_v4; /* AF_INET routing socket */ 41 static int rtsock_v6; /* AF_INET6 routing socket */ 42 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 43 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 44 static int lsock_v4; /* Listen socket to detect mpathd */ 45 static int lsock_v6; /* Listen socket to detect mpathd */ 46 static int mibfd = -1; /* fd to get mib info */ 47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 48 49 boolean_t full_scan_required = _B_FALSE; 50 static uint_t last_initifs_time; /* Time when initifs was last run */ 51 static char **argv0; /* Saved for re-exec on SIGHUP */ 52 boolean_t handle_link_notifications = _B_TRUE; 53 54 static void initlog(void); 55 static void run_timeouts(void); 56 static void initifs(void); 57 static void check_if_removed(struct phyint_instance *pii); 58 static void select_test_ifs(void); 59 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 60 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 61 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 62 struct in_addr nexthop_v4); 63 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 64 struct in6_addr nexthop_v6); 65 static void router_add_common(int af, char *ifname, 66 struct in6_addr nexthop); 67 static void init_router_targets(); 68 static void cleanup(void); 69 static int setup_listener(int af); 70 static void check_config(void); 71 static void check_addr_unique(int af, char *name); 72 static void init_host_targets(void); 73 static void dup_host_targets(struct phyint_instance *desired_pii); 74 static void loopback_cmd(int sock, int family); 75 static int poll_remove(int fd); 76 static boolean_t 
daemonize(void); 77 static int closefunc(void *, int); 78 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 79 static unsigned int process_query(int fd, mi_query_t *miq); 80 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 81 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 82 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 83 static unsigned int send_result(int fd, unsigned int error, int syserror); 84 85 struct local_addr *laddr_list = NULL; 86 87 /* 88 * Return the current time in milliseconds (from an arbitrary reference) 89 * truncated to fit into an int. Truncation is ok since we are interested 90 * only in differences and not the absolute values. 91 */ 92 uint_t 93 getcurrenttime(void) 94 { 95 uint_t cur_time; /* In ms */ 96 97 /* 98 * Use of a non-user-adjustable source of time is 99 * required. However millisecond precision is sufficient. 100 * divide by 10^6 101 */ 102 cur_time = (uint_t)(gethrtime() / 1000000LL); 103 return (cur_time); 104 } 105 106 /* 107 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 
108 */ 109 int 110 poll_add(int fd) 111 { 112 int i; 113 int new_num; 114 struct pollfd *newfds; 115 retry: 116 /* Check if already present */ 117 for (i = 0; i < pollfd_num; i++) { 118 if (pollfds[i].fd == fd) 119 return (0); 120 } 121 /* Check for empty spot already present */ 122 for (i = 0; i < pollfd_num; i++) { 123 if (pollfds[i].fd == -1) { 124 pollfds[i].fd = fd; 125 return (0); 126 } 127 } 128 129 /* Allocate space for 32 more fds and initialize to -1 */ 130 new_num = pollfd_num + 32; 131 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 132 if (newfds == NULL) { 133 logperror("poll_add: realloc"); 134 return (-1); 135 } 136 for (i = pollfd_num; i < new_num; i++) { 137 newfds[i].fd = -1; 138 newfds[i].events = POLLIN; 139 } 140 pollfd_num = new_num; 141 pollfds = newfds; 142 goto retry; 143 } 144 145 /* 146 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 147 */ 148 static int 149 poll_remove(int fd) 150 { 151 int i; 152 153 /* Check if already present */ 154 for (i = 0; i < pollfd_num; i++) { 155 if (pollfds[i].fd == fd) { 156 pollfds[i].fd = -1; 157 return (0); 158 } 159 } 160 return (-1); 161 } 162 163 /* 164 * Extract information about the phyint instance. If the phyint instance still 165 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 166 * will use it to detect phyint instances that don't exist any longer and 167 * remove them, from our database of phyint instances. 
168 * Return value: 169 * returns true if the phyint instance exists in the kernel, 170 * returns false otherwise 171 */ 172 static boolean_t 173 pii_process(int af, char *name, struct phyint_instance **pii_p) 174 { 175 int err; 176 struct phyint_instance *pii; 177 struct phyint_instance *pii_other; 178 179 if (debug & D_PHYINT) 180 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 181 182 pii = phyint_inst_lookup(af, name); 183 if (pii == NULL) { 184 /* 185 * Phyint instance does not exist in our tables, 186 * create new phyint instance 187 */ 188 pii = phyint_inst_init_from_k(af, name); 189 } else { 190 /* Phyint exists in our tables */ 191 err = phyint_inst_update_from_k(pii); 192 193 switch (err) { 194 case PI_IOCTL_ERROR: 195 /* Some ioctl error. don't change anything */ 196 pii->pii_in_use = 1; 197 break; 198 199 case PI_GROUP_CHANGED: 200 /* 201 * The phyint has changed group. 202 */ 203 restore_phyint(pii->pii_phyint); 204 /* FALLTHRU */ 205 206 case PI_IFINDEX_CHANGED: 207 /* 208 * Interface index has changed. Delete and 209 * recreate the phyint as it is quite likely 210 * the interface has been unplumbed and replumbed. 211 */ 212 pii_other = phyint_inst_other(pii); 213 if (pii_other != NULL) 214 phyint_inst_delete(pii_other); 215 phyint_inst_delete(pii); 216 pii = phyint_inst_init_from_k(af, name); 217 break; 218 219 case PI_DELETED: 220 /* Phyint instance has disappeared from kernel */ 221 pii->pii_in_use = 0; 222 break; 223 224 case PI_OK: 225 /* Phyint instance exists and is fine */ 226 pii->pii_in_use = 1; 227 break; 228 229 default: 230 /* Unknown status */ 231 logerr("pii_process: Unknown status %d\n", err); 232 break; 233 } 234 } 235 236 *pii_p = pii; 237 if (pii != NULL) 238 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 239 else 240 return (_B_FALSE); 241 } 242 243 /* 244 * This phyint is leaving the group. Try to restore the phyint to its 245 * initial state. 
Return the addresses that belong to other group members, 246 * to the group, and take back any addresses owned by this phyint 247 */ 248 void 249 restore_phyint(struct phyint *pi) 250 { 251 if (pi->pi_group == phyint_anongroup) 252 return; 253 254 /* 255 * Move everthing to some other member in the group. 256 * The phyint has changed group in the kernel. But we 257 * have yet to do it in our tables. 258 */ 259 if (!pi->pi_empty) 260 (void) try_failover(pi, FAILOVER_TO_ANY); 261 /* 262 * Move all addresses owned by 'pi' back to pi, from each 263 * of the other members of the group 264 */ 265 (void) try_failback(pi, _B_FALSE); 266 } 267 268 /* 269 * Scan all interfaces to detect changes as well as new and deleted interfaces 270 */ 271 static void 272 initifs() 273 { 274 int n; 275 int af; 276 char *cp; 277 char *buf; 278 int numifs; 279 struct lifnum lifn; 280 struct lifconf lifc; 281 struct lifreq *lifr; 282 struct logint *li; 283 struct phyint_instance *pii; 284 struct phyint_instance *next_pii; 285 char pi_name[LIFNAMSIZ + 1]; 286 boolean_t exists; 287 struct phyint *pi; 288 struct local_addr *next; 289 290 if (debug & D_PHYINT) 291 logdebug("initifs: Scanning interfaces\n"); 292 293 last_initifs_time = getcurrenttime(); 294 295 /* 296 * Free the laddr_list before collecting the local addresses. 297 */ 298 while (laddr_list != NULL) { 299 next = laddr_list->next; 300 free(laddr_list); 301 laddr_list = next; 302 } 303 304 /* 305 * Mark the interfaces so that we can find phyints and logints 306 * which have disappeared from the kernel. pii_process() and 307 * logint_init_from_k() will set {pii,li}_in_use when they find 308 * the interface in the kernel. Also, clear dupaddr bit on probe 309 * logint. check_addr_unique() will set the dupaddr bit on the 310 * probe logint, if the testaddress is not unique. 
311 */ 312 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 313 pii->pii_in_use = 0; 314 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 315 li->li_in_use = 0; 316 if (pii->pii_probe_logint == li) 317 li->li_dupaddr = 0; 318 } 319 } 320 321 lifn.lifn_family = AF_UNSPEC; 322 lifn.lifn_flags = LIFC_ALLZONES; 323 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 324 logperror("initifs: ioctl (get interface numbers)"); 325 return; 326 } 327 numifs = lifn.lifn_count; 328 329 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 330 if (buf == NULL) { 331 logperror("initifs: calloc"); 332 return; 333 } 334 335 lifc.lifc_family = AF_UNSPEC; 336 lifc.lifc_flags = LIFC_ALLZONES; 337 lifc.lifc_len = numifs * sizeof (struct lifreq); 338 lifc.lifc_buf = buf; 339 340 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 341 /* 342 * EINVAL is commonly encountered, when things change 343 * underneath us rapidly, (eg. at boot, when new interfaces 344 * are plumbed successively) and the kernel finds the buffer 345 * size we passed as too small. We will retry again 346 * when we see the next routing socket msg, or at worst after 347 * IF_SCAN_INTERVAL ms. 348 */ 349 if (errno != EINVAL) { 350 logperror("initifs: ioctl" 351 " (get interface configuration)"); 352 } 353 free(buf); 354 return; 355 } 356 357 lifr = (struct lifreq *)lifc.lifc_req; 358 359 /* 360 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 361 * and get the state of the corresponding phyint_instance. If it is 362 * successful, then call logint_init_from_k() to get the state of the 363 * logint. 364 */ 365 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 366 int sockfd; 367 struct local_addr *taddr; 368 struct sockaddr_in *sin; 369 struct sockaddr_in6 *sin6; 370 struct lifreq lifreq; 371 372 af = lifr->lifr_addr.ss_family; 373 374 /* 375 * Collect all local addresses. 376 */ 377 sockfd = (af == AF_INET) ? 
ifsock_v4 : ifsock_v6; 378 (void) memset(&lifreq, 0, sizeof (lifreq)); 379 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, 380 sizeof (lifreq.lifr_name)); 381 382 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 383 if (errno != ENXIO) 384 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 385 continue; 386 } 387 388 /* 389 * Add the interface address to laddr_list. 390 * Another node might have the same IP address which is up. 391 * In that case, it is appropriate to use the address as a 392 * target, even though it is also configured (but not up) on 393 * the local system. 394 * Hence,the interface address is not added to laddr_list 395 * unless it is IFF_UP. 396 */ 397 if (lifreq.lifr_flags & IFF_UP) { 398 taddr = malloc(sizeof (struct local_addr)); 399 if (taddr == NULL) { 400 logperror("initifs: malloc"); 401 continue; 402 } 403 if (af == AF_INET) { 404 sin = (struct sockaddr_in *)&lifr->lifr_addr; 405 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 406 &taddr->addr); 407 } else { 408 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; 409 taddr->addr = sin6->sin6_addr; 410 } 411 taddr->next = laddr_list; 412 laddr_list = taddr; 413 } 414 415 /* 416 * Need to pass a phyint name to pii_process. Insert the 417 * null where the ':' IF_SEPARATOR is found in the logical 418 * name. 419 */ 420 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 421 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 422 *cp = '\0'; 423 424 exists = pii_process(af, pi_name, &pii); 425 if (exists) { 426 /* The phyint is fine. 
So process the logint */ 427 logint_init_from_k(pii, lifr->lifr_name); 428 } 429 check_addr_unique(af, lifr->lifr_name); 430 } 431 432 free(buf); 433 434 /* 435 * If the test address is now unique, and if it was not unique 436 * previously, clear the li_dupaddrmsg_printed flag and log a 437 * recovery message 438 */ 439 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 440 struct logint *li; 441 char abuf[INET6_ADDRSTRLEN]; 442 443 li = pii->pii_probe_logint; 444 if ((li != NULL) && !li->li_dupaddr && 445 li->li_dupaddrmsg_printed) { 446 logerr("Test address %s is unique; enabling probe-" 447 "based failure detection\n", 448 pr_addr(pii->pii_af, li->li_addr, abuf, 449 sizeof (abuf))); 450 li->li_dupaddrmsg_printed = 0; 451 } 452 } 453 454 /* 455 * Scan for phyints and logints that have disappeared from the 456 * kernel, and delete them. 457 */ 458 pii = phyint_instances; 459 460 while (pii != NULL) { 461 next_pii = pii->pii_next; 462 check_if_removed(pii); 463 pii = next_pii; 464 } 465 466 /* 467 * Select a test address for sending probes on each phyint instance 468 */ 469 select_test_ifs(); 470 471 /* 472 * Handle link up/down notifications from the NICs. 473 */ 474 process_link_state_changes(); 475 476 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 477 /* 478 * If this is a case of group failure, we don't have much 479 * to do until the group recovers again. 480 */ 481 if (GROUP_FAILED(pi->pi_group)) 482 continue; 483 484 /* 485 * Try/Retry any pending failovers / failbacks, that did not 486 * not complete, or that could not be initiated previously. 
487 * This implements the 3 invariants described in the big block 488 * comment at the beginning of probe.c 489 */ 490 if (pi->pi_flags & IFF_INACTIVE) { 491 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 492 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 493 } else { 494 struct phyint_instance *pii; 495 496 pii = pi->pi_v4; 497 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 498 pii = pi->pi_v6; 499 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 500 continue; 501 /* 502 * It is possible that the phyint has started 503 * receiving packets, after it has been marked 504 * PI_FAILED. Don't initiate failover, if the 505 * phyint has started recovering. failure_state() 506 * captures this check. A similar logic is used 507 * for failback/repair case. 508 */ 509 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 510 (failure_state(pii) == PHYINT_FAILURE)) { 511 (void) try_failover(pi, FAILOVER_NORMAL); 512 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 513 if (try_failback(pi, _B_FALSE) != 514 IPMP_FAILURE) { 515 (void) change_lif_flags(pi, IFF_FAILED, 516 _B_FALSE); 517 /* Per state diagram */ 518 pi->pi_empty = 0; 519 } 520 } 521 } 522 } 523 } 524 525 /* 526 * Check that test/probe addresses are always unique. link-locals and 527 * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER) 528 * address can produce unexpected results. Log an error and alert the user. 529 */ 530 static void 531 check_addr_unique(int af, char *name) 532 { 533 struct lifreq lifr; 534 struct phyint *pi; 535 struct in6_addr addr; 536 struct phyint_instance *pii; 537 struct sockaddr_in *sin; 538 struct sockaddr_in6 *sin6; 539 int ifsock; 540 char abuf[INET6_ADDRSTRLEN]; 541 542 /* Get the socket for doing ioctls */ 543 ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 544 545 (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); 546 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 547 /* 548 * Get the address corresponding to 'name'. 
We cannot 549 * do a logint lookup in our tables, because, not all logints 550 * in the system are tracked by mpathd. (eg. things not in a group) 551 */ 552 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 553 if (errno == ENXIO) { 554 /* Interface has vanished */ 555 return; 556 } else { 557 logperror("ioctl (get addr)"); 558 return; 559 } 560 } 561 562 if (af == AF_INET) { 563 sin = (struct sockaddr_in *)&lifr.lifr_addr; 564 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 565 } else { 566 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 567 addr = sin6->sin6_addr; 568 } 569 570 /* 571 * Does the address 'addr' match any known test address ? If so 572 * it is a duplicate, unless we are looking at the same logint 573 */ 574 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 575 pii = PHYINT_INSTANCE(pi, af); 576 if (pii == NULL || pii->pii_probe_logint == NULL) 577 continue; 578 579 if (!IN6_ARE_ADDR_EQUAL(&addr, 580 &pii->pii_probe_logint->li_addr)) { 581 continue; 582 } 583 584 if (strncmp(pii->pii_probe_logint->li_name, name, 585 sizeof (pii->pii_probe_logint->li_name)) == 0) { 586 continue; 587 } 588 589 /* 590 * This test address is not unique. Set the dupaddr bit 591 */ 592 pii->pii_probe_logint->li_dupaddr = 1; 593 594 /* 595 * Log an error message if not already logged 596 */ 597 if (pii->pii_probe_logint->li_dupaddrmsg_printed) 598 continue; 599 600 logerr("Test address %s is not unique; disabling " 601 "probe-based failure detection\n", 602 pr_addr(af, addr, abuf, sizeof (abuf))); 603 604 pii->pii_probe_logint->li_dupaddrmsg_printed = 1; 605 } 606 } 607 608 /* 609 * Stop probing an interface. Called when an interface is offlined. 610 * The probe socket is closed on each interface instance, and the 611 * interface state set to PI_OFFLINE. 
612 */ 613 static void 614 stop_probing(struct phyint *pi) 615 { 616 struct phyint_instance *pii; 617 618 pii = pi->pi_v4; 619 if (pii != NULL) { 620 if (pii->pii_probe_sock != -1) 621 close_probe_socket(pii, _B_TRUE); 622 pii->pii_probe_logint = NULL; 623 } 624 625 pii = pi->pi_v6; 626 if (pii != NULL) { 627 if (pii->pii_probe_sock != -1) 628 close_probe_socket(pii, _B_TRUE); 629 pii->pii_probe_logint = NULL; 630 } 631 632 phyint_chstate(pi, PI_OFFLINE); 633 } 634 635 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 636 637 /* 638 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 639 * IFF_UP must also be set so that the associated address can be used as a 640 * source address. Further, we must be able to exchange packets with local 641 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 642 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 643 */ 644 static int 645 rate_testflags(uint64_t flags) 646 { 647 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 648 return (BAD_TESTFLAGS); 649 650 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 651 return (BAD_TESTFLAGS); 652 653 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 654 return (BEST_TESTFLAGS); 655 656 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 657 return (BEST_TESTFLAGS); 658 659 return (OK_TESTFLAGS); 660 } 661 662 /* 663 * Attempt to select a test address for each phyint instance. 664 * Call phyint_inst_sockinit() to complete the initializations. 
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint		*probe_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;
	int			rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * The successor is fetched up front because the instance may be
	 * deleted below if phyint_inst_sockinit() fails.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 * This also covers the case where no usable test address
		 * was found at all (probe_logint is still NULL).
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket. If that fails
		 * the phyint instance is deleted.
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface, replace whatever target
		 * is present with one created from the destination
		 * address of the new test address. The asserts require
		 * the target list to be empty once the one target has
		 * been deleted.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, probe_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance, unless it has
		 * been started before or there is no probe socket.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}

/*
 * Check phyint group configuration, to detect any inconsistencies,
 * and log an error message. This is called from run_timeouts() right
 * after each periodic initifs() scan. But the error message is displayed
 * once. If the inconsistency is resolved by the admin, a recovery message
 * is displayed once.
 */
static void
check_config(void)
{
	struct phyint_group *pg;
	struct phyint *pi;
	boolean_t v4_in_group;
	boolean_t v6_in_group;

	/*
	 * All phyints of a group must be homogenous to ensure that
	 * failover or failback can be done. If any phyint in a group
	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
	 * Do a similar check for IPv6. The anonymous group is exempt.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		if (pg == phyint_anongroup)
			continue;

		v4_in_group = _B_FALSE;
		v6_in_group = _B_FALSE;
		/*
		 * 1st pass. Determine if at least 1 phyint in the group
		 * has IPv4 plumbed and if so set v4_in_group to true.
		 * Repeat similarly for IPv6.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_v4 != NULL)
				v4_in_group = _B_TRUE;
			if (pi->pi_v6 != NULL)
				v6_in_group = _B_TRUE;
		}

		/*
		 * 2nd pass. If v4_in_group is true, check that phyint
		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
		 * out a message the 1st time only. Offline phyints are
		 * not checked. Because the two checks form an if/else-if
		 * chain sharing pi_cfgmsg_printed, a phyint missing both
		 * protocols is only reported for IPv4.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_flags & IFF_OFFLINE)
				continue;

			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv4 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else if (v6_in_group == _B_TRUE &&
			    pi->pi_v6 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv6 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else {
				/*
				 * The phyint matches the group configuration,
				 * if we have reached this point. If it was
				 * improperly configured earlier, log an
				 * error recovery message
				 */
				if (pi->pi_cfgmsg_printed) {
					logerr("NIC %s is now consistent with "
					    "group %s and failover capability "
					    "is restored\n", pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 0;
				}
			}

		}
	}

	/*
	 * In order to perform probe-based failure detection, a phyint must
	 * have at least 1 test/probe address for sending and receiving probes
	 * (either on IPv4 or IPv6 instance or both). If no test address has
	 * been configured, notify the administrator, but continue on since we
	 * can still perform load spreading, along with "link up/down" based
	 * failure detection. pi_taddrmsg_printed ensures that the warning
	 * and the later recovery message are each logged only once.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_flags & IFF_OFFLINE)
			continue;

		if ((pi->pi_v4 == NULL ||
		    pi->pi_v4->pii_probe_logint == NULL) &&
		    (pi->pi_v6 == NULL ||
		    pi->pi_v6->pii_probe_logint == NULL)) {
			if (!pi->pi_taddrmsg_printed) {
				logerr("No test address configured on "
				    "interface %s; disabling probe-based "
				    "failure detection on it\n", pi->pi_name);
				pi->pi_taddrmsg_printed = 1;
			}
		} else if (pi->pi_taddrmsg_printed) {
			logerr("Test address now configured on interface %s; "
			    "enabling probe-based failure detection on it\n",
			    pi->pi_name);
			pi->pi_taddrmsg_printed = 0;
		}

	}
}

/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule()
 */
static uint_t timer_next;	/* Currently scheduled timeout */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */

/*
 * Initialize the timer state and arm the first timeout.
 */
static void
timer_init(void)
{
	timer_next = getcurrenttime() + TIMER_INFINITY;
	/*
	 * The call to run_timeouts() will get the timer started
	 * Since there are no phyints at this point, the timer will
	 * be set for IF_SCAN_INTERVAL ms.
	 */
	run_timeouts();
}

/*
 * Make sure the next SIGALRM occurs delay milliseconds from the current
 * time if not earlier. We are interested only in time differences.
1014 */ 1015 void 1016 timer_schedule(uint_t delay) 1017 { 1018 uint_t now; 1019 struct itimerval itimerval; 1020 1021 if (debug & D_TIMER) 1022 logdebug("timer_schedule(%u)\n", delay); 1023 1024 assert(delay <= TIMER_INFINITY); 1025 1026 now = getcurrenttime(); 1027 if (delay == 0) { 1028 /* Minimum allowed delay */ 1029 delay = 1; 1030 } 1031 /* Will this timer occur before the currently scheduled SIGALRM? */ 1032 if (timer_active && TIME_GE(now + delay, timer_next)) { 1033 if (debug & D_TIMER) { 1034 logdebug("timer_schedule(%u) - no action: " 1035 "now %u next %u\n", delay, now, timer_next); 1036 } 1037 return; 1038 } 1039 timer_next = now + delay; 1040 1041 itimerval.it_value.tv_sec = delay / 1000; 1042 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1043 itimerval.it_interval.tv_sec = 0; 1044 itimerval.it_interval.tv_usec = 0; 1045 if (debug & D_TIMER) { 1046 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1047 delay, itimerval.it_value.tv_sec, 1048 itimerval.it_value.tv_usec); 1049 } 1050 timer_active = _B_TRUE; 1051 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1052 logperror("timer_schedule: setitimer"); 1053 exit(2); 1054 } 1055 } 1056 1057 /* 1058 * Timer has fired. Determine when the next timer event will occur by asking 1059 * all the timer routines. Should not be called from a timer routine. 1060 */ 1061 static void 1062 run_timeouts(void) 1063 { 1064 uint_t next; 1065 uint_t next_event_time; 1066 struct phyint_instance *pii; 1067 struct phyint_instance *next_pii; 1068 static boolean_t timeout_running; 1069 1070 /* assert that recursive timeouts don't happen. 
*/ 1071 assert(!timeout_running); 1072 1073 timeout_running = _B_TRUE; 1074 1075 if (debug & D_TIMER) 1076 logdebug("run_timeouts()\n"); 1077 1078 next = TIMER_INFINITY; 1079 1080 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1081 next_pii = pii->pii_next; 1082 next_event_time = phyint_inst_timer(pii); 1083 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1084 next = next_event_time; 1085 1086 if (debug & D_TIMER) { 1087 logdebug("run_timeouts(%s %s): next scheduled for" 1088 " this phyint inst %u, next scheduled global" 1089 " %u ms\n", 1090 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1091 next_event_time, next); 1092 } 1093 } 1094 1095 /* 1096 * Make sure initifs() is called at least once every 1097 * IF_SCAN_INTERVAL, to make sure that we are in sync 1098 * with the kernel, in case we have missed any routing 1099 * socket messages. 1100 */ 1101 if (next > IF_SCAN_INTERVAL) 1102 next = IF_SCAN_INTERVAL; 1103 1104 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1105 initifs(); 1106 check_config(); 1107 } 1108 1109 if (debug & D_TIMER) 1110 logdebug("run_timeouts: %u ms\n", next); 1111 1112 timer_schedule(next); 1113 timeout_running = _B_FALSE; 1114 } 1115 1116 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1117 static int eventpipe_write = -1; 1118 static boolean_t cleanup_started = _B_FALSE; 1119 /* Don't write to eventpipe if in cleanup */ 1120 /* 1121 * Ensure that signals are processed synchronously with the rest of 1122 * the code by just writing a one character signal number on the pipe. 1123 * The poll loop will pick this up and process the signal event. 1124 */ 1125 static void 1126 sig_handler(int signo) 1127 { 1128 uchar_t buf = (uchar_t)signo; 1129 1130 /* 1131 * Don't write to pipe if cleanup has already begun. 
cleanup() 1132 * might have closed the pipe already 1133 */ 1134 if (cleanup_started) 1135 return; 1136 1137 if (eventpipe_write == -1) { 1138 logerr("sig_handler: no pipe found\n"); 1139 return; 1140 } 1141 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1142 logperror("sig_handler: write"); 1143 } 1144 1145 extern struct probes_missed probes_missed; 1146 1147 /* 1148 * Pick up a signal "byte" from the pipe and process it. 1149 */ 1150 static void 1151 in_signal(int fd) 1152 { 1153 uchar_t buf; 1154 uint64_t sent, acked, lost, unacked, unknown; 1155 struct phyint_instance *pii; 1156 int pr_ndx; 1157 1158 switch (read(fd, &buf, sizeof (buf))) { 1159 case -1: 1160 logperror("in_signal: read"); 1161 exit(1); 1162 /* NOTREACHED */ 1163 case 1: 1164 break; 1165 case 0: 1166 logerr("in_signal: read end of file\n"); 1167 exit(1); 1168 /* NOTREACHED */ 1169 default: 1170 logerr("in_signal: read > 1\n"); 1171 exit(1); 1172 } 1173 1174 if (debug & D_TIMER) 1175 logdebug("in_signal() got %d\n", buf); 1176 1177 switch (buf) { 1178 case SIGALRM: 1179 if (debug & D_TIMER) { 1180 uint_t now = getcurrenttime(); 1181 1182 logdebug("in_signal(SIGALRM) delta %u\n", 1183 now - timer_next); 1184 } 1185 timer_active = _B_FALSE; 1186 run_timeouts(); 1187 break; 1188 case SIGUSR1: 1189 logdebug("Printing configuration:\n"); 1190 /* Print out the internal tables */ 1191 phyint_inst_print_all(); 1192 1193 /* 1194 * Print out the accumulated statistics about missed 1195 * probes (happens due to scheduling delay). 1196 */ 1197 logerr("Missed sending total of %d probes spread over" 1198 " %d occurrences\n", probes_missed.pm_nprobes, 1199 probes_missed.pm_ntimes); 1200 1201 /* 1202 * Print out the accumulated statistics about probes 1203 * that were sent. 
1204 */ 1205 for (pii = phyint_instances; pii != NULL; 1206 pii = pii->pii_next) { 1207 unacked = 0; 1208 acked = pii->pii_cum_stats.acked; 1209 lost = pii->pii_cum_stats.lost; 1210 sent = pii->pii_cum_stats.sent; 1211 unknown = pii->pii_cum_stats.unknown; 1212 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1213 switch (pii->pii_probes[pr_ndx].pr_status) { 1214 case PR_ACKED: 1215 acked++; 1216 break; 1217 case PR_LOST: 1218 lost++; 1219 break; 1220 case PR_UNACKED: 1221 unacked++; 1222 break; 1223 } 1224 } 1225 logerr("\nProbe stats on (%s %s)\n" 1226 "Number of probes sent %lld\n" 1227 "Number of probe acks received %lld\n" 1228 "Number of probes/acks lost %lld\n" 1229 "Number of valid unacknowled probes %lld\n" 1230 "Number of ambiguous probe acks received %lld\n", 1231 AF_STR(pii->pii_af), pii->pii_name, 1232 sent, acked, lost, unacked, unknown); 1233 } 1234 break; 1235 case SIGHUP: 1236 logerr("SIGHUP: restart and reread config file\n"); 1237 cleanup(); 1238 (void) execv(argv0[0], argv0); 1239 _exit(0177); 1240 /* NOTREACHED */ 1241 case SIGINT: 1242 case SIGTERM: 1243 case SIGQUIT: 1244 cleanup(); 1245 exit(0); 1246 /* NOTREACHED */ 1247 default: 1248 logerr("in_signal: unknown signal: %d\n", buf); 1249 } 1250 } 1251 1252 static void 1253 cleanup(void) 1254 { 1255 struct phyint_instance *pii; 1256 struct phyint_instance *next_pii; 1257 1258 /* 1259 * Make sure that we don't write to eventpipe in 1260 * sig_handler() if any signal notably SIGALRM, 1261 * occurs after we close the eventpipe descriptor below 1262 */ 1263 cleanup_started = _B_TRUE; 1264 1265 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1266 next_pii = pii->pii_next; 1267 phyint_inst_delete(pii); 1268 } 1269 1270 (void) close(ifsock_v4); 1271 (void) close(ifsock_v6); 1272 (void) close(rtsock_v4); 1273 (void) close(rtsock_v6); 1274 (void) close(lsock_v4); 1275 (void) close(lsock_v6); 1276 (void) close(0); 1277 (void) close(1); 1278 (void) close(2); 1279 (void) close(mibfd); 
1280 (void) close(eventpipe_read); 1281 (void) close(eventpipe_write); 1282 } 1283 1284 /* 1285 * Create pipe for signal delivery and set up signal handlers. 1286 */ 1287 static void 1288 setup_eventpipe(void) 1289 { 1290 int fds[2]; 1291 struct sigaction act; 1292 1293 if ((pipe(fds)) < 0) { 1294 logperror("setup_eventpipe: pipe"); 1295 exit(1); 1296 } 1297 eventpipe_read = fds[0]; 1298 eventpipe_write = fds[1]; 1299 if (poll_add(eventpipe_read) == -1) { 1300 exit(1); 1301 } 1302 1303 act.sa_handler = sig_handler; 1304 act.sa_flags = SA_RESTART; 1305 (void) sigaction(SIGALRM, &act, NULL); 1306 1307 (void) sigset(SIGHUP, sig_handler); 1308 (void) sigset(SIGUSR1, sig_handler); 1309 (void) sigset(SIGTERM, sig_handler); 1310 (void) sigset(SIGINT, sig_handler); 1311 (void) sigset(SIGQUIT, sig_handler); 1312 } 1313 1314 /* 1315 * Create a routing socket for receiving RTM_IFINFO messages. 1316 */ 1317 static int 1318 setup_rtsock(int af) 1319 { 1320 int s; 1321 int flags; 1322 1323 s = socket(PF_ROUTE, SOCK_RAW, af); 1324 if (s == -1) { 1325 logperror("setup_rtsock: socket PF_ROUTE"); 1326 exit(1); 1327 } 1328 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1329 logperror("setup_rtsock: fcntl F_GETFL"); 1330 (void) close(s); 1331 exit(1); 1332 } 1333 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1334 logperror("setup_rtsock: fcntl F_SETFL"); 1335 (void) close(s); 1336 exit(1); 1337 } 1338 if (poll_add(s) == -1) { 1339 (void) close(s); 1340 exit(1); 1341 } 1342 return (s); 1343 } 1344 1345 /* 1346 * Process an RTM_IFINFO message received on a routing socket. 1347 * The return value indicates whether a full interface scan is required. 1348 * Link up/down notifications from the NICs are reflected in the 1349 * IFF_RUNNING flag. 1350 * If just the state of the IFF_RUNNING interface flag has changed, a 1351 * a full interface scan isn't required. 
*/
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure directly follows the if_msghdr_t
	 * structure. However, at the time of writing, the size of the
	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
	 * to the presence of a timeval structure, which contains longs,
	 * in the if_data structure. Anyway, we know where the message ends,
	 * so we work backwards to get the start of the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
	    sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name. The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: "
			    "phyint name too long\n");
		return (_B_TRUE);
	}
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	/* An unknown phyint is resolved by requesting a full scan. */
	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
			    " for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the NICs, as indicated
	 * by the state of the IFF_RUNNING flag. If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance. If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance. These are only used in
	 * this routine. At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag. The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/* Remember this instance's previous flags, then refresh both copies. */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
		phyint_newtype(pi);

	/*
	 * If IFF_INACTIVE has been set, then no data addresses should be
	 * hosted on the interface. If IFF_INACTIVE has been cleared, then
	 * move previously failed-over addresses back to it, provided it is
	 * not failed. For details, see the state diagram in mpd_probe.c.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) {
		if (pii->pii_flags & IFF_INACTIVE) {
			if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY))
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				pi->pi_empty = 0;
				(void) try_failback(pi, _B_FALSE);
			}
		}
	}

	/* Has just the IFF_RUNNING flag changed state ? */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change. Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	return (_B_FALSE);
}

/*
 * Retrieve as many routing socket messages as possible, and try to
 * empty the routing sockets. Initiate full scan of targets or interfaces
 * as needed.
 * We listen on separate IPv4 and IPv6 sockets so that we can accurately
 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
 *
 * Note that the parameters shadow the file-scope rtsock_v4/rtsock_v6.
 */
static void
process_rtsock(int rtsock_v4, int rtsock_v6)
{
	int nbytes;
	int64_t msg[2048 / 8];
	struct rt_msghdr *rtm;
	boolean_t need_if_scan = _B_FALSE;
	boolean_t need_rt_scan = _B_FALSE;
	boolean_t rtm_ifinfo_seen = _B_FALSE;
	int type;

	/* Read as many messages as possible and try to empty the sockets */
	for (type = AF_INET; ; type = AF_INET6) {
		for (;;) {
			nbytes = read((type == AF_INET) ? rtsock_v4 :
			    rtsock_v6, msg, sizeof (msg));
			if (nbytes <= 0) {
				/* No more messages */
				break;
			}
			rtm = (struct rt_msghdr *)msg;
			if (rtm->rtm_version != RTM_VERSION) {
				/* Stop reading from this socket. */
				logerr("process_rtsock: version %d "
				    "not understood\n", rtm->rtm_version);
				break;
			}

			if (debug & D_PHYINT) {
				logdebug("process_rtsock: message %d\n",
				    rtm->rtm_type);
			}

			switch (rtm->rtm_type) {
			case RTM_NEWADDR:
			case RTM_DELADDR:
				/*
				 * Some logical interface has changed,
				 * have to scan everything to determine
				 * what actually changed.
				 */
				need_if_scan = _B_TRUE;
				break;

			case RTM_IFINFO:
				rtm_ifinfo_seen = _B_TRUE;
				need_if_scan |=
				    process_rtm_ifinfo((if_msghdr_t *)rtm,
				    type);
				break;

			case RTM_ADD:
			case RTM_DELETE:
			case RTM_CHANGE:
			case RTM_OLDADD:
			case RTM_OLDDEL:
				need_rt_scan = _B_TRUE;
				break;

			default:
				/* Not interesting */
				break;
			}
		}
		/* Both sockets have been drained once the IPv6 pass ends. */
		if (type == AF_INET6)
			break;
	}

	if (need_if_scan) {
		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
			logdebug("process_rtsock: synchronizing with kernel\n");
		initifs();
	} else if (rtm_ifinfo_seen) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtsock: "
			    "link up/down notification(s) seen\n");
		process_link_state_changes();
	}

	if (need_rt_scan)
		init_router_targets();
}

/*
 * Look if the phyint instance or one of its logints have been removed from
 * the kernel and take appropriate action.
 * Uses {pii,li}_in_use.
 */
static void
check_if_removed(struct phyint_instance *pii)
{
	struct logint *li;
	struct logint *next_li;

	/* Detect phyints that have been removed from the kernel.
*/
	if (!pii->pii_in_use) {
		logtrace("%s %s has been removed from kernel\n",
		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
		phyint_inst_delete(pii);
	} else {
		/* Detect logints that have been removed. */
		for (li = pii->pii_logint; li != NULL; li = next_li) {
			/* Fetch the successor first; li may be deleted. */
			next_li = li->li_next;
			if (!li->li_in_use) {
				logint_delete(li);
			}
		}
	}
}

/*
 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
 * tables defined by mib2.h. Parse the returned data and extract
 * the 'routing' information table. Process the 'routing' table
 * to get the list of known onlink routers, and update our database.
 * These onlink routers will serve as our probe targets.
 * Returns false, if any system calls resulted in errors, true otherwise.
 */
static boolean_t
update_router_list(int fd)
{
	/* Control-part buffer shared by the request and every reply. */
	union {
		char ubuf[1024];
		union T_primitives uprim;
	} buf;

	int flags;
	struct strbuf ctlbuf;
	struct strbuf databuf;
	struct T_optmgmt_req *tor;
	struct T_optmgmt_ack *toa;
	struct T_error_ack *tea;
	struct opthdr *optp;
	struct opthdr *req;
	int status;
	t_scalar_t prim;

	tor = (struct T_optmgmt_req *)&buf;

	tor->PRIM_type = T_SVR4_OPTMGMT_REQ;
	tor->OPT_offset = sizeof (struct T_optmgmt_req);
	tor->OPT_length = sizeof (struct opthdr);
	tor->MGMT_flags = T_CURRENT;

	/* The option header immediately follows the request structure. */
	req = (struct opthdr *)&tor[1];
	req->level = MIB2_IP;	/* any MIB2_xxx value ok here */
	req->name = 0;
	req->len = 0;

	ctlbuf.buf = (char *)&buf;
	ctlbuf.len = tor->OPT_length + tor->OPT_offset;
	ctlbuf.maxlen = sizeof (buf);
	flags = 0;
	if (putmsg(fd, &ctlbuf, NULL, flags) == -1) {
		logperror("update_router_list: putmsg(ctl)");
		return (_B_FALSE);
	}

	/*
	 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for
	 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains
	 * a control and data part. The control part contains a struct
	 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies
	 * the level, name and length of the data in the data part. The
	 * data part contains the actual table data. The last message
	 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a
	 * single option with zero optlen.
	 */

	for (;;) {
		/*
		 * Go around this loop once for each table. Ignore
		 * all tables except the routing information table.
		 */
		flags = 0;
		status = getmsg(fd, &ctlbuf, NULL, &flags);
		if (status < 0) {
			if (errno == EINTR)
				continue;
			logperror("update_router_list: getmsg(ctl)");
			return (_B_FALSE);
		}
		if (ctlbuf.len < sizeof (t_scalar_t)) {
			logerr("update_router_list: ctlbuf.len %d\n",
			    ctlbuf.len);
			return (_B_FALSE);
		}

		prim = buf.uprim.type;

		switch (prim) {

		case T_ERROR_ACK:
			tea = &buf.uprim.error_ack;
			if (ctlbuf.len < sizeof (struct T_error_ack)) {
				logerr("update_router_list: T_ERROR_ACK"
				    " ctlbuf.len %d\n", ctlbuf.len);
				return (_B_FALSE);
			}
			logerr("update_router_list: T_ERROR_ACK:"
			    " TLI_error = 0x%lx, UNIX_error = 0x%lx\n",
			    tea->TLI_error, tea->UNIX_error);
			return (_B_FALSE);

		case T_OPTMGMT_ACK:
			toa = &buf.uprim.optmgmt_ack;
			/* The option header follows the ack structure. */
			optp = (struct opthdr *)&toa[1];
			if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) {
				logerr("update_router_list: ctlbuf.len %d\n",
				    ctlbuf.len);
				return (_B_FALSE);
			}
			if (toa->MGMT_flags != T_SUCCESS) {
				logerr("update_router_list: MGMT_flags 0x%lx\n",
				    toa->MGMT_flags);
				return (_B_FALSE);
			}
			break;

		default:
			logerr("update_router_list: unknown primitive %ld\n",
			    prim);
			return (_B_FALSE);
		}

		/* Process the T_OPTMGMT_ACK below */
		assert(prim == T_OPTMGMT_ACK);

		switch (status) {
		case 0:
			/*
			 * We have reached the end of this T_OPTMGMT_ACK
			 * message. If this is the last message i.e EOD,
			 * return, else process the next T_OPTMGMT_ACK msg.
			 */
			if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) +
			    sizeof (struct opthdr)) && optp->len == 0 &&
			    optp->name == 0 && optp->level == 0) {
				/*
				 * This is the EOD message. Return
				 */
				return (_B_TRUE);
			}
			continue;

		case MORECTL:
		case MORECTL | MOREDATA:
			/*
			 * This should not happen. We should be able to read
			 * the control portion in a single getmsg.
			 */
			logerr("update_router_list: MORECTL\n");
			return (_B_FALSE);

		case MOREDATA:
			/* Size the data buffer from the option header. */
			databuf.maxlen = optp->len;
			/* malloc of 0 bytes is ok */
			databuf.buf = malloc((size_t)optp->len);
			if (databuf.maxlen != 0 && databuf.buf == NULL) {
				logperror("update_router_list: malloc");
				return (_B_FALSE);
			}
			databuf.len = 0;
			flags = 0;
			/* Retry the data read for as long as it is interrupted. */
			for (;;) {
				status = getmsg(fd, NULL, &databuf, &flags);
				if (status >= 0) {
					break;
				} else if (errno == EINTR) {
					continue;
				} else {
					logperror("update_router_list:"
					    " getmsg(data)");
					free(databuf.buf);
					return (_B_FALSE);
				}
			}

			/* Only the IPv4 and IPv6 route tables are consumed. */
			if (optp->level == MIB2_IP &&
			    optp->name == MIB2_IP_ROUTE) {
				/* LINTED */
				ire_process_v4((mib2_ipRouteEntry_t *)
				    databuf.buf, databuf.len);
			} else if (optp->level == MIB2_IP6 &&
			    optp->name == MIB2_IP6_ROUTE) {
				/* LINTED */
				ire_process_v6((mib2_ipv6RouteEntry_t *)
				    databuf.buf, databuf.len);
			}
			free(databuf.buf);
		}
	}
	/* NOTREACHED */
}

/*
 * Examine the IPv4 routing table, for default routers.
For each default 1782 * router, populate the list of targets of each phyint that is on the same 1783 * link as the default router 1784 */ 1785 static void 1786 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1787 { 1788 mib2_ipRouteEntry_t *rp; 1789 mib2_ipRouteEntry_t *rp1; 1790 struct in_addr nexthop_v4; 1791 mib2_ipRouteEntry_t *endp; 1792 1793 if (len == 0) 1794 return; 1795 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1796 1797 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1798 1799 /* 1800 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1801 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1802 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1803 * This is a potential target for probing, which we try to add 1804 * to the list of probe targets. 1805 */ 1806 for (rp = buf; rp < endp; rp++) { 1807 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1808 continue; 1809 1810 /* Get the nexthop address. */ 1811 nexthop_v4.s_addr = rp->ipRouteNextHop; 1812 1813 /* 1814 * Get the nexthop address. Then determine the outgoing 1815 * interface, by examining all interface IREs, and picking the 1816 * match. We don't look at the interface specified in the route 1817 * because we need to add the router target on all matching 1818 * interfaces anyway; the goal is to avoid falling back to 1819 * multicast when some interfaces are in the same subnet but 1820 * not in the same group. 1821 */ 1822 for (rp1 = buf; rp1 < endp; rp1++) { 1823 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1824 continue; 1825 } 1826 1827 /* 1828 * Determine the interface IRE that matches the nexthop. 1829 * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) 1830 */ 1831 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1832 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1833 /* 1834 * We found the interface ire 1835 */ 1836 router_add_v4(rp1, nexthop_v4); 1837 } 1838 } 1839 } 1840 } 1841 1842 void 1843 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1844 { 1845 char *cp; 1846 char ifname[LIFNAMSIZ + 1]; 1847 struct in6_addr nexthop; 1848 int len; 1849 1850 if (debug & D_TARGET) 1851 logdebug("router_add_v4()\n"); 1852 1853 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1854 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1855 ifname[len] = '\0'; 1856 1857 if (ifname[0] == '\0') 1858 return; 1859 1860 cp = strchr(ifname, IF_SEPARATOR); 1861 if (cp != NULL) 1862 *cp = '\0'; 1863 1864 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1865 router_add_common(AF_INET, ifname, nexthop); 1866 } 1867 1868 void 1869 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1870 { 1871 struct phyint_instance *pii; 1872 struct phyint *pi; 1873 1874 if (debug & D_TARGET) 1875 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1876 1877 /* 1878 * Retrieve the phyint instance; bail if it's not known to us yet. 1879 */ 1880 pii = phyint_inst_lookup(af, ifname); 1881 if (pii == NULL) 1882 return; 1883 1884 /* 1885 * Don't use our own addresses as targets. 1886 */ 1887 if (own_address(nexthop)) 1888 return; 1889 1890 /* 1891 * If the phyint is part a named group, then add the address to all 1892 * members of the group; note that this is suboptimal in the IPv4 case 1893 * as it has already been added to all matching interfaces in 1894 * ire_process_v4(). Otherwise, add the address only to the phyint 1895 * itself, since other phyints in the anongroup may not be on the same 1896 * subnet. 
1897 */ 1898 pi = pii->pii_phyint; 1899 if (pi->pi_group == phyint_anongroup) { 1900 target_add(pii, nexthop, _B_TRUE); 1901 } else { 1902 pi = pi->pi_group->pg_phyint; 1903 for (; pi != NULL; pi = pi->pi_pgnext) 1904 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1905 } 1906 } 1907 1908 /* 1909 * Examine the IPv6 routing table, for default routers. For each default 1910 * router, populate the list of targets of each phyint that is on the same 1911 * link as the default router 1912 */ 1913 static void 1914 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1915 { 1916 mib2_ipv6RouteEntry_t *rp; 1917 mib2_ipv6RouteEntry_t *endp; 1918 struct in6_addr nexthop_v6; 1919 1920 if (debug & D_TARGET) 1921 logdebug("ire_process_v6(len %d)\n", len); 1922 1923 if (len == 0) 1924 return; 1925 1926 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1927 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1928 1929 /* 1930 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1931 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1932 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1933 * This is a potential target for probing, which we try to add 1934 * to the list of probe targets. 1935 */ 1936 for (rp = buf; rp < endp; rp++) { 1937 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1938 continue; 1939 1940 /* 1941 * We have the outgoing interface in ipv6RouteIfIndex 1942 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1943 * interface must be present for link-local addresses. Since 1944 * we use only link-local addreses for probing, we don't 1945 * consider the case when the outgoing interface is not 1946 * known and we need to scan interface ires 1947 */ 1948 nexthop_v6 = rp->ipv6RouteNextHop; 1949 if (rp->ipv6RouteIfIndex.o_length != 0) { 1950 /* 1951 * We already have the outgoing interface 1952 * in ipv6RouteIfIndex. 
1953 */ 1954 router_add_v6(rp, nexthop_v6); 1955 } 1956 } 1957 } 1958 1959 1960 void 1961 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1962 { 1963 char ifname[LIFNAMSIZ + 1]; 1964 char *cp; 1965 int len; 1966 1967 if (debug & D_TARGET) 1968 logdebug("router_add_v6()\n"); 1969 1970 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1971 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1972 ifname[len] = '\0'; 1973 1974 if (ifname[0] == '\0') 1975 return; 1976 1977 cp = strchr(ifname, IF_SEPARATOR); 1978 if (cp != NULL) 1979 *cp = '\0'; 1980 1981 router_add_common(AF_INET6, ifname, nexthop_v6); 1982 } 1983 1984 1985 1986 /* 1987 * Build a list of target routers, by scanning the routing tables. 1988 * It is assumed that interface routes exist, to reach the routers. 1989 */ 1990 static void 1991 init_router_targets(void) 1992 { 1993 struct target *tg; 1994 struct target *next_tg; 1995 struct phyint_instance *pii; 1996 struct phyint *pi; 1997 1998 if (force_mcast) 1999 return; 2000 2001 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2002 pi = pii->pii_phyint; 2003 /* 2004 * Exclude ptp and host targets. Set tg_in_use to false, 2005 * only for router targets. 
2006 */ 2007 if (!pii->pii_targets_are_routers || 2008 (pi->pi_flags & IFF_POINTOPOINT)) 2009 continue; 2010 2011 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 2012 tg->tg_in_use = 0; 2013 } 2014 2015 if (mibfd < 0) { 2016 mibfd = open("/dev/ip", O_RDWR); 2017 if (mibfd < 0) { 2018 logperror("mibopen: ip open"); 2019 exit(1); 2020 } 2021 } 2022 2023 if (!update_router_list(mibfd)) { 2024 (void) close(mibfd); 2025 mibfd = -1; 2026 } 2027 2028 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2029 if (!pii->pii_targets_are_routers || 2030 (pi->pi_flags & IFF_POINTOPOINT)) 2031 continue; 2032 2033 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 2034 next_tg = tg->tg_next; 2035 if (!tg->tg_in_use) { 2036 target_delete(tg); 2037 } 2038 } 2039 } 2040 } 2041 2042 /* 2043 * Attempt to assign host targets to any interfaces that do not currently 2044 * have probe targets by sharing targets with other interfaces in the group. 2045 */ 2046 static void 2047 init_host_targets(void) 2048 { 2049 struct phyint_instance *pii; 2050 struct phyint_group *pg; 2051 2052 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2053 pg = pii->pii_phyint->pi_group; 2054 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2055 dup_host_targets(pii); 2056 } 2057 } 2058 2059 /* 2060 * Duplicate host targets from other phyints of the group to 2061 * the phyint instance 'desired_pii'. 2062 */ 2063 static void 2064 dup_host_targets(struct phyint_instance *desired_pii) 2065 { 2066 int af; 2067 struct phyint *pi; 2068 struct phyint_instance *pii; 2069 struct target *tg; 2070 2071 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2072 2073 af = desired_pii->pii_af; 2074 2075 /* 2076 * For every phyint in the same group as desired_pii, check if 2077 * it has any host targets. If so add them to desired_pii. 
2078 */ 2079 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2080 pii = PHYINT_INSTANCE(pi, af); 2081 /* 2082 * We know that we don't have targets on this phyint instance 2083 * since we have been called. But we still check for 2084 * pii_targets_are_routers because another phyint instance 2085 * could have router targets, since IFF_NOFAILOVER addresses 2086 * on different phyint instances may belong to different 2087 * subnets. 2088 */ 2089 if ((pii == NULL) || (pii == desired_pii) || 2090 pii->pii_targets_are_routers) 2091 continue; 2092 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2093 target_create(desired_pii, tg->tg_address, _B_FALSE); 2094 } 2095 } 2096 } 2097 2098 static void 2099 usage(char *cmd) 2100 { 2101 (void) fprintf(stderr, "usage: %s\n", cmd); 2102 } 2103 2104 2105 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2106 2107 /* Get an option from the /etc/default/mpathd file */ 2108 static char * 2109 getdefault(char *name) 2110 { 2111 char namebuf[BUFSIZ]; 2112 char *value = NULL; 2113 2114 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2115 char *cp; 2116 int flags; 2117 2118 /* 2119 * ignore case 2120 */ 2121 flags = defcntl(DC_GETFLAGS, 0); 2122 TURNOFF(flags, DC_CASE); 2123 (void) defcntl(DC_SETFLAGS, flags); 2124 2125 /* Add "=" to the name */ 2126 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2127 (void) strncat(namebuf, "=", 2); 2128 2129 if ((cp = defread(namebuf)) != NULL) 2130 value = strdup(cp); 2131 2132 /* close */ 2133 (void) defopen((char *)NULL); 2134 } 2135 return (value); 2136 } 2137 2138 2139 /* 2140 * Command line options below 2141 */ 2142 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2143 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2144 static boolean_t adopt = _B_FALSE; 2145 static boolean_t foreground = _B_FALSE; 2146 2147 int 2148 main(int argc, char *argv[]) 2149 { 2150 int i; 2151 int c; 2152 struct phyint_instance *pii; 2153 char 
*value; 2154 2155 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2156 srandom(gethostid()); /* Initialize the random number generator */ 2157 2158 /* 2159 * NOTE: The messages output by in.mpathd are not suitable for 2160 * translation, so we do not call textdomain(). 2161 */ 2162 (void) setlocale(LC_ALL, ""); 2163 2164 /* 2165 * Get the user specified value of 'failure detection time' 2166 * from /etc/default/mpathd 2167 */ 2168 value = getdefault("FAILURE_DETECTION_TIME"); 2169 if (value != NULL) { 2170 user_failure_detection_time = 2171 (int)strtol((char *)value, NULL, 0); 2172 2173 if (user_failure_detection_time <= 0) { 2174 user_failure_detection_time = FAILURE_DETECTION_TIME; 2175 logerr("Invalid failure detection time %s, assuming " 2176 "default %d\n", value, user_failure_detection_time); 2177 2178 } else if (user_failure_detection_time < 2179 MIN_FAILURE_DETECTION_TIME) { 2180 user_failure_detection_time = 2181 MIN_FAILURE_DETECTION_TIME; 2182 logerr("Too small failure detection time of %s, " 2183 "assuming minimum %d\n", value, 2184 user_failure_detection_time); 2185 } 2186 free(value); 2187 } else { 2188 /* User has not specified the parameter, Use default value */ 2189 user_failure_detection_time = FAILURE_DETECTION_TIME; 2190 } 2191 2192 /* 2193 * This gives the frequency at which probes will be sent. 2194 * When fdt ms elapses, we should be able to determine 2195 * whether 5 consecutive probes have failed or not. 2196 * 1 probe will be sent in every user_probe_interval ms, 2197 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2198 * user_probe_interval. Thus when we send out probe 'n' we 2199 * can be sure that probe 'n - 2' is lost, if we have not 2200 * got the ack. (since the probe interval is > crtt). But 2201 * probe 'n - 1' may be a valid unacked probe, since the 2202 * time between 2 successive probes could be as small as 2203 * 0.5 * user_probe_interval. 
Hence the NUM_PROBE_FAILS + 2 2204 */ 2205 user_probe_interval = user_failure_detection_time / 2206 (NUM_PROBE_FAILS + 2); 2207 2208 /* 2209 * Get the user specified value of failback_enabled from 2210 * /etc/default/mpathd 2211 */ 2212 value = getdefault("FAILBACK"); 2213 if (value != NULL) { 2214 if (strncasecmp(value, "yes", 3) == 0) 2215 failback_enabled = _B_TRUE; 2216 else if (strncasecmp(value, "no", 2) == 0) 2217 failback_enabled = _B_FALSE; 2218 else 2219 logerr("Invalid value for FAILBACK %s\n", value); 2220 free(value); 2221 } else { 2222 failback_enabled = _B_TRUE; 2223 } 2224 2225 /* 2226 * Get the user specified value of track_all_phyints from 2227 * /etc/default/mpathd. The sense is reversed in 2228 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2229 */ 2230 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2231 if (value != NULL) { 2232 if (strncasecmp(value, "yes", 3) == 0) 2233 track_all_phyints = _B_FALSE; 2234 else if (strncasecmp(value, "no", 2) == 0) 2235 track_all_phyints = _B_TRUE; 2236 else 2237 logerr("Invalid value for " 2238 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2239 free(value); 2240 } else { 2241 track_all_phyints = _B_FALSE; 2242 } 2243 2244 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2245 switch (c) { 2246 case 'a': 2247 adopt = _B_TRUE; 2248 break; 2249 case 'm': 2250 force_mcast = _B_TRUE; 2251 break; 2252 case 'd': 2253 debug = D_ALL; 2254 foreground = _B_TRUE; 2255 break; 2256 case 'D': 2257 i = (int)strtol(optarg, NULL, 0); 2258 if (i == 0) { 2259 (void) fprintf(stderr, "Bad debug flags: %s\n", 2260 optarg); 2261 exit(1); 2262 } 2263 debug |= i; 2264 foreground = _B_TRUE; 2265 break; 2266 case 'l': 2267 /* 2268 * Turn off link state notification handling. 2269 * Undocumented command line flag, for debugging 2270 * purposes. 
2271 */ 2272 handle_link_notifications = _B_FALSE; 2273 break; 2274 default: 2275 usage(argv[0]); 2276 exit(1); 2277 } 2278 } 2279 2280 /* 2281 * The sockets for the loopback command interface should be listening 2282 * before we fork and exit in daemonize(). This way, whoever started us 2283 * can use the loopback interface as soon as they get a zero exit 2284 * status. 2285 */ 2286 lsock_v4 = setup_listener(AF_INET); 2287 lsock_v6 = setup_listener(AF_INET6); 2288 2289 if (lsock_v4 < 0 && lsock_v6 < 0) { 2290 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2291 exit(1); 2292 } 2293 2294 if (!foreground) { 2295 if (!daemonize()) { 2296 logerr("cannot daemonize\n"); 2297 exit(EXIT_FAILURE); 2298 } 2299 initlog(); 2300 } 2301 2302 /* 2303 * Initializations: 2304 * 1. Create ifsock* sockets. These are used for performing SIOC* 2305 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2306 * 2. Initialize a pipe for handling/recording signal events. 2307 * 3. Create the routing sockets, used for listening 2308 * to routing / interface changes. 2309 * 4. phyint_init() - Initialize physical interface state 2310 * (in mpd_tables.c). Must be done before creating interfaces, 2311 * which timer_init() does indirectly. 2312 * 5. timer_init() - Initialize timer related stuff 2313 * 6. initifs() - Initialize our database of all known interfaces 2314 * 7. init_router_targets() - Initialize our database of all known 2315 * router targets. 
2316 */ 2317 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2318 if (ifsock_v4 < 0) { 2319 logperror("main: IPv4 socket open"); 2320 exit(1); 2321 } 2322 2323 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2324 if (ifsock_v6 < 0) { 2325 logperror("main: IPv6 socket open"); 2326 exit(1); 2327 } 2328 2329 setup_eventpipe(); 2330 2331 rtsock_v4 = setup_rtsock(AF_INET); 2332 rtsock_v6 = setup_rtsock(AF_INET6); 2333 2334 if (phyint_init() == -1) { 2335 logerr("cannot initialize physical interface structures"); 2336 exit(1); 2337 } 2338 2339 timer_init(); 2340 2341 initifs(); 2342 2343 /* Inform kernel whether failback is enabled or disabled */ 2344 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2345 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2346 exit(1); 2347 } 2348 2349 /* 2350 * If we're operating in "adopt" mode and no interfaces need to be 2351 * tracked, shut down (ifconfig(1M) will restart us on demand if 2352 * interfaces are subsequently put into multipathing groups). 2353 */ 2354 if (adopt && phyint_instances == NULL) 2355 exit(0); 2356 2357 /* 2358 * Main body. Keep listening for activity on any of the sockets 2359 * that we are monitoring and take appropriate action as necessary. 2360 * signals are also handled synchronously. 
2361 */ 2362 for (;;) { 2363 if (poll(pollfds, pollfd_num, -1) < 0) { 2364 if (errno == EINTR) 2365 continue; 2366 logperror("main: poll"); 2367 exit(1); 2368 } 2369 for (i = 0; i < pollfd_num; i++) { 2370 if ((pollfds[i].fd == -1) || 2371 !(pollfds[i].revents & POLLIN)) 2372 continue; 2373 if (pollfds[i].fd == eventpipe_read) { 2374 in_signal(eventpipe_read); 2375 break; 2376 } 2377 if (pollfds[i].fd == rtsock_v4 || 2378 pollfds[i].fd == rtsock_v6) { 2379 process_rtsock(rtsock_v4, rtsock_v6); 2380 break; 2381 } 2382 for (pii = phyint_instances; pii != NULL; 2383 pii = pii->pii_next) { 2384 if (pollfds[i].fd == pii->pii_probe_sock) { 2385 if (pii->pii_af == AF_INET) 2386 in_data(pii); 2387 else 2388 in6_data(pii); 2389 break; 2390 } 2391 } 2392 if (pollfds[i].fd == lsock_v4) 2393 loopback_cmd(lsock_v4, AF_INET); 2394 else if (pollfds[i].fd == lsock_v6) 2395 loopback_cmd(lsock_v6, AF_INET6); 2396 } 2397 if (full_scan_required) { 2398 initifs(); 2399 full_scan_required = _B_FALSE; 2400 } 2401 } 2402 /* NOTREACHED */ 2403 return (EXIT_SUCCESS); 2404 } 2405 2406 static int 2407 setup_listener(int af) 2408 { 2409 int sock; 2410 int on; 2411 int len; 2412 int ret; 2413 struct sockaddr_storage laddr; 2414 struct sockaddr_in *sin; 2415 struct sockaddr_in6 *sin6; 2416 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2417 2418 assert(af == AF_INET || af == AF_INET6); 2419 2420 sock = socket(af, SOCK_STREAM, 0); 2421 if (sock < 0) { 2422 logperror("setup_listener: socket"); 2423 exit(1); 2424 } 2425 2426 on = 1; 2427 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2428 sizeof (on)) < 0) { 2429 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2430 exit(1); 2431 } 2432 2433 bzero(&laddr, sizeof (laddr)); 2434 laddr.ss_family = af; 2435 2436 if (af == AF_INET) { 2437 sin = (struct sockaddr_in *)&laddr; 2438 sin->sin_port = htons(MPATHD_PORT); 2439 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2440 len = sizeof (struct sockaddr_in); 2441 } else { 2442 
sin6 = (struct sockaddr_in6 *)&laddr; 2443 sin6->sin6_port = htons(MPATHD_PORT); 2444 sin6->sin6_addr = loopback_addr; 2445 len = sizeof (struct sockaddr_in6); 2446 } 2447 2448 ret = bind(sock, (struct sockaddr *)&laddr, len); 2449 if (ret < 0) { 2450 if (errno == EADDRINUSE) { 2451 /* 2452 * Another instance of mpathd may be already active. 2453 */ 2454 logerr("main: is another instance of in.mpathd " 2455 "already active?\n"); 2456 exit(1); 2457 } else { 2458 (void) close(sock); 2459 return (-1); 2460 } 2461 } 2462 if (listen(sock, 30) < 0) { 2463 logperror("main: listen"); 2464 exit(1); 2465 } 2466 if (poll_add(sock) == -1) { 2467 (void) close(sock); 2468 exit(1); 2469 } 2470 2471 return (sock); 2472 } 2473 2474 /* 2475 * Table of commands and their expected size; used by loopback_cmd(). 2476 */ 2477 static struct { 2478 const char *name; 2479 unsigned int size; 2480 } commands[] = { 2481 { "MI_PING", sizeof (uint32_t) }, 2482 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2483 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2484 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2485 { "MI_QUERY", sizeof (mi_query_t) } 2486 }; 2487 2488 /* 2489 * Commands received over the loopback interface come here. Currently 2490 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2491 * module. ifconfig only makes a connection, and closes it to check if 2492 * in.mpathd is running. 2493 * if_mpadm sends commands in the format specified by the mpathd_interface 2494 * structure. 
2495 */ 2496 static void 2497 loopback_cmd(int sock, int family) 2498 { 2499 int newfd; 2500 ssize_t len; 2501 struct sockaddr_storage peer; 2502 struct sockaddr_in *peer_sin; 2503 struct sockaddr_in6 *peer_sin6; 2504 socklen_t peerlen; 2505 union mi_commands mpi; 2506 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2507 char abuf[INET6_ADDRSTRLEN]; 2508 uint_t cmd; 2509 int retval; 2510 2511 peerlen = sizeof (peer); 2512 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2513 if (newfd < 0) { 2514 logperror("loopback_cmd: accept"); 2515 return; 2516 } 2517 2518 switch (family) { 2519 case AF_INET: 2520 /* 2521 * Validate the address and port to make sure that 2522 * non privileged processes don't connect and start 2523 * talking to us. 2524 */ 2525 if (peerlen != sizeof (struct sockaddr_in)) { 2526 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2527 (void) close(newfd); 2528 return; 2529 } 2530 peer_sin = (struct sockaddr_in *)&peer; 2531 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2532 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2533 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2534 abuf, sizeof (abuf)); 2535 logerr("Attempt to connect from addr %s port %d\n", 2536 abuf, ntohs(peer_sin->sin_port)); 2537 (void) close(newfd); 2538 return; 2539 } 2540 break; 2541 2542 case AF_INET6: 2543 if (peerlen != sizeof (struct sockaddr_in6)) { 2544 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2545 (void) close(newfd); 2546 return; 2547 } 2548 /* 2549 * Validate the address and port to make sure that 2550 * non privileged processes don't connect and start 2551 * talking to us. 
2552 */ 2553 peer_sin6 = (struct sockaddr_in6 *)&peer; 2554 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2555 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2556 &loopback_addr))) { 2557 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2558 sizeof (abuf)); 2559 logerr("Attempt to connect from addr %s port %d\n", 2560 abuf, ntohs(peer_sin6->sin6_port)); 2561 (void) close(newfd); 2562 return; 2563 } 2564 2565 default: 2566 logdebug("loopback_cmd: family %d\n", family); 2567 (void) close(newfd); 2568 return; 2569 } 2570 2571 /* 2572 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2573 * all supported commands 2574 */ 2575 len = read(newfd, &mpi, sizeof (mpi)); 2576 2577 /* 2578 * ifconfig does not send any data. Just tests to see if mpathd 2579 * is already running. 2580 */ 2581 if (len <= 0) { 2582 (void) close(newfd); 2583 return; 2584 } 2585 2586 /* 2587 * In theory, we can receive any sized message for a stream socket, 2588 * but we don't expect that to happen for a small message over a 2589 * loopback connection. 
2590 */ 2591 if (len < sizeof (uint32_t)) { 2592 logerr("loopback_cmd: bad command format or read returns " 2593 "partial data %d\n", len); 2594 } 2595 2596 cmd = mpi.mi_command; 2597 if (cmd >= MI_NCMD) { 2598 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2599 (void) close(newfd); 2600 return; 2601 } 2602 2603 if (len < commands[cmd].size) { 2604 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2605 commands[cmd].name, commands[cmd].size, len); 2606 (void) close(newfd); 2607 return; 2608 } 2609 2610 retval = process_cmd(newfd, &mpi); 2611 if (retval != IPMP_SUCCESS) { 2612 logerr("failed processing %s: %s\n", commands[cmd].name, 2613 ipmp_errmsg(retval)); 2614 } 2615 (void) close(newfd); 2616 } 2617 2618 extern int global_errno; /* set by failover() or failback() */ 2619 2620 /* 2621 * Process the offline, undo offline and set original index commands, 2622 * received from if_mpadm(1M) 2623 */ 2624 static unsigned int 2625 process_cmd(int newfd, union mi_commands *mpi) 2626 { 2627 uint_t nif = 0; 2628 uint32_t cmd; 2629 struct phyint *pi; 2630 struct phyint *pi2; 2631 struct phyint_group *pg; 2632 boolean_t success; 2633 int error; 2634 struct mi_offline *mio; 2635 struct mi_undo_offline *miu; 2636 struct lifreq lifr; 2637 int ifsock; 2638 struct mi_setoindex *mis; 2639 2640 cmd = mpi->mi_command; 2641 2642 switch (cmd) { 2643 case MI_OFFLINE: 2644 mio = &mpi->mi_ocmd; 2645 /* 2646 * Lookup the interface that needs to be offlined. 2647 * If it does not exist, return a suitable error. 2648 */ 2649 pi = phyint_lookup(mio->mio_ifname); 2650 if (pi == NULL) 2651 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2652 2653 /* 2654 * Verify that the minimum redundancy requirements are met. 2655 * The multipathing group must have at least the specified 2656 * number of functional interfaces after offlining the 2657 * requested interface. Otherwise return a suitable error. 
2658 */ 2659 pg = pi->pi_group; 2660 nif = 0; 2661 if (pg != phyint_anongroup) { 2662 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2663 pi2 = pi2->pi_pgnext) { 2664 if ((pi2->pi_state == PI_RUNNING) || 2665 (pg->pg_groupfailed && 2666 !(pi2->pi_flags & IFF_OFFLINE))) 2667 nif++; 2668 } 2669 } 2670 if (nif < mio->mio_min_redundancy) 2671 return (send_result(newfd, IPMP_EMINRED, 0)); 2672 2673 /* 2674 * The order of operation is to set IFF_OFFLINE, followed by 2675 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2676 * can be created. Subsequent failover moves everything on 2677 * the OFFLINE interface to some other functional interface. 2678 */ 2679 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2680 if (success) { 2681 if (!pi->pi_empty) { 2682 error = try_failover(pi, FAILOVER_NORMAL); 2683 if (error != 0) { 2684 if (!change_lif_flags(pi, IFF_OFFLINE, 2685 _B_FALSE)) { 2686 logerr("process_cmd: couldn't" 2687 " clear OFFLINE flag on" 2688 " %s\n", pi->pi_name); 2689 /* 2690 * Offline interfaces should 2691 * not be probed. 2692 */ 2693 stop_probing(pi); 2694 } 2695 return (send_result(newfd, error, 2696 global_errno)); 2697 } 2698 } 2699 } else { 2700 return (send_result(newfd, IPMP_FAILURE, errno)); 2701 } 2702 2703 /* 2704 * The interface is now Offline, so stop probing it. 2705 * Note that if_mpadm(1M) will down the test addresses, 2706 * after receiving a success reply from us. The routing 2707 * socket message will then make us close the socket used 2708 * for sending probes. But it is more logical that an 2709 * offlined interface must not be probed, even if it has 2710 * test addresses. 2711 */ 2712 stop_probing(pi); 2713 return (send_result(newfd, IPMP_SUCCESS, 0)); 2714 2715 case MI_UNDO_OFFLINE: 2716 miu = &mpi->mi_ucmd; 2717 /* 2718 * Undo the offline command. As usual lookup the interface. 2719 * Send an error if it does not exist. 
2720 */ 2721 pi = phyint_lookup(miu->miu_ifname); 2722 if (pi == NULL) 2723 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2724 2725 /* 2726 * Inverse of the offline operation. Do a failback, and then 2727 * clear the IFF_OFFLINE flag. 2728 */ 2729 error = do_failback(pi, _B_TRUE); 2730 if (error == IPMP_EFBPARTIAL) 2731 return (send_result(newfd, IPMP_EFBPARTIAL, 0)); 2732 error = do_failback(pi, _B_FALSE); 2733 2734 switch (error) { 2735 case IPMP_SUCCESS: 2736 if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) { 2737 logdebug("undo error %X\n", global_errno); 2738 error = IPMP_FAILURE; 2739 break; 2740 } 2741 /* FALLTHROUGH */ 2742 2743 case IPMP_EFBPARTIAL: 2744 /* 2745 * Reset the state of the interface based on the 2746 * current link state; if this phyint subsequently 2747 * acquires a test address, the state will be changed 2748 * again later as a result of the probes. 2749 */ 2750 if (LINK_UP(pi)) 2751 phyint_chstate(pi, PI_RUNNING); 2752 else 2753 phyint_chstate(pi, PI_FAILED); 2754 break; 2755 2756 case IPMP_FAILURE: 2757 break; 2758 2759 default: 2760 logdebug("do_failback: unexpected return value\n"); 2761 break; 2762 } 2763 return (send_result(newfd, error, global_errno)); 2764 2765 case MI_SETOINDEX: 2766 mis = &mpi->mi_scmd; 2767 2768 /* Get the socket for doing ioctls */ 2769 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2770 2771 /* 2772 * Get index of new original interface. 2773 * The index is returned in lifr.lifr_index. 2774 */ 2775 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2776 sizeof (lifr.lifr_name)); 2777 2778 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2779 return (send_result(newfd, IPMP_FAILURE, errno)); 2780 2781 /* 2782 * Set new original interface index. 2783 * The new index was put into lifr.lifr_index by the 2784 * SIOCGLIFINDEX ioctl. 
2785 */ 2786 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2787 sizeof (lifr.lifr_name)); 2788 2789 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2790 return (send_result(newfd, IPMP_FAILURE, errno)); 2791 2792 return (send_result(newfd, IPMP_SUCCESS, 0)); 2793 2794 case MI_QUERY: 2795 return (process_query(newfd, &mpi->mi_qcmd)); 2796 2797 default: 2798 break; 2799 } 2800 2801 return (send_result(newfd, IPMP_EPROTO, 0)); 2802 } 2803 2804 /* 2805 * Process the query request pointed to by `miq' and send a reply on file 2806 * descriptor `fd'. Returns an IPMP error code. 2807 */ 2808 static unsigned int 2809 process_query(int fd, mi_query_t *miq) 2810 { 2811 ipmp_groupinfo_t *grinfop; 2812 ipmp_groupinfolist_t *grlp; 2813 ipmp_grouplist_t *grlistp; 2814 ipmp_ifinfo_t *ifinfop; 2815 ipmp_ifinfolist_t *iflp; 2816 ipmp_snap_t *snap; 2817 unsigned int retval; 2818 2819 switch (miq->miq_inforeq) { 2820 case IPMP_GROUPLIST: 2821 retval = getgrouplist(&grlistp); 2822 if (retval != IPMP_SUCCESS) 2823 return (send_result(fd, retval, errno)); 2824 2825 retval = send_result(fd, IPMP_SUCCESS, 0); 2826 if (retval == IPMP_SUCCESS) 2827 retval = send_grouplist(fd, grlistp); 2828 2829 ipmp_freegrouplist(grlistp); 2830 return (retval); 2831 2832 case IPMP_GROUPINFO: 2833 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2834 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2835 if (retval != IPMP_SUCCESS) 2836 return (send_result(fd, retval, errno)); 2837 2838 retval = send_result(fd, IPMP_SUCCESS, 0); 2839 if (retval == IPMP_SUCCESS) 2840 retval = send_groupinfo(fd, grinfop); 2841 2842 ipmp_freegroupinfo(grinfop); 2843 return (retval); 2844 2845 case IPMP_IFINFO: 2846 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2847 retval = getifinfo(miq->miq_ifname, &ifinfop); 2848 if (retval != IPMP_SUCCESS) 2849 return (send_result(fd, retval, errno)); 2850 2851 retval = send_result(fd, IPMP_SUCCESS, 0); 2852 if (retval == IPMP_SUCCESS) 2853 retval = send_ifinfo(fd, ifinfop); 2854 2855 
ipmp_freeifinfo(ifinfop); 2856 return (retval); 2857 2858 case IPMP_SNAP: 2859 retval = getsnap(&snap); 2860 if (retval != IPMP_SUCCESS) 2861 return (send_result(fd, retval, errno)); 2862 2863 retval = send_result(fd, IPMP_SUCCESS, 0); 2864 if (retval != IPMP_SUCCESS) 2865 goto out; 2866 2867 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2868 if (retval != IPMP_SUCCESS) 2869 goto out; 2870 2871 retval = send_grouplist(fd, snap->sn_grlistp); 2872 if (retval != IPMP_SUCCESS) 2873 goto out; 2874 2875 iflp = snap->sn_ifinfolistp; 2876 for (; iflp != NULL; iflp = iflp->ifl_next) { 2877 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2878 if (retval != IPMP_SUCCESS) 2879 goto out; 2880 } 2881 2882 grlp = snap->sn_grinfolistp; 2883 for (; grlp != NULL; grlp = grlp->grl_next) { 2884 retval = send_groupinfo(fd, grlp->grl_grinfop); 2885 if (retval != IPMP_SUCCESS) 2886 goto out; 2887 } 2888 out: 2889 ipmp_snap_free(snap); 2890 return (retval); 2891 2892 default: 2893 break; 2894 2895 } 2896 return (send_result(fd, IPMP_EPROTO, 0)); 2897 } 2898 2899 /* 2900 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2901 * Returns an IPMP error code. 2902 */ 2903 static unsigned int 2904 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2905 { 2906 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2907 unsigned int retval; 2908 2909 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2910 if (retval != IPMP_SUCCESS) 2911 return (retval); 2912 2913 return (ipmp_writetlv(fd, IPMP_IFLIST, 2914 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2915 } 2916 2917 /* 2918 * Send the interface information pointed to by `ifinfop' on file descriptor 2919 * `fd'. Returns an IPMP error code. 2920 */ 2921 static unsigned int 2922 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2923 { 2924 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2925 } 2926 2927 /* 2928 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 
2929 * Returns an IPMP error code. 2930 */ 2931 static unsigned int 2932 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2933 { 2934 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2935 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2936 } 2937 2938 /* 2939 * Initialize an mi_result_t structure using `error' and `syserror' and 2940 * send it on file descriptor `fd'. Returns an IPMP error code. 2941 */ 2942 static unsigned int 2943 send_result(int fd, unsigned int error, int syserror) 2944 { 2945 mi_result_t me; 2946 2947 me.me_mpathd_error = error; 2948 if (error == IPMP_FAILURE) 2949 me.me_sys_error = syserror; 2950 else 2951 me.me_sys_error = 0; 2952 2953 return (ipmp_write(fd, &me, sizeof (me))); 2954 } 2955 2956 /* 2957 * Daemonize the process. 2958 */ 2959 static boolean_t 2960 daemonize(void) 2961 { 2962 switch (fork()) { 2963 case -1: 2964 return (_B_FALSE); 2965 2966 case 0: 2967 /* 2968 * Lose our controlling terminal, and become both a session 2969 * leader and a process group leader. 2970 */ 2971 if (setsid() == -1) 2972 return (_B_FALSE); 2973 2974 /* 2975 * Under POSIX, a session leader can accidentally (through 2976 * open(2)) acquire a controlling terminal if it does not 2977 * have one. Just to be safe, fork() again so we are not a 2978 * session leader. 2979 */ 2980 switch (fork()) { 2981 case -1: 2982 return (_B_FALSE); 2983 2984 case 0: 2985 (void) chdir("/"); 2986 (void) umask(022); 2987 (void) fdwalk(closefunc, NULL); 2988 break; 2989 2990 default: 2991 _exit(EXIT_SUCCESS); 2992 } 2993 break; 2994 2995 default: 2996 _exit(EXIT_SUCCESS); 2997 } 2998 2999 return (_B_TRUE); 3000 } 3001 3002 /* 3003 * The parent has created some fds before forking on purpose, keep them open. 3004 */ 3005 static int 3006 closefunc(void *not_used, int fd) 3007 /* ARGSUSED */ 3008 { 3009 if (fd != lsock_v4 && fd != lsock_v6) 3010 (void) close(fd); 3011 return (0); 3012 } 3013 3014 /* LOGGER */ 3015 3016 #include <syslog.h> 3017 3018 /* 3019 * Logging routines. 
All routines log to syslog, unless the daemon is 3020 * running in the foreground, in which case the logging goes to stderr. 3021 * 3022 * The following routines are available: 3023 * 3024 * logdebug(): A printf-like function for outputting debug messages 3025 * (messages at LOG_DEBUG) that are only of use to developers. 3026 * 3027 * logtrace(): A printf-like function for outputting tracing messages 3028 * (messages at LOG_INFO) from the daemon. This is typically used 3029 * to log the receipt of interesting network-related conditions. 3030 * 3031 * logerr(): A printf-like function for outputting error messages 3032 * (messages at LOG_ERR) from the daemon. 3033 * 3034 * logperror*(): A set of functions used to output error messages 3035 * (messages at LOG_ERR); these automatically append strerror(errno) 3036 * and a newline to the message passed to them. 3037 * 3038 * NOTE: since the logging functions write to syslog, the messages passed 3039 * to them are not eligible for localization. Thus, gettext() must 3040 * *not* be used. 3041 */ 3042 3043 static int logging = 0; 3044 3045 static void 3046 initlog(void) 3047 { 3048 logging++; 3049 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3050 } 3051 3052 /* PRINTFLIKE1 */ 3053 void 3054 logerr(char *fmt, ...) 3055 { 3056 va_list ap; 3057 3058 va_start(ap, fmt); 3059 3060 if (logging) 3061 vsyslog(LOG_ERR, fmt, ap); 3062 else 3063 (void) vfprintf(stderr, fmt, ap); 3064 va_end(ap); 3065 } 3066 3067 /* PRINTFLIKE1 */ 3068 void 3069 logtrace(char *fmt, ...) 3070 { 3071 va_list ap; 3072 3073 va_start(ap, fmt); 3074 3075 if (logging) 3076 vsyslog(LOG_INFO, fmt, ap); 3077 else 3078 (void) vfprintf(stderr, fmt, ap); 3079 va_end(ap); 3080 } 3081 3082 /* PRINTFLIKE1 */ 3083 void 3084 logdebug(char *fmt, ...) 
3085 { 3086 va_list ap; 3087 3088 va_start(ap, fmt); 3089 3090 if (logging) 3091 vsyslog(LOG_DEBUG, fmt, ap); 3092 else 3093 (void) vfprintf(stderr, fmt, ap); 3094 va_end(ap); 3095 } 3096 3097 /* PRINTFLIKE1 */ 3098 void 3099 logperror(char *str) 3100 { 3101 if (logging) 3102 syslog(LOG_ERR, "%s: %m\n", str); 3103 else 3104 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3105 } 3106 3107 void 3108 logperror_pii(struct phyint_instance *pii, char *str) 3109 { 3110 if (logging) { 3111 syslog(LOG_ERR, "%s (%s %s): %m\n", 3112 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3113 } else { 3114 (void) fprintf(stderr, "%s (%s %s): %s\n", 3115 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3116 strerror(errno)); 3117 } 3118 } 3119 3120 void 3121 logperror_li(struct logint *li, char *str) 3122 { 3123 struct phyint_instance *pii = li->li_phyint_inst; 3124 3125 if (logging) { 3126 syslog(LOG_ERR, "%s (%s %s): %m\n", 3127 str, AF_STR(pii->pii_af), li->li_name); 3128 } else { 3129 (void) fprintf(stderr, "%s (%s %s): %s\n", 3130 str, AF_STR(pii->pii_af), li->li_name, 3131 strerror(errno)); 3132 } 3133 } 3134 3135 void 3136 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3137 { 3138 if (polled) 3139 (void) poll_remove(pii->pii_probe_sock); 3140 (void) close(pii->pii_probe_sock); 3141 pii->pii_probe_sock = -1; 3142 pii->pii_basetime_inited = 0; 3143 } 3144