1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "mpd_defs.h" 29 #include "mpd_tables.h" 30 31 int debug = 0; /* Debug flag */ 32 static int pollfd_num = 0; /* Num. of poll descriptors */ 33 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 34 35 /* All times below in ms */ 36 int user_failure_detection_time; /* user specified failure detection */ 37 /* time (fdt) */ 38 int user_probe_interval; /* derived from user specified fdt */ 39 40 static int rtsock_v4; /* AF_INET routing socket */ 41 static int rtsock_v6; /* AF_INET6 routing socket */ 42 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 43 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 44 static int lsock_v4; /* Listen socket to detect mpathd */ 45 static int lsock_v6; /* Listen socket to detect mpathd */ 46 static int mibfd = -1; /* fd to get mib info */ 47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 48 49 boolean_t full_scan_required = _B_FALSE; 50 static uint_t last_initifs_time; /* Time when initifs was last run */ 51 static char **argv0; /* Saved for re-exec on SIGHUP */ 52 boolean_t handle_link_notifications = _B_TRUE; 53 54 static void initlog(void); 55 static void run_timeouts(void); 56 static void initifs(void); 57 static void check_if_removed(struct phyint_instance *pii); 58 static void select_test_ifs(void); 59 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 60 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 61 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 62 struct in_addr nexthop_v4); 63 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 64 struct in6_addr nexthop_v6); 65 static void router_add_common(int af, char *ifname, 66 struct in6_addr nexthop); 67 static void init_router_targets(); 68 static void cleanup(void); 69 static int setup_listener(int af); 70 static void check_config(void); 71 static void check_addr_unique(struct phyint_instance *, 72 struct sockaddr_storage *); 73 static void init_host_targets(void); 74 static void dup_host_targets(struct phyint_instance *desired_pii); 75 static void loopback_cmd(int sock, int family); 76 static int poll_remove(int fd); 77 static boolean_t daemonize(void); 78 static int closefunc(void *, int); 79 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 80 static unsigned int process_query(int fd, mi_query_t *miq); 81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 84 static unsigned int send_result(int fd, unsigned int error, int syserror); 85 86 struct local_addr *laddr_list = NULL; 87 88 /* 89 * Return the current time in milliseconds (from an arbitrary reference) 90 * truncated to fit into an int. Truncation is ok since we are interested 91 * only in differences and not the absolute values. 92 */ 93 uint_t 94 getcurrenttime(void) 95 { 96 uint_t cur_time; /* In ms */ 97 98 /* 99 * Use of a non-user-adjustable source of time is 100 * required. However millisecond precision is sufficient. 101 * divide by 10^6 102 */ 103 cur_time = (uint_t)(gethrtime() / 1000000LL); 104 return (cur_time); 105 } 106 107 /* 108 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 109 */ 110 int 111 poll_add(int fd) 112 { 113 int i; 114 int new_num; 115 struct pollfd *newfds; 116 retry: 117 /* Check if already present */ 118 for (i = 0; i < pollfd_num; i++) { 119 if (pollfds[i].fd == fd) 120 return (0); 121 } 122 /* Check for empty spot already present */ 123 for (i = 0; i < pollfd_num; i++) { 124 if (pollfds[i].fd == -1) { 125 pollfds[i].fd = fd; 126 return (0); 127 } 128 } 129 130 /* Allocate space for 32 more fds and initialize to -1 */ 131 new_num = pollfd_num + 32; 132 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 133 if (newfds == NULL) { 134 logperror("poll_add: realloc"); 135 return (-1); 136 } 137 for (i = pollfd_num; i < new_num; i++) { 138 newfds[i].fd = -1; 139 newfds[i].events = POLLIN; 140 } 141 pollfd_num = new_num; 142 pollfds = newfds; 143 goto retry; 144 } 145 146 /* 147 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 148 */ 149 static int 150 poll_remove(int fd) 151 { 152 int i; 153 154 /* Check if already present */ 155 for (i = 0; i < pollfd_num; i++) { 156 if (pollfds[i].fd == fd) { 157 pollfds[i].fd = -1; 158 return (0); 159 } 160 } 161 return (-1); 162 } 163 164 /* 165 * Extract information about the phyint instance. If the phyint instance still 166 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 167 * will use it to detect phyint instances that don't exist any longer and 168 * remove them, from our database of phyint instances. 169 * Return value: 170 * returns true if the phyint instance exists in the kernel, 171 * returns false otherwise 172 */ 173 static boolean_t 174 pii_process(int af, char *name, struct phyint_instance **pii_p) 175 { 176 int err; 177 struct phyint_instance *pii; 178 struct phyint_instance *pii_other; 179 180 if (debug & D_PHYINT) 181 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 182 183 pii = phyint_inst_lookup(af, name); 184 if (pii == NULL) { 185 /* 186 * Phyint instance does not exist in our tables, 187 * create new phyint instance 188 */ 189 pii = phyint_inst_init_from_k(af, name); 190 } else { 191 /* Phyint exists in our tables */ 192 err = phyint_inst_update_from_k(pii); 193 194 switch (err) { 195 case PI_IOCTL_ERROR: 196 /* Some ioctl error. don't change anything */ 197 pii->pii_in_use = 1; 198 break; 199 200 case PI_GROUP_CHANGED: 201 /* 202 * The phyint has changed group. 203 */ 204 restore_phyint(pii->pii_phyint); 205 /* FALLTHRU */ 206 207 case PI_IFINDEX_CHANGED: 208 /* 209 * Interface index has changed. Delete and 210 * recreate the phyint as it is quite likely 211 * the interface has been unplumbed and replumbed. 212 */ 213 pii_other = phyint_inst_other(pii); 214 if (pii_other != NULL) 215 phyint_inst_delete(pii_other); 216 phyint_inst_delete(pii); 217 pii = phyint_inst_init_from_k(af, name); 218 break; 219 220 case PI_DELETED: 221 /* Phyint instance has disappeared from kernel */ 222 pii->pii_in_use = 0; 223 break; 224 225 case PI_OK: 226 /* Phyint instance exists and is fine */ 227 pii->pii_in_use = 1; 228 break; 229 230 default: 231 /* Unknown status */ 232 logerr("pii_process: Unknown status %d\n", err); 233 break; 234 } 235 } 236 237 *pii_p = pii; 238 if (pii != NULL) 239 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 240 else 241 return (_B_FALSE); 242 } 243 244 /* 245 * This phyint is leaving the group. Try to restore the phyint to its 246 * initial state. Return the addresses that belong to other group members, 247 * to the group, and take back any addresses owned by this phyint 248 */ 249 void 250 restore_phyint(struct phyint *pi) 251 { 252 if (pi->pi_group == phyint_anongroup) 253 return; 254 255 /* 256 * Move everthing to some other member in the group. 257 * The phyint has changed group in the kernel. But we 258 * have yet to do it in our tables. 259 */ 260 if (!pi->pi_empty) 261 (void) try_failover(pi, FAILOVER_TO_ANY); 262 /* 263 * Move all addresses owned by 'pi' back to pi, from each 264 * of the other members of the group 265 */ 266 (void) try_failback(pi); 267 } 268 269 /* 270 * Scan all interfaces to detect changes as well as new and deleted interfaces 271 */ 272 static void 273 initifs() 274 { 275 int n; 276 int af; 277 char *cp; 278 char *buf; 279 int numifs; 280 struct lifnum lifn; 281 struct lifconf lifc; 282 struct lifreq *lifr; 283 struct logint *li; 284 struct phyint_instance *pii; 285 struct phyint_instance *next_pii; 286 char pi_name[LIFNAMSIZ + 1]; 287 boolean_t exists; 288 struct phyint *pi; 289 struct local_addr *next; 290 291 if (debug & D_PHYINT) 292 logdebug("initifs: Scanning interfaces\n"); 293 294 last_initifs_time = getcurrenttime(); 295 296 /* 297 * Free the laddr_list before collecting the local addresses. 298 */ 299 while (laddr_list != NULL) { 300 next = laddr_list->next; 301 free(laddr_list); 302 laddr_list = next; 303 } 304 305 /* 306 * Mark the interfaces so that we can find phyints and logints 307 * which have disappeared from the kernel. pii_process() and 308 * logint_init_from_k() will set {pii,li}_in_use when they find 309 * the interface in the kernel. Also, clear dupaddr bit on probe 310 * logint. check_addr_unique() will set the dupaddr bit on the 311 * probe logint, if the testaddress is not unique. 312 */ 313 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 314 pii->pii_in_use = 0; 315 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 316 li->li_in_use = 0; 317 if (pii->pii_probe_logint == li) 318 li->li_dupaddr = 0; 319 } 320 } 321 322 lifn.lifn_family = AF_UNSPEC; 323 lifn.lifn_flags = LIFC_ALLZONES; 324 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 325 logperror("initifs: ioctl (get interface numbers)"); 326 return; 327 } 328 numifs = lifn.lifn_count; 329 330 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 331 if (buf == NULL) { 332 logperror("initifs: calloc"); 333 return; 334 } 335 336 lifc.lifc_family = AF_UNSPEC; 337 lifc.lifc_flags = LIFC_ALLZONES; 338 lifc.lifc_len = numifs * sizeof (struct lifreq); 339 lifc.lifc_buf = buf; 340 341 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 342 /* 343 * EINVAL is commonly encountered, when things change 344 * underneath us rapidly, (eg. at boot, when new interfaces 345 * are plumbed successively) and the kernel finds the buffer 346 * size we passed as too small. We will retry again 347 * when we see the next routing socket msg, or at worst after 348 * IF_SCAN_INTERVAL ms. 349 */ 350 if (errno != EINVAL) { 351 logperror("initifs: ioctl" 352 " (get interface configuration)"); 353 } 354 free(buf); 355 return; 356 } 357 358 lifr = (struct lifreq *)lifc.lifc_req; 359 360 /* 361 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 362 * and get the state of the corresponding phyint_instance. If it is 363 * successful, then call logint_init_from_k() to get the state of the 364 * logint. 365 */ 366 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 367 int sockfd; 368 struct local_addr *taddr; 369 struct sockaddr_in *sin; 370 struct sockaddr_in6 *sin6; 371 struct lifreq lifreq; 372 373 af = lifr->lifr_addr.ss_family; 374 375 /* 376 * Collect all local addresses. 377 */ 378 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 379 (void) memset(&lifreq, 0, sizeof (lifreq)); 380 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, 381 sizeof (lifreq.lifr_name)); 382 383 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 384 if (errno != ENXIO) 385 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 386 continue; 387 } 388 389 /* 390 * Add the interface address to laddr_list. 391 * Another node might have the same IP address which is up. 392 * In that case, it is appropriate to use the address as a 393 * target, even though it is also configured (but not up) on 394 * the local system. 395 * Hence,the interface address is not added to laddr_list 396 * unless it is IFF_UP. 397 */ 398 if (lifreq.lifr_flags & IFF_UP) { 399 taddr = malloc(sizeof (struct local_addr)); 400 if (taddr == NULL) { 401 logperror("initifs: malloc"); 402 continue; 403 } 404 if (af == AF_INET) { 405 sin = (struct sockaddr_in *)&lifr->lifr_addr; 406 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 407 &taddr->addr); 408 } else { 409 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; 410 taddr->addr = sin6->sin6_addr; 411 } 412 taddr->next = laddr_list; 413 laddr_list = taddr; 414 } 415 416 /* 417 * Need to pass a phyint name to pii_process. Insert the 418 * null where the ':' IF_SEPARATOR is found in the logical 419 * name. 420 */ 421 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 422 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 423 *cp = '\0'; 424 425 exists = pii_process(af, pi_name, &pii); 426 if (exists) { 427 /* The phyint is fine. So process the logint */ 428 logint_init_from_k(pii, lifr->lifr_name); 429 check_addr_unique(pii, &lifr->lifr_addr); 430 } 431 432 } 433 434 free(buf); 435 436 /* 437 * If the test address is now unique, and if it was not unique 438 * previously, clear the li_dupaddrmsg_printed flag and log a 439 * recovery message 440 */ 441 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 442 struct logint *li; 443 char abuf[INET6_ADDRSTRLEN]; 444 445 li = pii->pii_probe_logint; 446 if ((li != NULL) && !li->li_dupaddr && 447 li->li_dupaddrmsg_printed) { 448 logerr("Test address %s is unique in group; enabling " 449 "probe-based failure detection on %s\n", 450 pr_addr(pii->pii_af, li->li_addr, abuf, 451 sizeof (abuf)), pii->pii_phyint->pi_name); 452 li->li_dupaddrmsg_printed = 0; 453 } 454 } 455 456 /* 457 * Scan for phyints and logints that have disappeared from the 458 * kernel, and delete them. 459 */ 460 pii = phyint_instances; 461 462 while (pii != NULL) { 463 next_pii = pii->pii_next; 464 check_if_removed(pii); 465 pii = next_pii; 466 } 467 468 /* 469 * Select a test address for sending probes on each phyint instance 470 */ 471 select_test_ifs(); 472 473 /* 474 * Handle link up/down notifications from the NICs. 475 */ 476 process_link_state_changes(); 477 478 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 479 /* 480 * If this is a case of group failure, we don't have much 481 * to do until the group recovers again. 482 */ 483 if (GROUP_FAILED(pi->pi_group)) 484 continue; 485 486 /* 487 * Try/Retry any pending failovers / failbacks, that did not 488 * not complete, or that could not be initiated previously. 489 * This implements the 3 invariants described in the big block 490 * comment at the beginning of probe.c 491 */ 492 if (pi->pi_flags & IFF_INACTIVE) { 493 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 494 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 495 } else { 496 struct phyint_instance *pii; 497 498 /* 499 * Skip interfaces which are not capable of probing, 500 * and interfaces that have downed links (as we will 501 * not get any response). 502 */ 503 if (LINK_DOWN(pi)) 504 continue; 505 506 pii = pi->pi_v4; 507 if (!PROBE_CAPABLE(pii)) { 508 pii = pi->pi_v6; 509 if (!PROBE_CAPABLE(pii)) 510 continue; 511 } 512 513 /* 514 * It is possible that the phyint has started 515 * receiving packets, after it has been marked 516 * PI_FAILED. Don't initiate failover, if the 517 * phyint has started recovering. failure_state() 518 * captures this check. A similar logic is used 519 * for failback/repair case. 520 */ 521 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 522 (failure_state(pii) == PHYINT_FAILURE)) { 523 (void) try_failover(pi, FAILOVER_NORMAL); 524 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 525 if (try_failback(pi) != IPMP_FAILURE) { 526 (void) change_lif_flags(pi, IFF_FAILED, 527 _B_FALSE); 528 /* Per state diagram */ 529 pi->pi_empty = 0; 530 } 531 } 532 } 533 } 534 } 535 536 /* 537 * Check that a given test address is unique across all of the interfaces in a 538 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 539 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 540 * Log an error and alert the user. 541 */ 542 static void 543 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 544 { 545 struct phyint *pi; 546 struct phyint_group *pg; 547 struct in6_addr addr; 548 struct phyint_instance *pii; 549 struct sockaddr_in *sin; 550 char abuf[INET6_ADDRSTRLEN]; 551 552 if (ss->ss_family == AF_INET) { 553 sin = (struct sockaddr_in *)ss; 554 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 555 } else { 556 assert(ss->ss_family == AF_INET6); 557 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 558 } 559 560 /* 561 * For anonymous groups, every interface is assumed to be on its own 562 * link, so there is no chance of overlapping addresses. 563 */ 564 pg = ourpii->pii_phyint->pi_group; 565 if (pg == phyint_anongroup) 566 return; 567 568 /* 569 * Walk the list of phyint instances in the group and check for test 570 * addresses matching ours. Of course, we skip ourself. 571 */ 572 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 573 pii = PHYINT_INSTANCE(pi, ss->ss_family); 574 if (pii == NULL || pii == ourpii || 575 pii->pii_probe_logint == NULL) 576 continue; 577 578 if (!IN6_ARE_ADDR_EQUAL(&addr, 579 &pii->pii_probe_logint->li_addr)) { 580 continue; 581 } 582 583 /* 584 * This test address is not unique. Set the dupaddr bit 585 * and log an error message if not already logged. 586 */ 587 pii->pii_probe_logint->li_dupaddr = 1; 588 if (!pii->pii_probe_logint->li_dupaddrmsg_printed) { 589 logerr("Test address %s is not unique in group; " 590 "disabling probe-based failure detection on %s\n", 591 pr_addr(ss->ss_family, addr, abuf, sizeof (abuf)), 592 pii->pii_phyint->pi_name); 593 pii->pii_probe_logint->li_dupaddrmsg_printed = 1; 594 } 595 } 596 } 597 598 /* 599 * Stop probing an interface. Called when an interface is offlined. 600 * The probe socket is closed on each interface instance, and the 601 * interface state set to PI_OFFLINE. 602 */ 603 static void 604 stop_probing(struct phyint *pi) 605 { 606 struct phyint_instance *pii; 607 608 pii = pi->pi_v4; 609 if (pii != NULL) { 610 if (pii->pii_probe_sock != -1) 611 close_probe_socket(pii, _B_TRUE); 612 pii->pii_probe_logint = NULL; 613 } 614 615 pii = pi->pi_v6; 616 if (pii != NULL) { 617 if (pii->pii_probe_sock != -1) 618 close_probe_socket(pii, _B_TRUE); 619 pii->pii_probe_logint = NULL; 620 } 621 622 phyint_chstate(pi, PI_OFFLINE); 623 } 624 625 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 626 627 /* 628 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 629 * IFF_UP must also be set so that the associated address can be used as a 630 * source address. Further, we must be able to exchange packets with local 631 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 632 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 633 */ 634 static int 635 rate_testflags(uint64_t flags) 636 { 637 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 638 return (BAD_TESTFLAGS); 639 640 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 641 return (BAD_TESTFLAGS); 642 643 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 644 return (BEST_TESTFLAGS); 645 646 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 647 return (BEST_TESTFLAGS); 648 649 return (OK_TESTFLAGS); 650 } 651 652 /* 653 * Attempt to select a test address for each phyint instance. 654 * Call phyint_inst_sockinit() to complete the initializations. 655 */ 656 static void 657 select_test_ifs(void) 658 { 659 struct phyint *pi; 660 struct phyint_instance *pii; 661 struct phyint_instance *next_pii; 662 struct logint *li; 663 struct logint *probe_logint; 664 boolean_t target_scan_reqd = _B_FALSE; 665 struct target *tg; 666 int rating; 667 668 if (debug & D_PHYINT) 669 logdebug("select_test_ifs\n"); 670 671 /* 672 * For each phyint instance, do the test address selection 673 */ 674 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 675 next_pii = pii->pii_next; 676 probe_logint = NULL; 677 678 /* 679 * An interface that is offline, should not be probed. 680 * Offline interfaces should always in PI_OFFLINE state, 681 * unless some other entity has set the offline flag. 682 */ 683 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 684 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 685 logerr("shouldn't be probing offline" 686 " interface %s (state is: %u)." 687 " Stopping probes.\n", 688 pii->pii_phyint->pi_name, 689 pii->pii_phyint->pi_state); 690 stop_probing(pii->pii_phyint); 691 } 692 continue; 693 } 694 695 li = pii->pii_probe_logint; 696 if (li != NULL) { 697 /* 698 * We've already got a test address; only proceed 699 * if it's suboptimal. 700 */ 701 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 702 continue; 703 } 704 705 /* 706 * Walk the logints of this phyint instance, and select 707 * the best available test address 708 */ 709 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 710 /* 711 * Skip 0.0.0.0 addresses, as those are never 712 * actually usable. 713 */ 714 if (pii->pii_af == AF_INET && 715 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 716 continue; 717 718 /* 719 * Skip any IPv6 logints that are not link-local, 720 * since we should always have a link-local address 721 * anyway and in6_data() expects link-local replies. 722 */ 723 if (pii->pii_af == AF_INET6 && 724 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 725 continue; 726 727 /* 728 * Rate the testflags. If we've found an optimal 729 * match, then break out; otherwise, record the most 730 * recent OK one. 731 */ 732 rating = rate_testflags(li->li_flags); 733 if (rating == BAD_TESTFLAGS) 734 continue; 735 736 probe_logint = li; 737 if (rating == BEST_TESTFLAGS) 738 break; 739 } 740 741 /* 742 * If the probe logint has changed, ditch the old one. 743 */ 744 if (pii->pii_probe_logint != NULL && 745 pii->pii_probe_logint != probe_logint) { 746 if (pii->pii_probe_sock != -1) 747 close_probe_socket(pii, _B_TRUE); 748 pii->pii_probe_logint = NULL; 749 } 750 751 if (probe_logint == NULL) { 752 /* 753 * We don't have a test address. Don't print an 754 * error message immediately. check_config() will 755 * take care of it. Zero out the probe stats array 756 * since it is no longer relevant. Optimize by 757 * checking if it is already zeroed out. 758 */ 759 int pr_ndx; 760 761 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 762 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 763 clear_pii_probe_stats(pii); 764 reset_crtt_all(pii->pii_phyint); 765 } 766 continue; 767 } else if (probe_logint == pii->pii_probe_logint) { 768 /* 769 * If we didn't find any new test addr, go to the 770 * next phyint. 771 */ 772 continue; 773 } 774 775 /* 776 * The phyint is either being assigned a new testaddr 777 * or is being assigned a testaddr for the 1st time. 778 * Need to initialize the phyint socket 779 */ 780 pii->pii_probe_logint = probe_logint; 781 if (!phyint_inst_sockinit(pii)) { 782 if (debug & D_PHYINT) { 783 logdebug("select_test_ifs: " 784 "phyint_sockinit failed\n"); 785 } 786 phyint_inst_delete(pii); 787 continue; 788 } 789 790 /* 791 * This phyint instance is now enabled for probes; this 792 * impacts our state machine in two ways: 793 * 794 * 1. If we're probe *capable* as well (i.e., we have 795 * probe targets) and the interface is in PI_NOTARGETS, 796 * then transition to PI_RUNNING. 797 * 798 * 2. If we're not probe capable, and the other phyint 799 * instance is also not probe capable, and we were in 800 * PI_RUNNING, then transition to PI_NOTARGETS. 801 * 802 * Also see the state diagram in mpd_probe.c. 803 */ 804 if (PROBE_CAPABLE(pii)) { 805 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 806 phyint_chstate(pii->pii_phyint, PI_RUNNING); 807 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 808 if (pii->pii_phyint->pi_state == PI_RUNNING) 809 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 810 } 811 812 if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { 813 tg = pii->pii_targets; 814 if (tg != NULL) 815 target_delete(tg); 816 assert(pii->pii_targets == NULL); 817 assert(pii->pii_target_next == NULL); 818 assert(pii->pii_ntargets == 0); 819 target_create(pii, probe_logint->li_dstaddr, 820 _B_TRUE); 821 } 822 823 /* 824 * If no targets are currently known for this phyint 825 * we need to call init_router_targets. Since 826 * init_router_targets() initializes the list of targets 827 * for all phyints it is done below the loop. 828 */ 829 if (pii->pii_targets == NULL) 830 target_scan_reqd = _B_TRUE; 831 832 /* 833 * Start the probe timer for this instance. 834 */ 835 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 836 start_timer(pii); 837 pii->pii_basetime_inited = 1; 838 } 839 } 840 841 /* 842 * Check the interface list for any interfaces that are marked 843 * PI_FAILED but no longer enabled to send probes, and call 844 * phyint_check_for_repair() to see if the link now indicates that the 845 * interface should be repaired. Also see the state diagram in 846 * mpd_probe.c. 847 */ 848 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 849 if (pi->pi_state == PI_FAILED && 850 !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 851 phyint_check_for_repair(pi); 852 } 853 } 854 855 /* 856 * Try to populate the target list. init_router_targets populates 857 * the target list from the routing table. If our target list is 858 * still empty, init_host_targets adds host targets based on the 859 * host target list of other phyints in the group. 860 */ 861 if (target_scan_reqd) { 862 init_router_targets(); 863 init_host_targets(); 864 } 865 } 866 867 /* 868 * Check phyint group configuration, to detect any inconsistencies, 869 * and log an error message. This is called from runtimeouts every 870 * 20 secs. But the error message is displayed once. If the 871 * consistency is resolved by the admin, a recovery message is displayed 872 * once. 873 */ 874 static void 875 check_config(void) 876 { 877 struct phyint_group *pg; 878 struct phyint *pi; 879 boolean_t v4_in_group; 880 boolean_t v6_in_group; 881 882 /* 883 * All phyints of a group must be homogenous to ensure that 884 * failover or failback can be done. If any phyint in a group 885 * has IPv4 plumbed, check that all phyints have IPv4 plumbed. 886 * Do a similar check for IPv6. 887 */ 888 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 889 if (pg == phyint_anongroup) 890 continue; 891 892 v4_in_group = _B_FALSE; 893 v6_in_group = _B_FALSE; 894 /* 895 * 1st pass. Determine if at least 1 phyint in the group 896 * has IPv4 plumbed and if so set v4_in_group to true. 897 * Repeat similarly for IPv6. 898 */ 899 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 900 if (pi->pi_v4 != NULL) 901 v4_in_group = _B_TRUE; 902 if (pi->pi_v6 != NULL) 903 v6_in_group = _B_TRUE; 904 } 905 906 /* 907 * 2nd pass. If v4_in_group is true, check that phyint 908 * has IPv4 plumbed. Repeat similarly for IPv6. Print 909 * out a message the 1st time only. 910 */ 911 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 912 if (pi->pi_flags & IFF_OFFLINE) 913 continue; 914 915 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 916 if (!pi->pi_cfgmsg_printed) { 917 logerr("NIC %s of group %s is" 918 " not plumbed for IPv4 and may" 919 " affect failover capability\n", 920 pi->pi_name, 921 pi->pi_group->pg_name); 922 pi->pi_cfgmsg_printed = 1; 923 } 924 } else if (v6_in_group == _B_TRUE && 925 pi->pi_v6 == NULL) { 926 if (!pi->pi_cfgmsg_printed) { 927 logerr("NIC %s of group %s is" 928 " not plumbed for IPv6 and may" 929 " affect failover capability\n", 930 pi->pi_name, 931 pi->pi_group->pg_name); 932 pi->pi_cfgmsg_printed = 1; 933 } 934 } else { 935 /* 936 * The phyint matches the group configuration, 937 * if we have reached this point. If it was 938 * improperly configured earlier, log an 939 * error recovery message 940 */ 941 if (pi->pi_cfgmsg_printed) { 942 logerr("NIC %s is now consistent with " 943 "group %s and failover capability " 944 "is restored\n", pi->pi_name, 945 pi->pi_group->pg_name); 946 pi->pi_cfgmsg_printed = 0; 947 } 948 } 949 950 } 951 } 952 953 /* 954 * In order to perform probe-based failure detection, a phyint must 955 * have at least 1 test/probe address for sending and receiving probes 956 * (either on IPv4 or IPv6 instance or both). If no test address has 957 * been configured, notify the administrator, but continue on since we 958 * can still perform load spreading, along with "link up/down" based 959 * failure detection. 960 */ 961 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 962 if (pi->pi_flags & IFF_OFFLINE) 963 continue; 964 965 if ((pi->pi_v4 == NULL || 966 pi->pi_v4->pii_probe_logint == NULL) && 967 (pi->pi_v6 == NULL || 968 pi->pi_v6->pii_probe_logint == NULL)) { 969 if (!pi->pi_taddrmsg_printed) { 970 logerr("No test address configured on " 971 "interface %s; disabling probe-based " 972 "failure detection on it\n", pi->pi_name); 973 pi->pi_taddrmsg_printed = 1; 974 } 975 } else if (pi->pi_taddrmsg_printed) { 976 logerr("Test address now configured on interface %s; " 977 "enabling probe-based failure detection on it\n", 978 pi->pi_name); 979 pi->pi_taddrmsg_printed = 0; 980 } 981 982 } 983 } 984 985 /* 986 * Timer mechanism using relative time (in milliseconds) from the 987 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 988 * will fire after TIMER_INFINITY milliseconds. 989 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 990 * time values. Hence 2 consecutive timer events cannot be spaced farther 991 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 992 * that can be passed for the delay parameter of timer_schedule() 993 */ 994 static uint_t timer_next; /* Currently scheduled timeout */ 995 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 996 997 static void 998 timer_init(void) 999 { 1000 timer_next = getcurrenttime() + TIMER_INFINITY; 1001 /* 1002 * The call to run_timeouts() will get the timer started 1003 * Since there are no phyints at this point, the timer will 1004 * be set for IF_SCAN_INTERVAL ms. 1005 */ 1006 run_timeouts(); 1007 } 1008 1009 /* 1010 * Make sure the next SIGALRM occurs delay milliseconds from the current 1011 * time if not earlier. We are interested only in time differences. 1012 */ 1013 void 1014 timer_schedule(uint_t delay) 1015 { 1016 uint_t now; 1017 struct itimerval itimerval; 1018 1019 if (debug & D_TIMER) 1020 logdebug("timer_schedule(%u)\n", delay); 1021 1022 assert(delay <= TIMER_INFINITY); 1023 1024 now = getcurrenttime(); 1025 if (delay == 0) { 1026 /* Minimum allowed delay */ 1027 delay = 1; 1028 } 1029 /* Will this timer occur before the currently scheduled SIGALRM? */ 1030 if (timer_active && TIME_GE(now + delay, timer_next)) { 1031 if (debug & D_TIMER) { 1032 logdebug("timer_schedule(%u) - no action: " 1033 "now %u next %u\n", delay, now, timer_next); 1034 } 1035 return; 1036 } 1037 timer_next = now + delay; 1038 1039 itimerval.it_value.tv_sec = delay / 1000; 1040 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1041 itimerval.it_interval.tv_sec = 0; 1042 itimerval.it_interval.tv_usec = 0; 1043 if (debug & D_TIMER) { 1044 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1045 delay, itimerval.it_value.tv_sec, 1046 itimerval.it_value.tv_usec); 1047 } 1048 timer_active = _B_TRUE; 1049 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1050 logperror("timer_schedule: setitimer"); 1051 exit(2); 1052 } 1053 } 1054 1055 /* 1056 * Timer has fired. Determine when the next timer event will occur by asking 1057 * all the timer routines. Should not be called from a timer routine. 1058 */ 1059 static void 1060 run_timeouts(void) 1061 { 1062 uint_t next; 1063 uint_t next_event_time; 1064 struct phyint_instance *pii; 1065 struct phyint_instance *next_pii; 1066 static boolean_t timeout_running; 1067 1068 /* assert that recursive timeouts don't happen. */ 1069 assert(!timeout_running); 1070 1071 timeout_running = _B_TRUE; 1072 1073 if (debug & D_TIMER) 1074 logdebug("run_timeouts()\n"); 1075 1076 next = TIMER_INFINITY; 1077 1078 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1079 next_pii = pii->pii_next; 1080 next_event_time = phyint_inst_timer(pii); 1081 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1082 next = next_event_time; 1083 1084 if (debug & D_TIMER) { 1085 logdebug("run_timeouts(%s %s): next scheduled for" 1086 " this phyint inst %u, next scheduled global" 1087 " %u ms\n", 1088 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1089 next_event_time, next); 1090 } 1091 } 1092 1093 /* 1094 * Make sure initifs() is called at least once every 1095 * IF_SCAN_INTERVAL, to make sure that we are in sync 1096 * with the kernel, in case we have missed any routing 1097 * socket messages. 1098 */ 1099 if (next > IF_SCAN_INTERVAL) 1100 next = IF_SCAN_INTERVAL; 1101 1102 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1103 initifs(); 1104 check_config(); 1105 } 1106 1107 if (debug & D_TIMER) 1108 logdebug("run_timeouts: %u ms\n", next); 1109 1110 timer_schedule(next); 1111 timeout_running = _B_FALSE; 1112 } 1113 1114 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1115 static int eventpipe_write = -1; 1116 static boolean_t cleanup_started = _B_FALSE; 1117 /* Don't write to eventpipe if in cleanup */ 1118 /* 1119 * Ensure that signals are processed synchronously with the rest of 1120 * the code by just writing a one character signal number on the pipe. 1121 * The poll loop will pick this up and process the signal event. 1122 */ 1123 static void 1124 sig_handler(int signo) 1125 { 1126 uchar_t buf = (uchar_t)signo; 1127 1128 /* 1129 * Don't write to pipe if cleanup has already begun. cleanup() 1130 * might have closed the pipe already 1131 */ 1132 if (cleanup_started) 1133 return; 1134 1135 if (eventpipe_write == -1) { 1136 logerr("sig_handler: no pipe found\n"); 1137 return; 1138 } 1139 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1140 logperror("sig_handler: write"); 1141 } 1142 1143 extern struct probes_missed probes_missed; 1144 1145 /* 1146 * Pick up a signal "byte" from the pipe and process it. 1147 */ 1148 static void 1149 in_signal(int fd) 1150 { 1151 uchar_t buf; 1152 uint64_t sent, acked, lost, unacked, unknown; 1153 struct phyint_instance *pii; 1154 int pr_ndx; 1155 1156 switch (read(fd, &buf, sizeof (buf))) { 1157 case -1: 1158 logperror("in_signal: read"); 1159 exit(1); 1160 /* NOTREACHED */ 1161 case 1: 1162 break; 1163 case 0: 1164 logerr("in_signal: read end of file\n"); 1165 exit(1); 1166 /* NOTREACHED */ 1167 default: 1168 logerr("in_signal: read > 1\n"); 1169 exit(1); 1170 } 1171 1172 if (debug & D_TIMER) 1173 logdebug("in_signal() got %d\n", buf); 1174 1175 switch (buf) { 1176 case SIGALRM: 1177 if (debug & D_TIMER) { 1178 uint_t now = getcurrenttime(); 1179 1180 logdebug("in_signal(SIGALRM) delta %u\n", 1181 now - timer_next); 1182 } 1183 timer_active = _B_FALSE; 1184 run_timeouts(); 1185 break; 1186 case SIGUSR1: 1187 logdebug("Printing configuration:\n"); 1188 /* Print out the internal tables */ 1189 phyint_inst_print_all(); 1190 1191 /* 1192 * Print out the accumulated statistics about missed 1193 * probes (happens due to scheduling delay). 1194 */ 1195 logerr("Missed sending total of %d probes spread over" 1196 " %d occurrences\n", probes_missed.pm_nprobes, 1197 probes_missed.pm_ntimes); 1198 1199 /* 1200 * Print out the accumulated statistics about probes 1201 * that were sent. 1202 */ 1203 for (pii = phyint_instances; pii != NULL; 1204 pii = pii->pii_next) { 1205 unacked = 0; 1206 acked = pii->pii_cum_stats.acked; 1207 lost = pii->pii_cum_stats.lost; 1208 sent = pii->pii_cum_stats.sent; 1209 unknown = pii->pii_cum_stats.unknown; 1210 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1211 switch (pii->pii_probes[pr_ndx].pr_status) { 1212 case PR_ACKED: 1213 acked++; 1214 break; 1215 case PR_LOST: 1216 lost++; 1217 break; 1218 case PR_UNACKED: 1219 unacked++; 1220 break; 1221 } 1222 } 1223 logerr("\nProbe stats on (%s %s)\n" 1224 "Number of probes sent %lld\n" 1225 "Number of probe acks received %lld\n" 1226 "Number of probes/acks lost %lld\n" 1227 "Number of valid unacknowled probes %lld\n" 1228 "Number of ambiguous probe acks received %lld\n", 1229 AF_STR(pii->pii_af), pii->pii_name, 1230 sent, acked, lost, unacked, unknown); 1231 } 1232 break; 1233 case SIGHUP: 1234 logerr("SIGHUP: restart and reread config file\n"); 1235 cleanup(); 1236 (void) execv(argv0[0], argv0); 1237 _exit(0177); 1238 /* NOTREACHED */ 1239 case SIGINT: 1240 case SIGTERM: 1241 case SIGQUIT: 1242 cleanup(); 1243 exit(0); 1244 /* NOTREACHED */ 1245 default: 1246 logerr("in_signal: unknown signal: %d\n", buf); 1247 } 1248 } 1249 1250 static void 1251 cleanup(void) 1252 { 1253 struct phyint_instance *pii; 1254 struct phyint_instance *next_pii; 1255 1256 /* 1257 * Make sure that we don't write to eventpipe in 1258 * sig_handler() if any signal notably SIGALRM, 1259 * occurs after we close the eventpipe descriptor below 1260 */ 1261 cleanup_started = _B_TRUE; 1262 1263 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1264 next_pii = pii->pii_next; 1265 phyint_inst_delete(pii); 1266 } 1267 1268 (void) close(ifsock_v4); 1269 (void) close(ifsock_v6); 1270 (void) close(rtsock_v4); 1271 (void) close(rtsock_v6); 1272 (void) close(lsock_v4); 1273 (void) close(lsock_v6); 1274 (void) close(0); 1275 (void) close(1); 1276 (void) close(2); 1277 (void) close(mibfd); 1278 (void) close(eventpipe_read); 1279 (void) close(eventpipe_write); 1280 } 1281 1282 /* 1283 * Create pipe for signal delivery and set up signal handlers. 1284 */ 1285 static void 1286 setup_eventpipe(void) 1287 { 1288 int fds[2]; 1289 struct sigaction act; 1290 1291 if ((pipe(fds)) < 0) { 1292 logperror("setup_eventpipe: pipe"); 1293 exit(1); 1294 } 1295 eventpipe_read = fds[0]; 1296 eventpipe_write = fds[1]; 1297 if (poll_add(eventpipe_read) == -1) { 1298 exit(1); 1299 } 1300 1301 act.sa_handler = sig_handler; 1302 act.sa_flags = SA_RESTART; 1303 (void) sigaction(SIGALRM, &act, NULL); 1304 1305 (void) sigset(SIGHUP, sig_handler); 1306 (void) sigset(SIGUSR1, sig_handler); 1307 (void) sigset(SIGTERM, sig_handler); 1308 (void) sigset(SIGINT, sig_handler); 1309 (void) sigset(SIGQUIT, sig_handler); 1310 } 1311 1312 /* 1313 * Create a routing socket for receiving RTM_IFINFO messages. 1314 */ 1315 static int 1316 setup_rtsock(int af) 1317 { 1318 int s; 1319 int flags; 1320 1321 s = socket(PF_ROUTE, SOCK_RAW, af); 1322 if (s == -1) { 1323 logperror("setup_rtsock: socket PF_ROUTE"); 1324 exit(1); 1325 } 1326 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1327 logperror("setup_rtsock: fcntl F_GETFL"); 1328 (void) close(s); 1329 exit(1); 1330 } 1331 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1332 logperror("setup_rtsock: fcntl F_SETFL"); 1333 (void) close(s); 1334 exit(1); 1335 } 1336 if (poll_add(s) == -1) { 1337 (void) close(s); 1338 exit(1); 1339 } 1340 return (s); 1341 } 1342 1343 /* 1344 * Process an RTM_IFINFO message received on a routing socket. 1345 * The return value indicates whether a full interface scan is required. 1346 * Link up/down notifications from the NICs are reflected in the 1347 * IFF_RUNNING flag. 1348 * If just the state of the IFF_RUNNING interface flag has changed, a 1349 * a full interface scan isn't required. 1350 */ 1351 static boolean_t 1352 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1353 { 1354 struct sockaddr_dl *sdl; 1355 struct phyint *pi; 1356 uint64_t old_flags; 1357 struct phyint_instance *pii; 1358 1359 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1360 1361 /* 1362 * Although the sockaddr_dl structure is directly after the 1363 * if_msghdr_t structure. At the time of writing, the size of the 1364 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1365 * to the presence of a timeval structure, which contains longs, 1366 * in the if_data structure. Anyway, we know where the message ends, 1367 * so we work backwards to get the start of the sockaddr_dl structure. 1368 */ 1369 /*LINTED*/ 1370 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1371 sizeof (struct sockaddr_dl)); 1372 1373 assert(sdl->sdl_family == AF_LINK); 1374 1375 /* 1376 * The interface name is in sdl_data. 1377 * RTM_IFINFO messages are only generated for logical interface 1378 * zero, so there is no colon and logical interface number to 1379 * strip from the name. The name is not null terminated, but 1380 * there should be enough space in sdl_data to add the null. 1381 */ 1382 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1383 if (debug & D_LINKNOTE) 1384 logdebug("process_rtm_ifinfo: " 1385 "phyint name too long\n"); 1386 return (_B_TRUE); 1387 } 1388 sdl->sdl_data[sdl->sdl_nlen] = 0; 1389 1390 pi = phyint_lookup(sdl->sdl_data); 1391 if (pi == NULL) { 1392 if (debug & D_LINKNOTE) 1393 logdebug("process_rtm_ifinfo: phyint lookup failed" 1394 " for %s\n", sdl->sdl_data); 1395 return (_B_TRUE); 1396 } 1397 1398 /* 1399 * We want to try and avoid doing a full interface scan for 1400 * link state notifications from the NICs, as indicated 1401 * by the state of the IFF_RUNNING flag. If just the 1402 * IFF_RUNNING flag has changed state, the link state changes 1403 * are processed without a full scan. 1404 * If there is both an IPv4 and IPv6 instance associated with 1405 * the physical interface, we will get an RTM_IFINFO message 1406 * for each instance. If we just maintained a single copy of 1407 * the physical interface flags, it would appear that no flags 1408 * had changed when the second message is processed, leading us 1409 * to believe that the message wasn't generated by a flags change, 1410 * and that a full interface scan is required. 1411 * To get around this problem, two additional copies of the flags 1412 * are kept, one copy for each instance. These are only used in 1413 * this routine. At any one time, all three copies of the flags 1414 * should be identical except for the IFF_RUNNING flag. The 1415 * copy of the flags in the "phyint" structure is always up to 1416 * date. 1417 */ 1418 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1419 if (pii == NULL) { 1420 if (debug & D_LINKNOTE) 1421 logdebug("process_rtm_ifinfo: no instance of address " 1422 "family %s for %s\n", AF_STR(type), pi->pi_name); 1423 return (_B_TRUE); 1424 } 1425 1426 old_flags = pii->pii_flags; 1427 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1428 pi->pi_flags = pii->pii_flags; 1429 1430 if (debug & D_LINKNOTE) { 1431 logdebug("process_rtm_ifinfo: %s address family: %s, " 1432 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1433 AF_STR(type), old_flags, pi->pi_flags); 1434 } 1435 1436 /* 1437 * If IFF_STANDBY has changed, indicate that the interface has changed 1438 * types. 1439 */ 1440 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1441 phyint_newtype(pi); 1442 1443 /* 1444 * If IFF_INACTIVE has been set, then no data addresses should be 1445 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1446 * move previously failed-over addresses back to it, provided it is 1447 * not failed. For details, see the state diagram in mpd_probe.c. 1448 */ 1449 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1450 if (pii->pii_flags & IFF_INACTIVE) { 1451 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 1452 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1453 } else { 1454 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1455 pi->pi_empty = 0; 1456 (void) try_failback(pi); 1457 } 1458 } 1459 } 1460 1461 /* Has just the IFF_RUNNING flag changed state ? */ 1462 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1463 struct phyint_instance *pii_other; 1464 /* 1465 * It wasn't just a link state change. Update 1466 * the other instance's copy of the flags. 1467 */ 1468 pii_other = phyint_inst_other(pii); 1469 if (pii_other != NULL) 1470 pii_other->pii_flags = pii->pii_flags; 1471 return (_B_TRUE); 1472 } 1473 1474 return (_B_FALSE); 1475 } 1476 1477 /* 1478 * Retrieve as many routing socket messages as possible, and try to 1479 * empty the routing sockets. Initiate full scan of targets or interfaces 1480 * as needed. 1481 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1482 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1483 */ 1484 static void 1485 process_rtsock(int rtsock_v4, int rtsock_v6) 1486 { 1487 int nbytes; 1488 int64_t msg[2048 / 8]; 1489 struct rt_msghdr *rtm; 1490 boolean_t need_if_scan = _B_FALSE; 1491 boolean_t need_rt_scan = _B_FALSE; 1492 boolean_t rtm_ifinfo_seen = _B_FALSE; 1493 int type; 1494 1495 /* Read as many messages as possible and try to empty the sockets */ 1496 for (type = AF_INET; ; type = AF_INET6) { 1497 for (;;) { 1498 nbytes = read((type == AF_INET) ? rtsock_v4 : 1499 rtsock_v6, msg, sizeof (msg)); 1500 if (nbytes <= 0) { 1501 /* No more messages */ 1502 break; 1503 } 1504 rtm = (struct rt_msghdr *)msg; 1505 if (rtm->rtm_version != RTM_VERSION) { 1506 logerr("process_rtsock: version %d " 1507 "not understood\n", rtm->rtm_version); 1508 break; 1509 } 1510 1511 if (debug & D_PHYINT) { 1512 logdebug("process_rtsock: message %d\n", 1513 rtm->rtm_type); 1514 } 1515 1516 switch (rtm->rtm_type) { 1517 case RTM_NEWADDR: 1518 case RTM_DELADDR: 1519 /* 1520 * Some logical interface has changed, 1521 * have to scan everything to determine 1522 * what actually changed. 1523 */ 1524 need_if_scan = _B_TRUE; 1525 break; 1526 1527 case RTM_IFINFO: 1528 rtm_ifinfo_seen = _B_TRUE; 1529 need_if_scan |= 1530 process_rtm_ifinfo((if_msghdr_t *)rtm, 1531 type); 1532 break; 1533 1534 case RTM_ADD: 1535 case RTM_DELETE: 1536 case RTM_CHANGE: 1537 case RTM_OLDADD: 1538 case RTM_OLDDEL: 1539 need_rt_scan = _B_TRUE; 1540 break; 1541 1542 default: 1543 /* Not interesting */ 1544 break; 1545 } 1546 } 1547 if (type == AF_INET6) 1548 break; 1549 } 1550 1551 if (need_if_scan) { 1552 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1553 logdebug("process_rtsock: synchronizing with kernel\n"); 1554 initifs(); 1555 } else if (rtm_ifinfo_seen) { 1556 if (debug & D_LINKNOTE) 1557 logdebug("process_rtsock: " 1558 "link up/down notification(s) seen\n"); 1559 process_link_state_changes(); 1560 } 1561 1562 if (need_rt_scan) 1563 init_router_targets(); 1564 } 1565 1566 /* 1567 * Look if the phyint instance or one of its logints have been removed from 1568 * the kernel and take appropriate action. 1569 * Uses {pii,li}_in_use. 1570 */ 1571 static void 1572 check_if_removed(struct phyint_instance *pii) 1573 { 1574 struct logint *li; 1575 struct logint *next_li; 1576 1577 /* Detect phyints that have been removed from the kernel. */ 1578 if (!pii->pii_in_use) { 1579 logtrace("%s %s has been removed from kernel\n", 1580 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1581 phyint_inst_delete(pii); 1582 } else { 1583 /* Detect logints that have been removed. */ 1584 for (li = pii->pii_logint; li != NULL; li = next_li) { 1585 next_li = li->li_next; 1586 if (!li->li_in_use) { 1587 logint_delete(li); 1588 } 1589 } 1590 } 1591 } 1592 1593 /* 1594 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1595 * tables defined by mib2.h. Parse the returned data and extract 1596 * the 'routing' information table. Process the 'routing' table 1597 * to get the list of known onlink routers, and update our database. 1598 * These onlink routers will serve as our probe targets. 1599 * Returns false, if any system calls resulted in errors, true otherwise. 1600 */ 1601 static boolean_t 1602 update_router_list(int fd) 1603 { 1604 union { 1605 char ubuf[1024]; 1606 union T_primitives uprim; 1607 } buf; 1608 1609 int flags; 1610 struct strbuf ctlbuf; 1611 struct strbuf databuf; 1612 struct T_optmgmt_req *tor; 1613 struct T_optmgmt_ack *toa; 1614 struct T_error_ack *tea; 1615 struct opthdr *optp; 1616 struct opthdr *req; 1617 int status; 1618 t_scalar_t prim; 1619 1620 tor = (struct T_optmgmt_req *)&buf; 1621 1622 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1623 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1624 tor->OPT_length = sizeof (struct opthdr); 1625 tor->MGMT_flags = T_CURRENT; 1626 1627 req = (struct opthdr *)&tor[1]; 1628 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1629 req->name = 0; 1630 req->len = 0; 1631 1632 ctlbuf.buf = (char *)&buf; 1633 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1634 ctlbuf.maxlen = sizeof (buf); 1635 flags = 0; 1636 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1637 logperror("update_router_list: putmsg(ctl)"); 1638 return (_B_FALSE); 1639 } 1640 1641 /* 1642 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1643 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 1644 * a control and data part. The control part contains a struct 1645 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1646 * the level, name and length of the data in the data part. The 1647 * data part contains the actual table data. The last message 1648 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1649 * single option with zero optlen. 1650 */ 1651 1652 for (;;) { 1653 /* 1654 * Go around this loop once for each table. Ignore 1655 * all tables except the routing information table. 1656 */ 1657 flags = 0; 1658 status = getmsg(fd, &ctlbuf, NULL, &flags); 1659 if (status < 0) { 1660 if (errno == EINTR) 1661 continue; 1662 logperror("update_router_list: getmsg(ctl)"); 1663 return (_B_FALSE); 1664 } 1665 if (ctlbuf.len < sizeof (t_scalar_t)) { 1666 logerr("update_router_list: ctlbuf.len %d\n", 1667 ctlbuf.len); 1668 return (_B_FALSE); 1669 } 1670 1671 prim = buf.uprim.type; 1672 1673 switch (prim) { 1674 1675 case T_ERROR_ACK: 1676 tea = &buf.uprim.error_ack; 1677 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1678 logerr("update_router_list: T_ERROR_ACK" 1679 " ctlbuf.len %d\n", ctlbuf.len); 1680 return (_B_FALSE); 1681 } 1682 logerr("update_router_list: T_ERROR_ACK:" 1683 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1684 tea->TLI_error, tea->UNIX_error); 1685 return (_B_FALSE); 1686 1687 case T_OPTMGMT_ACK: 1688 toa = &buf.uprim.optmgmt_ack; 1689 optp = (struct opthdr *)&toa[1]; 1690 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1691 logerr("update_router_list: ctlbuf.len %d\n", 1692 ctlbuf.len); 1693 return (_B_FALSE); 1694 } 1695 if (toa->MGMT_flags != T_SUCCESS) { 1696 logerr("update_router_list: MGMT_flags 0x%lx\n", 1697 toa->MGMT_flags); 1698 return (_B_FALSE); 1699 } 1700 break; 1701 1702 default: 1703 logerr("update_router_list: unknown primitive %ld\n", 1704 prim); 1705 return (_B_FALSE); 1706 } 1707 1708 /* Process the T_OPGMGMT_ACK below */ 1709 assert(prim == T_OPTMGMT_ACK); 1710 1711 switch (status) { 1712 case 0: 1713 /* 1714 * We have reached the end of this T_OPTMGMT_ACK 1715 * message. If this is the last message i.e EOD, 1716 * return, else process the next T_OPTMGMT_ACK msg. 1717 */ 1718 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1719 sizeof (struct opthdr)) && optp->len == 0 && 1720 optp->name == 0 && optp->level == 0) { 1721 /* 1722 * This is the EOD message. Return 1723 */ 1724 return (_B_TRUE); 1725 } 1726 continue; 1727 1728 case MORECTL: 1729 case MORECTL | MOREDATA: 1730 /* 1731 * This should not happen. We should be able to read 1732 * the control portion in a single getmsg. 1733 */ 1734 logerr("update_router_list: MORECTL\n"); 1735 return (_B_FALSE); 1736 1737 case MOREDATA: 1738 databuf.maxlen = optp->len; 1739 /* malloc of 0 bytes is ok */ 1740 databuf.buf = malloc((size_t)optp->len); 1741 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1742 logperror("update_router_list: malloc"); 1743 return (_B_FALSE); 1744 } 1745 databuf.len = 0; 1746 flags = 0; 1747 for (;;) { 1748 status = getmsg(fd, NULL, &databuf, &flags); 1749 if (status >= 0) { 1750 break; 1751 } else if (errno == EINTR) { 1752 continue; 1753 } else { 1754 logperror("update_router_list:" 1755 " getmsg(data)"); 1756 free(databuf.buf); 1757 return (_B_FALSE); 1758 } 1759 } 1760 1761 if (optp->level == MIB2_IP && 1762 optp->name == MIB2_IP_ROUTE) { 1763 /* LINTED */ 1764 ire_process_v4((mib2_ipRouteEntry_t *) 1765 databuf.buf, databuf.len); 1766 } else if (optp->level == MIB2_IP6 && 1767 optp->name == MIB2_IP6_ROUTE) { 1768 /* LINTED */ 1769 ire_process_v6((mib2_ipv6RouteEntry_t *) 1770 databuf.buf, databuf.len); 1771 } 1772 free(databuf.buf); 1773 } 1774 } 1775 /* NOTREACHED */ 1776 } 1777 1778 /* 1779 * Examine the IPv4 routing table, for default routers. For each default 1780 * router, populate the list of targets of each phyint that is on the same 1781 * link as the default router 1782 */ 1783 static void 1784 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1785 { 1786 mib2_ipRouteEntry_t *rp; 1787 mib2_ipRouteEntry_t *rp1; 1788 struct in_addr nexthop_v4; 1789 mib2_ipRouteEntry_t *endp; 1790 1791 if (len == 0) 1792 return; 1793 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1794 1795 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1796 1797 /* 1798 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1799 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1800 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1801 * This is a potential target for probing, which we try to add 1802 * to the list of probe targets. 1803 */ 1804 for (rp = buf; rp < endp; rp++) { 1805 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1806 continue; 1807 1808 /* Get the nexthop address. */ 1809 nexthop_v4.s_addr = rp->ipRouteNextHop; 1810 1811 /* 1812 * Get the nexthop address. Then determine the outgoing 1813 * interface, by examining all interface IREs, and picking the 1814 * match. We don't look at the interface specified in the route 1815 * because we need to add the router target on all matching 1816 * interfaces anyway; the goal is to avoid falling back to 1817 * multicast when some interfaces are in the same subnet but 1818 * not in the same group. 1819 */ 1820 for (rp1 = buf; rp1 < endp; rp1++) { 1821 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1822 continue; 1823 } 1824 1825 /* 1826 * Determine the interface IRE that matches the nexthop. 1827 * i.e. (IRE addr & IRE mask) == (nexthop & IRE mask) 1828 */ 1829 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1830 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1831 /* 1832 * We found the interface ire 1833 */ 1834 router_add_v4(rp1, nexthop_v4); 1835 } 1836 } 1837 } 1838 } 1839 1840 void 1841 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1842 { 1843 char *cp; 1844 char ifname[LIFNAMSIZ + 1]; 1845 struct in6_addr nexthop; 1846 int len; 1847 1848 if (debug & D_TARGET) 1849 logdebug("router_add_v4()\n"); 1850 1851 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1852 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1853 ifname[len] = '\0'; 1854 1855 if (ifname[0] == '\0') 1856 return; 1857 1858 cp = strchr(ifname, IF_SEPARATOR); 1859 if (cp != NULL) 1860 *cp = '\0'; 1861 1862 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1863 router_add_common(AF_INET, ifname, nexthop); 1864 } 1865 1866 void 1867 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1868 { 1869 struct phyint_instance *pii; 1870 struct phyint *pi; 1871 1872 if (debug & D_TARGET) 1873 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1874 1875 /* 1876 * Retrieve the phyint instance; bail if it's not known to us yet. 1877 */ 1878 pii = phyint_inst_lookup(af, ifname); 1879 if (pii == NULL) 1880 return; 1881 1882 /* 1883 * Don't use our own addresses as targets. 1884 */ 1885 if (own_address(nexthop)) 1886 return; 1887 1888 /* 1889 * If the phyint is part a named group, then add the address to all 1890 * members of the group; note that this is suboptimal in the IPv4 case 1891 * as it has already been added to all matching interfaces in 1892 * ire_process_v4(). Otherwise, add the address only to the phyint 1893 * itself, since other phyints in the anongroup may not be on the same 1894 * subnet. 1895 */ 1896 pi = pii->pii_phyint; 1897 if (pi->pi_group == phyint_anongroup) { 1898 target_add(pii, nexthop, _B_TRUE); 1899 } else { 1900 pi = pi->pi_group->pg_phyint; 1901 for (; pi != NULL; pi = pi->pi_pgnext) 1902 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1903 } 1904 } 1905 1906 /* 1907 * Examine the IPv6 routing table, for default routers. For each default 1908 * router, populate the list of targets of each phyint that is on the same 1909 * link as the default router 1910 */ 1911 static void 1912 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1913 { 1914 mib2_ipv6RouteEntry_t *rp; 1915 mib2_ipv6RouteEntry_t *endp; 1916 struct in6_addr nexthop_v6; 1917 1918 if (debug & D_TARGET) 1919 logdebug("ire_process_v6(len %d)\n", len); 1920 1921 if (len == 0) 1922 return; 1923 1924 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1925 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1926 1927 /* 1928 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1929 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1930 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1931 * This is a potential target for probing, which we try to add 1932 * to the list of probe targets. 1933 */ 1934 for (rp = buf; rp < endp; rp++) { 1935 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1936 continue; 1937 1938 /* 1939 * We have the outgoing interface in ipv6RouteIfIndex 1940 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1941 * interface must be present for link-local addresses. Since 1942 * we use only link-local addreses for probing, we don't 1943 * consider the case when the outgoing interface is not 1944 * known and we need to scan interface ires 1945 */ 1946 nexthop_v6 = rp->ipv6RouteNextHop; 1947 if (rp->ipv6RouteIfIndex.o_length != 0) { 1948 /* 1949 * We already have the outgoing interface 1950 * in ipv6RouteIfIndex. 1951 */ 1952 router_add_v6(rp, nexthop_v6); 1953 } 1954 } 1955 } 1956 1957 1958 void 1959 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1960 { 1961 char ifname[LIFNAMSIZ + 1]; 1962 char *cp; 1963 int len; 1964 1965 if (debug & D_TARGET) 1966 logdebug("router_add_v6()\n"); 1967 1968 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1969 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1970 ifname[len] = '\0'; 1971 1972 if (ifname[0] == '\0') 1973 return; 1974 1975 cp = strchr(ifname, IF_SEPARATOR); 1976 if (cp != NULL) 1977 *cp = '\0'; 1978 1979 router_add_common(AF_INET6, ifname, nexthop_v6); 1980 } 1981 1982 1983 1984 /* 1985 * Build a list of target routers, by scanning the routing tables. 1986 * It is assumed that interface routes exist, to reach the routers. 1987 */ 1988 static void 1989 init_router_targets(void) 1990 { 1991 struct target *tg; 1992 struct target *next_tg; 1993 struct phyint_instance *pii; 1994 struct phyint *pi; 1995 1996 if (force_mcast) 1997 return; 1998 1999 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2000 pi = pii->pii_phyint; 2001 /* 2002 * Exclude ptp and host targets. Set tg_in_use to false, 2003 * only for router targets. 2004 */ 2005 if (!pii->pii_targets_are_routers || 2006 (pi->pi_flags & IFF_POINTOPOINT)) 2007 continue; 2008 2009 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 2010 tg->tg_in_use = 0; 2011 } 2012 2013 if (mibfd < 0) { 2014 mibfd = open("/dev/ip", O_RDWR); 2015 if (mibfd < 0) { 2016 logperror("mibopen: ip open"); 2017 exit(1); 2018 } 2019 } 2020 2021 if (!update_router_list(mibfd)) { 2022 (void) close(mibfd); 2023 mibfd = -1; 2024 } 2025 2026 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2027 if (!pii->pii_targets_are_routers || 2028 (pi->pi_flags & IFF_POINTOPOINT)) 2029 continue; 2030 2031 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 2032 next_tg = tg->tg_next; 2033 if (!tg->tg_in_use) { 2034 target_delete(tg); 2035 } 2036 } 2037 } 2038 } 2039 2040 /* 2041 * Attempt to assign host targets to any interfaces that do not currently 2042 * have probe targets by sharing targets with other interfaces in the group. 2043 */ 2044 static void 2045 init_host_targets(void) 2046 { 2047 struct phyint_instance *pii; 2048 struct phyint_group *pg; 2049 2050 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2051 pg = pii->pii_phyint->pi_group; 2052 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2053 dup_host_targets(pii); 2054 } 2055 } 2056 2057 /* 2058 * Duplicate host targets from other phyints of the group to 2059 * the phyint instance 'desired_pii'. 2060 */ 2061 static void 2062 dup_host_targets(struct phyint_instance *desired_pii) 2063 { 2064 int af; 2065 struct phyint *pi; 2066 struct phyint_instance *pii; 2067 struct target *tg; 2068 2069 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2070 2071 af = desired_pii->pii_af; 2072 2073 /* 2074 * For every phyint in the same group as desired_pii, check if 2075 * it has any host targets. If so add them to desired_pii. 2076 */ 2077 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2078 pii = PHYINT_INSTANCE(pi, af); 2079 /* 2080 * We know that we don't have targets on this phyint instance 2081 * since we have been called. But we still check for 2082 * pii_targets_are_routers because another phyint instance 2083 * could have router targets, since IFF_NOFAILOVER addresses 2084 * on different phyint instances may belong to different 2085 * subnets. 2086 */ 2087 if ((pii == NULL) || (pii == desired_pii) || 2088 pii->pii_targets_are_routers) 2089 continue; 2090 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2091 target_create(desired_pii, tg->tg_address, _B_FALSE); 2092 } 2093 } 2094 } 2095 2096 static void 2097 usage(char *cmd) 2098 { 2099 (void) fprintf(stderr, "usage: %s\n", cmd); 2100 } 2101 2102 2103 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2104 2105 /* Get an option from the /etc/default/mpathd file */ 2106 static char * 2107 getdefault(char *name) 2108 { 2109 char namebuf[BUFSIZ]; 2110 char *value = NULL; 2111 2112 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2113 char *cp; 2114 int flags; 2115 2116 /* 2117 * ignore case 2118 */ 2119 flags = defcntl(DC_GETFLAGS, 0); 2120 TURNOFF(flags, DC_CASE); 2121 (void) defcntl(DC_SETFLAGS, flags); 2122 2123 /* Add "=" to the name */ 2124 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2125 (void) strncat(namebuf, "=", 2); 2126 2127 if ((cp = defread(namebuf)) != NULL) 2128 value = strdup(cp); 2129 2130 /* close */ 2131 (void) defopen((char *)NULL); 2132 } 2133 return (value); 2134 } 2135 2136 2137 /* 2138 * Command line options below 2139 */ 2140 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2141 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2142 static boolean_t adopt = _B_FALSE; 2143 static boolean_t foreground = _B_FALSE; 2144 2145 int 2146 main(int argc, char *argv[]) 2147 { 2148 int i; 2149 int c; 2150 struct phyint_instance *pii; 2151 char *value; 2152 2153 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2154 srandom(gethostid()); /* Initialize the random number generator */ 2155 2156 /* 2157 * NOTE: The messages output by in.mpathd are not suitable for 2158 * translation, so we do not call textdomain(). 2159 */ 2160 (void) setlocale(LC_ALL, ""); 2161 2162 /* 2163 * Get the user specified value of 'failure detection time' 2164 * from /etc/default/mpathd 2165 */ 2166 value = getdefault("FAILURE_DETECTION_TIME"); 2167 if (value != NULL) { 2168 user_failure_detection_time = 2169 (int)strtol((char *)value, NULL, 0); 2170 2171 if (user_failure_detection_time <= 0) { 2172 user_failure_detection_time = FAILURE_DETECTION_TIME; 2173 logerr("Invalid failure detection time %s, assuming " 2174 "default %d\n", value, user_failure_detection_time); 2175 2176 } else if (user_failure_detection_time < 2177 MIN_FAILURE_DETECTION_TIME) { 2178 user_failure_detection_time = 2179 MIN_FAILURE_DETECTION_TIME; 2180 logerr("Too small failure detection time of %s, " 2181 "assuming minimum %d\n", value, 2182 user_failure_detection_time); 2183 } 2184 free(value); 2185 } else { 2186 /* User has not specified the parameter, Use default value */ 2187 user_failure_detection_time = FAILURE_DETECTION_TIME; 2188 } 2189 2190 /* 2191 * This gives the frequency at which probes will be sent. 2192 * When fdt ms elapses, we should be able to determine 2193 * whether 5 consecutive probes have failed or not. 2194 * 1 probe will be sent in every user_probe_interval ms, 2195 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2196 * user_probe_interval. Thus when we send out probe 'n' we 2197 * can be sure that probe 'n - 2' is lost, if we have not 2198 * got the ack. (since the probe interval is > crtt). But 2199 * probe 'n - 1' may be a valid unacked probe, since the 2200 * time between 2 successive probes could be as small as 2201 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 2202 */ 2203 user_probe_interval = user_failure_detection_time / 2204 (NUM_PROBE_FAILS + 2); 2205 2206 /* 2207 * Get the user specified value of failback_enabled from 2208 * /etc/default/mpathd 2209 */ 2210 value = getdefault("FAILBACK"); 2211 if (value != NULL) { 2212 if (strncasecmp(value, "yes", 3) == 0) 2213 failback_enabled = _B_TRUE; 2214 else if (strncasecmp(value, "no", 2) == 0) 2215 failback_enabled = _B_FALSE; 2216 else 2217 logerr("Invalid value for FAILBACK %s\n", value); 2218 free(value); 2219 } else { 2220 failback_enabled = _B_TRUE; 2221 } 2222 2223 /* 2224 * Get the user specified value of track_all_phyints from 2225 * /etc/default/mpathd. The sense is reversed in 2226 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2227 */ 2228 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2229 if (value != NULL) { 2230 if (strncasecmp(value, "yes", 3) == 0) 2231 track_all_phyints = _B_FALSE; 2232 else if (strncasecmp(value, "no", 2) == 0) 2233 track_all_phyints = _B_TRUE; 2234 else 2235 logerr("Invalid value for " 2236 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2237 free(value); 2238 } else { 2239 track_all_phyints = _B_FALSE; 2240 } 2241 2242 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2243 switch (c) { 2244 case 'a': 2245 adopt = _B_TRUE; 2246 break; 2247 case 'm': 2248 force_mcast = _B_TRUE; 2249 break; 2250 case 'd': 2251 debug = D_ALL; 2252 foreground = _B_TRUE; 2253 break; 2254 case 'D': 2255 i = (int)strtol(optarg, NULL, 0); 2256 if (i == 0) { 2257 (void) fprintf(stderr, "Bad debug flags: %s\n", 2258 optarg); 2259 exit(1); 2260 } 2261 debug |= i; 2262 foreground = _B_TRUE; 2263 break; 2264 case 'l': 2265 /* 2266 * Turn off link state notification handling. 2267 * Undocumented command line flag, for debugging 2268 * purposes. 2269 */ 2270 handle_link_notifications = _B_FALSE; 2271 break; 2272 default: 2273 usage(argv[0]); 2274 exit(1); 2275 } 2276 } 2277 2278 /* 2279 * The sockets for the loopback command interface should be listening 2280 * before we fork and exit in daemonize(). This way, whoever started us 2281 * can use the loopback interface as soon as they get a zero exit 2282 * status. 2283 */ 2284 lsock_v4 = setup_listener(AF_INET); 2285 lsock_v6 = setup_listener(AF_INET6); 2286 2287 if (lsock_v4 < 0 && lsock_v6 < 0) { 2288 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2289 exit(1); 2290 } 2291 2292 if (!foreground) { 2293 if (!daemonize()) { 2294 logerr("cannot daemonize\n"); 2295 exit(EXIT_FAILURE); 2296 } 2297 initlog(); 2298 } 2299 2300 /* 2301 * Initializations: 2302 * 1. Create ifsock* sockets. These are used for performing SIOC* 2303 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2304 * 2. Initialize a pipe for handling/recording signal events. 2305 * 3. Create the routing sockets, used for listening 2306 * to routing / interface changes. 2307 * 4. phyint_init() - Initialize physical interface state 2308 * (in mpd_tables.c). Must be done before creating interfaces, 2309 * which timer_init() does indirectly. 2310 * 5. timer_init() - Initialize timer related stuff 2311 * 6. initifs() - Initialize our database of all known interfaces 2312 * 7. init_router_targets() - Initialize our database of all known 2313 * router targets. 2314 */ 2315 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2316 if (ifsock_v4 < 0) { 2317 logperror("main: IPv4 socket open"); 2318 exit(1); 2319 } 2320 2321 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2322 if (ifsock_v6 < 0) { 2323 logperror("main: IPv6 socket open"); 2324 exit(1); 2325 } 2326 2327 setup_eventpipe(); 2328 2329 rtsock_v4 = setup_rtsock(AF_INET); 2330 rtsock_v6 = setup_rtsock(AF_INET6); 2331 2332 if (phyint_init() == -1) { 2333 logerr("cannot initialize physical interface structures"); 2334 exit(1); 2335 } 2336 2337 timer_init(); 2338 2339 initifs(); 2340 2341 /* Inform kernel whether failback is enabled or disabled */ 2342 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2343 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2344 exit(1); 2345 } 2346 2347 /* 2348 * If we're operating in "adopt" mode and no interfaces need to be 2349 * tracked, shut down (ifconfig(1M) will restart us on demand if 2350 * interfaces are subsequently put into multipathing groups). 2351 */ 2352 if (adopt && phyint_instances == NULL) 2353 exit(0); 2354 2355 /* 2356 * Main body. Keep listening for activity on any of the sockets 2357 * that we are monitoring and take appropriate action as necessary. 2358 * signals are also handled synchronously. 2359 */ 2360 for (;;) { 2361 if (poll(pollfds, pollfd_num, -1) < 0) { 2362 if (errno == EINTR) 2363 continue; 2364 logperror("main: poll"); 2365 exit(1); 2366 } 2367 for (i = 0; i < pollfd_num; i++) { 2368 if ((pollfds[i].fd == -1) || 2369 !(pollfds[i].revents & POLLIN)) 2370 continue; 2371 if (pollfds[i].fd == eventpipe_read) { 2372 in_signal(eventpipe_read); 2373 break; 2374 } 2375 if (pollfds[i].fd == rtsock_v4 || 2376 pollfds[i].fd == rtsock_v6) { 2377 process_rtsock(rtsock_v4, rtsock_v6); 2378 break; 2379 } 2380 for (pii = phyint_instances; pii != NULL; 2381 pii = pii->pii_next) { 2382 if (pollfds[i].fd == pii->pii_probe_sock) { 2383 if (pii->pii_af == AF_INET) 2384 in_data(pii); 2385 else 2386 in6_data(pii); 2387 break; 2388 } 2389 } 2390 if (pollfds[i].fd == lsock_v4) 2391 loopback_cmd(lsock_v4, AF_INET); 2392 else if (pollfds[i].fd == lsock_v6) 2393 loopback_cmd(lsock_v6, AF_INET6); 2394 } 2395 if (full_scan_required) { 2396 initifs(); 2397 full_scan_required = _B_FALSE; 2398 } 2399 } 2400 /* NOTREACHED */ 2401 return (EXIT_SUCCESS); 2402 } 2403 2404 static int 2405 setup_listener(int af) 2406 { 2407 int sock; 2408 int on; 2409 int len; 2410 int ret; 2411 struct sockaddr_storage laddr; 2412 struct sockaddr_in *sin; 2413 struct sockaddr_in6 *sin6; 2414 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2415 2416 assert(af == AF_INET || af == AF_INET6); 2417 2418 sock = socket(af, SOCK_STREAM, 0); 2419 if (sock < 0) { 2420 logperror("setup_listener: socket"); 2421 exit(1); 2422 } 2423 2424 on = 1; 2425 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2426 sizeof (on)) < 0) { 2427 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2428 exit(1); 2429 } 2430 2431 bzero(&laddr, sizeof (laddr)); 2432 laddr.ss_family = af; 2433 2434 if (af == AF_INET) { 2435 sin = (struct sockaddr_in *)&laddr; 2436 sin->sin_port = htons(MPATHD_PORT); 2437 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2438 len = sizeof (struct sockaddr_in); 2439 } else { 2440 sin6 = (struct sockaddr_in6 *)&laddr; 2441 sin6->sin6_port = htons(MPATHD_PORT); 2442 sin6->sin6_addr = loopback_addr; 2443 len = sizeof (struct sockaddr_in6); 2444 } 2445 2446 ret = bind(sock, (struct sockaddr *)&laddr, len); 2447 if (ret < 0) { 2448 if (errno == EADDRINUSE) { 2449 /* 2450 * Another instance of mpathd may be already active. 2451 */ 2452 logerr("main: is another instance of in.mpathd " 2453 "already active?\n"); 2454 exit(1); 2455 } else { 2456 (void) close(sock); 2457 return (-1); 2458 } 2459 } 2460 if (listen(sock, 30) < 0) { 2461 logperror("main: listen"); 2462 exit(1); 2463 } 2464 if (poll_add(sock) == -1) { 2465 (void) close(sock); 2466 exit(1); 2467 } 2468 2469 return (sock); 2470 } 2471 2472 /* 2473 * Table of commands and their expected size; used by loopback_cmd(). 2474 */ 2475 static struct { 2476 const char *name; 2477 unsigned int size; 2478 } commands[] = { 2479 { "MI_PING", sizeof (uint32_t) }, 2480 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2481 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2482 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2483 { "MI_QUERY", sizeof (mi_query_t) } 2484 }; 2485 2486 /* 2487 * Commands received over the loopback interface come here. Currently 2488 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2489 * module. ifconfig only makes a connection, and closes it to check if 2490 * in.mpathd is running. 2491 * if_mpadm sends commands in the format specified by the mpathd_interface 2492 * structure. 2493 */ 2494 static void 2495 loopback_cmd(int sock, int family) 2496 { 2497 int newfd; 2498 ssize_t len; 2499 struct sockaddr_storage peer; 2500 struct sockaddr_in *peer_sin; 2501 struct sockaddr_in6 *peer_sin6; 2502 socklen_t peerlen; 2503 union mi_commands mpi; 2504 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2505 char abuf[INET6_ADDRSTRLEN]; 2506 uint_t cmd; 2507 int retval; 2508 2509 peerlen = sizeof (peer); 2510 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2511 if (newfd < 0) { 2512 logperror("loopback_cmd: accept"); 2513 return; 2514 } 2515 2516 switch (family) { 2517 case AF_INET: 2518 /* 2519 * Validate the address and port to make sure that 2520 * non privileged processes don't connect and start 2521 * talking to us. 2522 */ 2523 if (peerlen != sizeof (struct sockaddr_in)) { 2524 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2525 (void) close(newfd); 2526 return; 2527 } 2528 peer_sin = (struct sockaddr_in *)&peer; 2529 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2530 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2531 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2532 abuf, sizeof (abuf)); 2533 logerr("Attempt to connect from addr %s port %d\n", 2534 abuf, ntohs(peer_sin->sin_port)); 2535 (void) close(newfd); 2536 return; 2537 } 2538 break; 2539 2540 case AF_INET6: 2541 if (peerlen != sizeof (struct sockaddr_in6)) { 2542 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2543 (void) close(newfd); 2544 return; 2545 } 2546 /* 2547 * Validate the address and port to make sure that 2548 * non privileged processes don't connect and start 2549 * talking to us. 2550 */ 2551 peer_sin6 = (struct sockaddr_in6 *)&peer; 2552 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2553 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2554 &loopback_addr))) { 2555 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2556 sizeof (abuf)); 2557 logerr("Attempt to connect from addr %s port %d\n", 2558 abuf, ntohs(peer_sin6->sin6_port)); 2559 (void) close(newfd); 2560 return; 2561 } 2562 2563 default: 2564 logdebug("loopback_cmd: family %d\n", family); 2565 (void) close(newfd); 2566 return; 2567 } 2568 2569 /* 2570 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2571 * all supported commands 2572 */ 2573 len = read(newfd, &mpi, sizeof (mpi)); 2574 2575 /* 2576 * ifconfig does not send any data. Just tests to see if mpathd 2577 * is already running. 2578 */ 2579 if (len <= 0) { 2580 (void) close(newfd); 2581 return; 2582 } 2583 2584 /* 2585 * In theory, we can receive any sized message for a stream socket, 2586 * but we don't expect that to happen for a small message over a 2587 * loopback connection. 2588 */ 2589 if (len < sizeof (uint32_t)) { 2590 logerr("loopback_cmd: bad command format or read returns " 2591 "partial data %d\n", len); 2592 } 2593 2594 cmd = mpi.mi_command; 2595 if (cmd >= MI_NCMD) { 2596 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2597 (void) close(newfd); 2598 return; 2599 } 2600 2601 if (len < commands[cmd].size) { 2602 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2603 commands[cmd].name, commands[cmd].size, len); 2604 (void) close(newfd); 2605 return; 2606 } 2607 2608 retval = process_cmd(newfd, &mpi); 2609 if (retval != IPMP_SUCCESS) { 2610 logerr("failed processing %s: %s\n", commands[cmd].name, 2611 ipmp_errmsg(retval)); 2612 } 2613 (void) close(newfd); 2614 } 2615 2616 extern int global_errno; /* set by failover() or failback() */ 2617 2618 /* 2619 * Process the offline, undo offline and set original index commands, 2620 * received from if_mpadm(1M) 2621 */ 2622 static unsigned int 2623 process_cmd(int newfd, union mi_commands *mpi) 2624 { 2625 uint_t nif = 0; 2626 uint32_t cmd; 2627 struct phyint *pi; 2628 struct phyint *pi2; 2629 struct phyint_group *pg; 2630 boolean_t success; 2631 int error; 2632 struct mi_offline *mio; 2633 struct mi_undo_offline *miu; 2634 struct lifreq lifr; 2635 int ifsock; 2636 struct mi_setoindex *mis; 2637 2638 cmd = mpi->mi_command; 2639 2640 switch (cmd) { 2641 case MI_OFFLINE: 2642 mio = &mpi->mi_ocmd; 2643 /* 2644 * Lookup the interface that needs to be offlined. 2645 * If it does not exist, return a suitable error. 2646 */ 2647 pi = phyint_lookup(mio->mio_ifname); 2648 if (pi == NULL) 2649 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2650 2651 /* 2652 * Verify that the minimum redundancy requirements are met. 2653 * The multipathing group must have at least the specified 2654 * number of functional interfaces after offlining the 2655 * requested interface. Otherwise return a suitable error. 2656 */ 2657 pg = pi->pi_group; 2658 nif = 0; 2659 if (pg != phyint_anongroup) { 2660 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2661 pi2 = pi2->pi_pgnext) { 2662 if ((pi2->pi_state == PI_RUNNING) || 2663 (pg->pg_groupfailed && 2664 !(pi2->pi_flags & IFF_OFFLINE))) 2665 nif++; 2666 } 2667 } 2668 if (nif < mio->mio_min_redundancy) 2669 return (send_result(newfd, IPMP_EMINRED, 0)); 2670 2671 /* 2672 * The order of operation is to set IFF_OFFLINE, followed by 2673 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2674 * can be created. Subsequent failover moves everything on 2675 * the OFFLINE interface to some other functional interface. 2676 */ 2677 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2678 if (success) { 2679 if (!pi->pi_empty) { 2680 error = try_failover(pi, FAILOVER_NORMAL); 2681 if (error != 0) { 2682 if (!change_lif_flags(pi, IFF_OFFLINE, 2683 _B_FALSE)) { 2684 logerr("process_cmd: couldn't" 2685 " clear OFFLINE flag on" 2686 " %s\n", pi->pi_name); 2687 /* 2688 * Offline interfaces should 2689 * not be probed. 2690 */ 2691 stop_probing(pi); 2692 } 2693 return (send_result(newfd, error, 2694 global_errno)); 2695 } 2696 } 2697 } else { 2698 return (send_result(newfd, IPMP_FAILURE, errno)); 2699 } 2700 2701 /* 2702 * The interface is now Offline, so stop probing it. 2703 * Note that if_mpadm(1M) will down the test addresses, 2704 * after receiving a success reply from us. The routing 2705 * socket message will then make us close the socket used 2706 * for sending probes. But it is more logical that an 2707 * offlined interface must not be probed, even if it has 2708 * test addresses. 2709 */ 2710 stop_probing(pi); 2711 return (send_result(newfd, IPMP_SUCCESS, 0)); 2712 2713 case MI_UNDO_OFFLINE: 2714 miu = &mpi->mi_ucmd; 2715 /* 2716 * Undo the offline command. As usual lookup the interface. 2717 * Send an error if it does not exist or is not offline. 2718 */ 2719 pi = phyint_lookup(miu->miu_ifname); 2720 if (pi == NULL || pi->pi_state != PI_OFFLINE) 2721 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2722 2723 /* 2724 * Reset the state of the interface based on the current link 2725 * state; if this phyint subsequently acquires a test address, 2726 * the state will be updated later as a result of the probes. 2727 */ 2728 if (LINK_UP(pi)) 2729 phyint_chstate(pi, PI_RUNNING); 2730 else 2731 phyint_chstate(pi, PI_FAILED); 2732 2733 if (pi->pi_state == PI_RUNNING) { 2734 /* 2735 * Note that the success of MI_UNDO_OFFLINE is not 2736 * contingent on actually failing back; in the odd 2737 * case where we cannot do it here, we will try again 2738 * in initifs() since pi->pi_full will still be zero. 2739 */ 2740 if (do_failback(pi) != IPMP_SUCCESS) { 2741 logdebug("process_cmd: cannot failback from " 2742 "%s during MI_UNDO_OFFLINE\n", pi->pi_name); 2743 } 2744 } 2745 2746 /* 2747 * Clear the IFF_OFFLINE flag. We have to do this last 2748 * because do_failback() relies on it being set to decide 2749 * when to display messages. 2750 */ 2751 (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE); 2752 2753 return (send_result(newfd, IPMP_SUCCESS, 0)); 2754 2755 case MI_SETOINDEX: 2756 mis = &mpi->mi_scmd; 2757 2758 /* Get the socket for doing ioctls */ 2759 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2760 2761 /* 2762 * Get index of new original interface. 2763 * The index is returned in lifr.lifr_index. 2764 */ 2765 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2766 sizeof (lifr.lifr_name)); 2767 2768 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2769 return (send_result(newfd, IPMP_FAILURE, errno)); 2770 2771 /* 2772 * Set new original interface index. 2773 * The new index was put into lifr.lifr_index by the 2774 * SIOCGLIFINDEX ioctl. 2775 */ 2776 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2777 sizeof (lifr.lifr_name)); 2778 2779 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2780 return (send_result(newfd, IPMP_FAILURE, errno)); 2781 2782 return (send_result(newfd, IPMP_SUCCESS, 0)); 2783 2784 case MI_QUERY: 2785 return (process_query(newfd, &mpi->mi_qcmd)); 2786 2787 default: 2788 break; 2789 } 2790 2791 return (send_result(newfd, IPMP_EPROTO, 0)); 2792 } 2793 2794 /* 2795 * Process the query request pointed to by `miq' and send a reply on file 2796 * descriptor `fd'. Returns an IPMP error code. 2797 */ 2798 static unsigned int 2799 process_query(int fd, mi_query_t *miq) 2800 { 2801 ipmp_groupinfo_t *grinfop; 2802 ipmp_groupinfolist_t *grlp; 2803 ipmp_grouplist_t *grlistp; 2804 ipmp_ifinfo_t *ifinfop; 2805 ipmp_ifinfolist_t *iflp; 2806 ipmp_snap_t *snap; 2807 unsigned int retval; 2808 2809 switch (miq->miq_inforeq) { 2810 case IPMP_GROUPLIST: 2811 retval = getgrouplist(&grlistp); 2812 if (retval != IPMP_SUCCESS) 2813 return (send_result(fd, retval, errno)); 2814 2815 retval = send_result(fd, IPMP_SUCCESS, 0); 2816 if (retval == IPMP_SUCCESS) 2817 retval = send_grouplist(fd, grlistp); 2818 2819 ipmp_freegrouplist(grlistp); 2820 return (retval); 2821 2822 case IPMP_GROUPINFO: 2823 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2824 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2825 if (retval != IPMP_SUCCESS) 2826 return (send_result(fd, retval, errno)); 2827 2828 retval = send_result(fd, IPMP_SUCCESS, 0); 2829 if (retval == IPMP_SUCCESS) 2830 retval = send_groupinfo(fd, grinfop); 2831 2832 ipmp_freegroupinfo(grinfop); 2833 return (retval); 2834 2835 case IPMP_IFINFO: 2836 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2837 retval = getifinfo(miq->miq_ifname, &ifinfop); 2838 if (retval != IPMP_SUCCESS) 2839 return (send_result(fd, retval, errno)); 2840 2841 retval = send_result(fd, IPMP_SUCCESS, 0); 2842 if (retval == IPMP_SUCCESS) 2843 retval = send_ifinfo(fd, ifinfop); 2844 2845 ipmp_freeifinfo(ifinfop); 2846 return (retval); 2847 2848 case IPMP_SNAP: 2849 retval = getsnap(&snap); 2850 if (retval != IPMP_SUCCESS) 2851 return (send_result(fd, retval, errno)); 2852 2853 retval = send_result(fd, IPMP_SUCCESS, 0); 2854 if (retval != IPMP_SUCCESS) 2855 goto out; 2856 2857 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2858 if (retval != IPMP_SUCCESS) 2859 goto out; 2860 2861 retval = send_grouplist(fd, snap->sn_grlistp); 2862 if (retval != IPMP_SUCCESS) 2863 goto out; 2864 2865 iflp = snap->sn_ifinfolistp; 2866 for (; iflp != NULL; iflp = iflp->ifl_next) { 2867 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2868 if (retval != IPMP_SUCCESS) 2869 goto out; 2870 } 2871 2872 grlp = snap->sn_grinfolistp; 2873 for (; grlp != NULL; grlp = grlp->grl_next) { 2874 retval = send_groupinfo(fd, grlp->grl_grinfop); 2875 if (retval != IPMP_SUCCESS) 2876 goto out; 2877 } 2878 out: 2879 ipmp_snap_free(snap); 2880 return (retval); 2881 2882 default: 2883 break; 2884 2885 } 2886 return (send_result(fd, IPMP_EPROTO, 0)); 2887 } 2888 2889 /* 2890 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2891 * Returns an IPMP error code. 2892 */ 2893 static unsigned int 2894 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2895 { 2896 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2897 unsigned int retval; 2898 2899 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2900 if (retval != IPMP_SUCCESS) 2901 return (retval); 2902 2903 return (ipmp_writetlv(fd, IPMP_IFLIST, 2904 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2905 } 2906 2907 /* 2908 * Send the interface information pointed to by `ifinfop' on file descriptor 2909 * `fd'. Returns an IPMP error code. 2910 */ 2911 static unsigned int 2912 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2913 { 2914 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2915 } 2916 2917 /* 2918 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2919 * Returns an IPMP error code. 2920 */ 2921 static unsigned int 2922 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2923 { 2924 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2925 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2926 } 2927 2928 /* 2929 * Initialize an mi_result_t structure using `error' and `syserror' and 2930 * send it on file descriptor `fd'. Returns an IPMP error code. 2931 */ 2932 static unsigned int 2933 send_result(int fd, unsigned int error, int syserror) 2934 { 2935 mi_result_t me; 2936 2937 me.me_mpathd_error = error; 2938 if (error == IPMP_FAILURE) 2939 me.me_sys_error = syserror; 2940 else 2941 me.me_sys_error = 0; 2942 2943 return (ipmp_write(fd, &me, sizeof (me))); 2944 } 2945 2946 /* 2947 * Daemonize the process. 2948 */ 2949 static boolean_t 2950 daemonize(void) 2951 { 2952 switch (fork()) { 2953 case -1: 2954 return (_B_FALSE); 2955 2956 case 0: 2957 /* 2958 * Lose our controlling terminal, and become both a session 2959 * leader and a process group leader. 2960 */ 2961 if (setsid() == -1) 2962 return (_B_FALSE); 2963 2964 /* 2965 * Under POSIX, a session leader can accidentally (through 2966 * open(2)) acquire a controlling terminal if it does not 2967 * have one. Just to be safe, fork() again so we are not a 2968 * session leader. 2969 */ 2970 switch (fork()) { 2971 case -1: 2972 return (_B_FALSE); 2973 2974 case 0: 2975 (void) chdir("/"); 2976 (void) umask(022); 2977 (void) fdwalk(closefunc, NULL); 2978 break; 2979 2980 default: 2981 _exit(EXIT_SUCCESS); 2982 } 2983 break; 2984 2985 default: 2986 _exit(EXIT_SUCCESS); 2987 } 2988 2989 return (_B_TRUE); 2990 } 2991 2992 /* 2993 * The parent has created some fds before forking on purpose, keep them open. 2994 */ 2995 static int 2996 closefunc(void *not_used, int fd) 2997 /* ARGSUSED */ 2998 { 2999 if (fd != lsock_v4 && fd != lsock_v6) 3000 (void) close(fd); 3001 return (0); 3002 } 3003 3004 /* LOGGER */ 3005 3006 #include <syslog.h> 3007 3008 /* 3009 * Logging routines. All routines log to syslog, unless the daemon is 3010 * running in the foreground, in which case the logging goes to stderr. 3011 * 3012 * The following routines are available: 3013 * 3014 * logdebug(): A printf-like function for outputting debug messages 3015 * (messages at LOG_DEBUG) that are only of use to developers. 3016 * 3017 * logtrace(): A printf-like function for outputting tracing messages 3018 * (messages at LOG_INFO) from the daemon. This is typically used 3019 * to log the receipt of interesting network-related conditions. 3020 * 3021 * logerr(): A printf-like function for outputting error messages 3022 * (messages at LOG_ERR) from the daemon. 3023 * 3024 * logperror*(): A set of functions used to output error messages 3025 * (messages at LOG_ERR); these automatically append strerror(errno) 3026 * and a newline to the message passed to them. 3027 * 3028 * NOTE: since the logging functions write to syslog, the messages passed 3029 * to them are not eligible for localization. Thus, gettext() must 3030 * *not* be used. 3031 */ 3032 3033 static int logging = 0; 3034 3035 static void 3036 initlog(void) 3037 { 3038 logging++; 3039 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3040 } 3041 3042 /* PRINTFLIKE1 */ 3043 void 3044 logerr(char *fmt, ...) 3045 { 3046 va_list ap; 3047 3048 va_start(ap, fmt); 3049 3050 if (logging) 3051 vsyslog(LOG_ERR, fmt, ap); 3052 else 3053 (void) vfprintf(stderr, fmt, ap); 3054 va_end(ap); 3055 } 3056 3057 /* PRINTFLIKE1 */ 3058 void 3059 logtrace(char *fmt, ...) 3060 { 3061 va_list ap; 3062 3063 va_start(ap, fmt); 3064 3065 if (logging) 3066 vsyslog(LOG_INFO, fmt, ap); 3067 else 3068 (void) vfprintf(stderr, fmt, ap); 3069 va_end(ap); 3070 } 3071 3072 /* PRINTFLIKE1 */ 3073 void 3074 logdebug(char *fmt, ...) 3075 { 3076 va_list ap; 3077 3078 va_start(ap, fmt); 3079 3080 if (logging) 3081 vsyslog(LOG_DEBUG, fmt, ap); 3082 else 3083 (void) vfprintf(stderr, fmt, ap); 3084 va_end(ap); 3085 } 3086 3087 /* PRINTFLIKE1 */ 3088 void 3089 logperror(char *str) 3090 { 3091 if (logging) 3092 syslog(LOG_ERR, "%s: %m\n", str); 3093 else 3094 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3095 } 3096 3097 void 3098 logperror_pii(struct phyint_instance *pii, char *str) 3099 { 3100 if (logging) { 3101 syslog(LOG_ERR, "%s (%s %s): %m\n", 3102 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3103 } else { 3104 (void) fprintf(stderr, "%s (%s %s): %s\n", 3105 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3106 strerror(errno)); 3107 } 3108 } 3109 3110 void 3111 logperror_li(struct logint *li, char *str) 3112 { 3113 struct phyint_instance *pii = li->li_phyint_inst; 3114 3115 if (logging) { 3116 syslog(LOG_ERR, "%s (%s %s): %m\n", 3117 str, AF_STR(pii->pii_af), li->li_name); 3118 } else { 3119 (void) fprintf(stderr, "%s (%s %s): %s\n", 3120 str, AF_STR(pii->pii_af), li->li_name, 3121 strerror(errno)); 3122 } 3123 } 3124 3125 void 3126 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3127 { 3128 if (polled) 3129 (void) poll_remove(pii->pii_probe_sock); 3130 (void) close(pii->pii_probe_sock); 3131 pii->pii_probe_sock = -1; 3132 pii->pii_basetime_inited = 0; 3133 } 3134