1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #pragma ident "%Z%%M% %I% %E% SMI" 27 28 #include "mpd_defs.h" 29 #include "mpd_tables.h" 30 31 int debug = 0; /* Debug flag */ 32 static int pollfd_num = 0; /* Num. of poll descriptors */ 33 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 34 35 /* All times below in ms */ 36 int user_failure_detection_time; /* user specified failure detection */ 37 /* time (fdt) */ 38 int user_probe_interval; /* derived from user specified fdt */ 39 40 static int rtsock_v4; /* AF_INET routing socket */ 41 static int rtsock_v6; /* AF_INET6 routing socket */ 42 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 43 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 44 static int lsock_v4; /* Listen socket to detect mpathd */ 45 static int lsock_v6; /* Listen socket to detect mpathd */ 46 static int mibfd = -1; /* fd to get mib info */ 47 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 48 49 boolean_t full_scan_required = _B_FALSE; 50 static uint_t last_initifs_time; /* Time when initifs was last run */ 51 static char **argv0; /* Saved for re-exec on SIGHUP */ 52 boolean_t handle_link_notifications = _B_TRUE; 53 54 static void initlog(void); 55 static void run_timeouts(void); 56 static void initifs(void); 57 static void check_if_removed(struct phyint_instance *pii); 58 static void select_test_ifs(void); 59 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 60 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 61 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 62 struct in_addr nexthop_v4); 63 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 64 struct in6_addr nexthop_v6); 65 static void router_add_common(int af, char *ifname, 66 struct in6_addr nexthop); 67 static void init_router_targets(); 68 static void cleanup(void); 69 static int setup_listener(int af); 70 static void check_config(void); 71 static void check_testconfig(void); 72 static void check_addr_unique(struct phyint_instance *, 73 struct sockaddr_storage *); 74 static void init_host_targets(void); 75 static void dup_host_targets(struct phyint_instance *desired_pii); 76 static void loopback_cmd(int sock, int family); 77 static int poll_remove(int fd); 78 static boolean_t daemonize(void); 79 static int closefunc(void *, int); 80 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 81 static unsigned int process_query(int fd, mi_query_t *miq); 82 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 83 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 84 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 85 static unsigned int send_result(int fd, unsigned int error, int syserror); 86 87 struct local_addr *laddr_list = NULL; 88 89 /* 90 * Return the current time in milliseconds (from an arbitrary reference) 91 * truncated to fit into an int. Truncation is ok since we are interested 92 * only in differences and not the absolute values. 93 */ 94 uint_t 95 getcurrenttime(void) 96 { 97 uint_t cur_time; /* In ms */ 98 99 /* 100 * Use of a non-user-adjustable source of time is 101 * required. However millisecond precision is sufficient. 102 * divide by 10^6 103 */ 104 cur_time = (uint_t)(gethrtime() / 1000000LL); 105 return (cur_time); 106 } 107 108 uint64_t 109 getcurrentsec(void) 110 { 111 return (gethrtime() / NANOSEC); 112 } 113 114 /* 115 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 116 */ 117 int 118 poll_add(int fd) 119 { 120 int i; 121 int new_num; 122 struct pollfd *newfds; 123 retry: 124 /* Check if already present */ 125 for (i = 0; i < pollfd_num; i++) { 126 if (pollfds[i].fd == fd) 127 return (0); 128 } 129 /* Check for empty spot already present */ 130 for (i = 0; i < pollfd_num; i++) { 131 if (pollfds[i].fd == -1) { 132 pollfds[i].fd = fd; 133 return (0); 134 } 135 } 136 137 /* Allocate space for 32 more fds and initialize to -1 */ 138 new_num = pollfd_num + 32; 139 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 140 if (newfds == NULL) { 141 logperror("poll_add: realloc"); 142 return (-1); 143 } 144 for (i = pollfd_num; i < new_num; i++) { 145 newfds[i].fd = -1; 146 newfds[i].events = POLLIN; 147 } 148 pollfd_num = new_num; 149 pollfds = newfds; 150 goto retry; 151 } 152 153 /* 154 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 155 */ 156 static int 157 poll_remove(int fd) 158 { 159 int i; 160 161 /* Check if already present */ 162 for (i = 0; i < pollfd_num; i++) { 163 if (pollfds[i].fd == fd) { 164 pollfds[i].fd = -1; 165 return (0); 166 } 167 } 168 return (-1); 169 } 170 171 /* 172 * Extract information about the phyint instance. If the phyint instance still 173 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 174 * will use it to detect phyint instances that don't exist any longer and 175 * remove them, from our database of phyint instances. 176 * Return value: 177 * returns true if the phyint instance exists in the kernel, 178 * returns false otherwise 179 */ 180 static boolean_t 181 pii_process(int af, char *name, struct phyint_instance **pii_p) 182 { 183 int err; 184 struct phyint_instance *pii; 185 struct phyint_instance *pii_other; 186 187 if (debug & D_PHYINT) 188 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 189 190 pii = phyint_inst_lookup(af, name); 191 if (pii == NULL) { 192 /* 193 * Phyint instance does not exist in our tables, 194 * create new phyint instance 195 */ 196 pii = phyint_inst_init_from_k(af, name); 197 } else { 198 /* Phyint exists in our tables */ 199 err = phyint_inst_update_from_k(pii); 200 201 switch (err) { 202 case PI_IOCTL_ERROR: 203 /* Some ioctl error. don't change anything */ 204 pii->pii_in_use = 1; 205 break; 206 207 case PI_GROUP_CHANGED: 208 /* 209 * The phyint has changed group. 210 */ 211 restore_phyint(pii->pii_phyint); 212 /* FALLTHRU */ 213 214 case PI_IFINDEX_CHANGED: 215 /* 216 * Interface index has changed. Delete and 217 * recreate the phyint as it is quite likely 218 * the interface has been unplumbed and replumbed. 219 */ 220 pii_other = phyint_inst_other(pii); 221 if (pii_other != NULL) 222 phyint_inst_delete(pii_other); 223 phyint_inst_delete(pii); 224 pii = phyint_inst_init_from_k(af, name); 225 break; 226 227 case PI_DELETED: 228 /* Phyint instance has disappeared from kernel */ 229 pii->pii_in_use = 0; 230 break; 231 232 case PI_OK: 233 /* Phyint instance exists and is fine */ 234 pii->pii_in_use = 1; 235 break; 236 237 default: 238 /* Unknown status */ 239 logerr("pii_process: Unknown status %d\n", err); 240 break; 241 } 242 } 243 244 *pii_p = pii; 245 if (pii != NULL) 246 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 247 else 248 return (_B_FALSE); 249 } 250 251 /* 252 * This phyint is leaving the group. Try to restore the phyint to its 253 * initial state. Return the addresses that belong to other group members, 254 * to the group, and take back any addresses owned by this phyint 255 */ 256 void 257 restore_phyint(struct phyint *pi) 258 { 259 if (pi->pi_group == phyint_anongroup) 260 return; 261 262 /* 263 * Move everthing to some other member in the group. 264 * The phyint has changed group in the kernel. But we 265 * have yet to do it in our tables. 266 */ 267 if (!pi->pi_empty) 268 (void) try_failover(pi, FAILOVER_TO_ANY); 269 /* 270 * Move all addresses owned by 'pi' back to pi, from each 271 * of the other members of the group 272 */ 273 (void) try_failback(pi); 274 } 275 276 /* 277 * Scan all interfaces to detect changes as well as new and deleted interfaces 278 */ 279 static void 280 initifs() 281 { 282 int n; 283 int af; 284 char *cp; 285 char *buf; 286 int numifs; 287 struct lifnum lifn; 288 struct lifconf lifc; 289 struct lifreq *lifr; 290 struct logint *li; 291 struct phyint_instance *pii; 292 struct phyint_instance *next_pii; 293 char pi_name[LIFNAMSIZ + 1]; 294 boolean_t exists; 295 struct phyint *pi; 296 struct local_addr *next; 297 298 if (debug & D_PHYINT) 299 logdebug("initifs: Scanning interfaces\n"); 300 301 last_initifs_time = getcurrenttime(); 302 303 /* 304 * Free the laddr_list before collecting the local addresses. 305 */ 306 while (laddr_list != NULL) { 307 next = laddr_list->next; 308 free(laddr_list); 309 laddr_list = next; 310 } 311 312 /* 313 * Mark the interfaces so that we can find phyints and logints 314 * which have disappeared from the kernel. pii_process() and 315 * logint_init_from_k() will set {pii,li}_in_use when they find 316 * the interface in the kernel. Also, clear dupaddr bit on probe 317 * logint. check_addr_unique() will set the dupaddr bit on the 318 * probe logint, if the testaddress is not unique. 319 */ 320 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 321 pii->pii_in_use = 0; 322 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 323 li->li_in_use = 0; 324 if (pii->pii_probe_logint == li) 325 li->li_dupaddr = 0; 326 } 327 } 328 329 lifn.lifn_family = AF_UNSPEC; 330 lifn.lifn_flags = LIFC_ALLZONES; 331 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 332 logperror("initifs: ioctl (get interface numbers)"); 333 return; 334 } 335 numifs = lifn.lifn_count; 336 337 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 338 if (buf == NULL) { 339 logperror("initifs: calloc"); 340 return; 341 } 342 343 lifc.lifc_family = AF_UNSPEC; 344 lifc.lifc_flags = LIFC_ALLZONES; 345 lifc.lifc_len = numifs * sizeof (struct lifreq); 346 lifc.lifc_buf = buf; 347 348 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 349 /* 350 * EINVAL is commonly encountered, when things change 351 * underneath us rapidly, (eg. at boot, when new interfaces 352 * are plumbed successively) and the kernel finds the buffer 353 * size we passed as too small. We will retry again 354 * when we see the next routing socket msg, or at worst after 355 * IF_SCAN_INTERVAL ms. 356 */ 357 if (errno != EINVAL) { 358 logperror("initifs: ioctl" 359 " (get interface configuration)"); 360 } 361 free(buf); 362 return; 363 } 364 365 lifr = (struct lifreq *)lifc.lifc_req; 366 367 /* 368 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 369 * and get the state of the corresponding phyint_instance. If it is 370 * successful, then call logint_init_from_k() to get the state of the 371 * logint. 372 */ 373 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 374 int sockfd; 375 struct local_addr *taddr; 376 struct sockaddr_in *sin; 377 struct sockaddr_in6 *sin6; 378 struct lifreq lifreq; 379 380 af = lifr->lifr_addr.ss_family; 381 382 /* 383 * Collect all local addresses. 384 */ 385 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 386 (void) memset(&lifreq, 0, sizeof (lifreq)); 387 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, 388 sizeof (lifreq.lifr_name)); 389 390 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 391 if (errno != ENXIO) 392 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 393 continue; 394 } 395 396 /* 397 * Add the interface address to laddr_list. 398 * Another node might have the same IP address which is up. 399 * In that case, it is appropriate to use the address as a 400 * target, even though it is also configured (but not up) on 401 * the local system. 402 * Hence,the interface address is not added to laddr_list 403 * unless it is IFF_UP. 404 */ 405 if (lifreq.lifr_flags & IFF_UP) { 406 taddr = malloc(sizeof (struct local_addr)); 407 if (taddr == NULL) { 408 logperror("initifs: malloc"); 409 continue; 410 } 411 if (af == AF_INET) { 412 sin = (struct sockaddr_in *)&lifr->lifr_addr; 413 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, 414 &taddr->addr); 415 } else { 416 sin6 = (struct sockaddr_in6 *)&lifr->lifr_addr; 417 taddr->addr = sin6->sin6_addr; 418 } 419 taddr->next = laddr_list; 420 laddr_list = taddr; 421 } 422 423 /* 424 * Need to pass a phyint name to pii_process. Insert the 425 * null where the ':' IF_SEPARATOR is found in the logical 426 * name. 427 */ 428 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 429 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 430 *cp = '\0'; 431 432 exists = pii_process(af, pi_name, &pii); 433 if (exists) { 434 /* The phyint is fine. So process the logint */ 435 logint_init_from_k(pii, lifr->lifr_name); 436 check_addr_unique(pii, &lifr->lifr_addr); 437 } 438 439 } 440 441 free(buf); 442 443 /* 444 * Scan for phyints and logints that have disappeared from the 445 * kernel, and delete them. 446 */ 447 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 448 next_pii = pii->pii_next; 449 check_if_removed(pii); 450 } 451 452 /* 453 * Select a test address for sending probes on each phyint instance 454 */ 455 select_test_ifs(); 456 457 /* 458 * Handle link up/down notifications from the NICs. 459 */ 460 process_link_state_changes(); 461 462 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 463 /* 464 * If this is a case of group failure, we don't have much 465 * to do until the group recovers again. 466 */ 467 if (GROUP_FAILED(pi->pi_group)) 468 continue; 469 470 /* 471 * Try/Retry any pending failovers / failbacks, that did not 472 * not complete, or that could not be initiated previously. 473 * This implements the 3 invariants described in the big block 474 * comment at the beginning of probe.c 475 */ 476 if (pi->pi_flags & IFF_INACTIVE) { 477 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 478 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 479 } else { 480 struct phyint_instance *pii; 481 482 /* 483 * Skip LINK UP interfaces which are not capable 484 * of probing. 485 */ 486 pii = pi->pi_v4; 487 if (pii == NULL || 488 (LINK_UP(pi) && !PROBE_CAPABLE(pii))) { 489 pii = pi->pi_v6; 490 if (pii == NULL || 491 (LINK_UP(pi) && !PROBE_CAPABLE(pii))) 492 continue; 493 } 494 495 /* 496 * It is possible that the phyint has started 497 * receiving packets, after it has been marked 498 * PI_FAILED. Don't initiate failover, if the 499 * phyint has started recovering. failure_state() 500 * captures this check. A similar logic is used 501 * for failback/repair case. 502 */ 503 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 504 (failure_state(pii) == PHYINT_FAILURE)) { 505 (void) try_failover(pi, FAILOVER_NORMAL); 506 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 507 if (try_failback(pi) != IPMP_FAILURE) { 508 (void) change_lif_flags(pi, IFF_FAILED, 509 _B_FALSE); 510 /* Per state diagram */ 511 pi->pi_empty = 0; 512 } 513 } 514 } 515 } 516 } 517 518 /* 519 * Check that a given test address is unique across all of the interfaces in a 520 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 521 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 522 * Any issues will be reported by check_testconfig(). 523 */ 524 static void 525 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 526 { 527 struct phyint *pi; 528 struct phyint_group *pg; 529 struct in6_addr addr; 530 struct phyint_instance *pii; 531 struct sockaddr_in *sin; 532 533 if (ss->ss_family == AF_INET) { 534 sin = (struct sockaddr_in *)ss; 535 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 536 } else { 537 assert(ss->ss_family == AF_INET6); 538 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 539 } 540 541 /* 542 * For anonymous groups, every interface is assumed to be on its own 543 * link, so there is no chance of overlapping addresses. 544 */ 545 pg = ourpii->pii_phyint->pi_group; 546 if (pg == phyint_anongroup) 547 return; 548 549 /* 550 * Walk the list of phyint instances in the group and check for test 551 * addresses matching ours. Of course, we skip ourself. 552 */ 553 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 554 pii = PHYINT_INSTANCE(pi, ss->ss_family); 555 if (pii == NULL || pii == ourpii || 556 pii->pii_probe_logint == NULL) 557 continue; 558 559 /* 560 * If this test address is not unique, set the dupaddr bit. 561 */ 562 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 563 pii->pii_probe_logint->li_dupaddr = 1; 564 } 565 } 566 567 /* 568 * Stop probing an interface. Called when an interface is offlined. 569 * The probe socket is closed on each interface instance, and the 570 * interface state set to PI_OFFLINE. 571 */ 572 static void 573 stop_probing(struct phyint *pi) 574 { 575 struct phyint_instance *pii; 576 577 pii = pi->pi_v4; 578 if (pii != NULL) { 579 if (pii->pii_probe_sock != -1) 580 close_probe_socket(pii, _B_TRUE); 581 pii->pii_probe_logint = NULL; 582 } 583 584 pii = pi->pi_v6; 585 if (pii != NULL) { 586 if (pii->pii_probe_sock != -1) 587 close_probe_socket(pii, _B_TRUE); 588 pii->pii_probe_logint = NULL; 589 } 590 591 phyint_chstate(pi, PI_OFFLINE); 592 } 593 594 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 595 596 /* 597 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 598 * IFF_UP must also be set so that the associated address can be used as a 599 * source address. Further, we must be able to exchange packets with local 600 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 601 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 602 */ 603 static int 604 rate_testflags(uint64_t flags) 605 { 606 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 607 return (BAD_TESTFLAGS); 608 609 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 610 return (BAD_TESTFLAGS); 611 612 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 613 return (BEST_TESTFLAGS); 614 615 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 616 return (BEST_TESTFLAGS); 617 618 return (OK_TESTFLAGS); 619 } 620 621 /* 622 * Attempt to select a test address for each phyint instance. 623 * Call phyint_inst_sockinit() to complete the initializations. 624 */ 625 static void 626 select_test_ifs(void) 627 { 628 struct phyint *pi; 629 struct phyint_instance *pii; 630 struct phyint_instance *next_pii; 631 struct logint *li; 632 struct logint *probe_logint; 633 boolean_t target_scan_reqd = _B_FALSE; 634 struct target *tg; 635 int rating; 636 637 if (debug & D_PHYINT) 638 logdebug("select_test_ifs\n"); 639 640 /* 641 * For each phyint instance, do the test address selection 642 */ 643 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 644 next_pii = pii->pii_next; 645 probe_logint = NULL; 646 647 /* 648 * An interface that is offline, should not be probed. 649 * Offline interfaces should always in PI_OFFLINE state, 650 * unless some other entity has set the offline flag. 651 */ 652 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 653 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 654 logerr("shouldn't be probing offline" 655 " interface %s (state is: %u)." 656 " Stopping probes.\n", 657 pii->pii_phyint->pi_name, 658 pii->pii_phyint->pi_state); 659 stop_probing(pii->pii_phyint); 660 } 661 continue; 662 } 663 664 li = pii->pii_probe_logint; 665 if (li != NULL) { 666 /* 667 * We've already got a test address; only proceed 668 * if it's suboptimal. 669 */ 670 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 671 continue; 672 } 673 674 /* 675 * Walk the logints of this phyint instance, and select 676 * the best available test address 677 */ 678 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 679 /* 680 * Skip 0.0.0.0 addresses, as those are never 681 * actually usable. 682 */ 683 if (pii->pii_af == AF_INET && 684 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 685 continue; 686 687 /* 688 * Skip any IPv6 logints that are not link-local, 689 * since we should always have a link-local address 690 * anyway and in6_data() expects link-local replies. 691 */ 692 if (pii->pii_af == AF_INET6 && 693 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 694 continue; 695 696 /* 697 * Rate the testflags. If we've found an optimal 698 * match, then break out; otherwise, record the most 699 * recent OK one. 700 */ 701 rating = rate_testflags(li->li_flags); 702 if (rating == BAD_TESTFLAGS) 703 continue; 704 705 probe_logint = li; 706 if (rating == BEST_TESTFLAGS) 707 break; 708 } 709 710 /* 711 * If the probe logint has changed, ditch the old one. 712 */ 713 if (pii->pii_probe_logint != NULL && 714 pii->pii_probe_logint != probe_logint) { 715 if (pii->pii_probe_sock != -1) 716 close_probe_socket(pii, _B_TRUE); 717 pii->pii_probe_logint = NULL; 718 } 719 720 if (probe_logint == NULL) { 721 /* 722 * We don't have a test address; zero out the probe 723 * stats array since it is no longer relevant. 724 * Optimize by checking if it is already zeroed out. 725 */ 726 int pr_ndx; 727 728 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 729 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 730 clear_pii_probe_stats(pii); 731 reset_crtt_all(pii->pii_phyint); 732 } 733 continue; 734 } else if (probe_logint == pii->pii_probe_logint) { 735 /* 736 * If we didn't find any new test addr, go to the 737 * next phyint. 738 */ 739 continue; 740 } 741 742 /* 743 * The phyint is either being assigned a new testaddr 744 * or is being assigned a testaddr for the 1st time. 745 * Need to initialize the phyint socket 746 */ 747 pii->pii_probe_logint = probe_logint; 748 if (!phyint_inst_sockinit(pii)) { 749 if (debug & D_PHYINT) { 750 logdebug("select_test_ifs: " 751 "phyint_sockinit failed\n"); 752 } 753 phyint_inst_delete(pii); 754 continue; 755 } 756 757 /* 758 * This phyint instance is now enabled for probes; this 759 * impacts our state machine in two ways: 760 * 761 * 1. If we're probe *capable* as well (i.e., we have 762 * probe targets) and the interface is in PI_NOTARGETS, 763 * then transition to PI_RUNNING. 764 * 765 * 2. If we're not probe capable, and the other phyint 766 * instance is also not probe capable, and we were in 767 * PI_RUNNING, then transition to PI_NOTARGETS. 768 * 769 * Also see the state diagram in mpd_probe.c. 770 */ 771 if (PROBE_CAPABLE(pii)) { 772 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 773 phyint_chstate(pii->pii_phyint, PI_RUNNING); 774 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 775 if (pii->pii_phyint->pi_state == PI_RUNNING) 776 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 777 } 778 779 if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) { 780 tg = pii->pii_targets; 781 if (tg != NULL) 782 target_delete(tg); 783 assert(pii->pii_targets == NULL); 784 assert(pii->pii_target_next == NULL); 785 assert(pii->pii_ntargets == 0); 786 target_create(pii, probe_logint->li_dstaddr, 787 _B_TRUE); 788 } 789 790 /* 791 * If no targets are currently known for this phyint 792 * we need to call init_router_targets. Since 793 * init_router_targets() initializes the list of targets 794 * for all phyints it is done below the loop. 795 */ 796 if (pii->pii_targets == NULL) 797 target_scan_reqd = _B_TRUE; 798 799 /* 800 * Start the probe timer for this instance. 801 */ 802 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 803 start_timer(pii); 804 pii->pii_basetime_inited = 1; 805 } 806 } 807 808 /* 809 * Check the interface list for any interfaces that are marked 810 * PI_FAILED but no longer enabled to send probes, and call 811 * phyint_check_for_repair() to see if the link now indicates that the 812 * interface should be repaired. Also see the state diagram in 813 * mpd_probe.c. 814 */ 815 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 816 if (pi->pi_state == PI_FAILED && 817 !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) { 818 phyint_check_for_repair(pi); 819 } 820 } 821 822 check_testconfig(); 823 824 /* 825 * Try to populate the target list. init_router_targets populates 826 * the target list from the routing table. If our target list is 827 * still empty, init_host_targets adds host targets based on the 828 * host target list of other phyints in the group. 829 */ 830 if (target_scan_reqd) { 831 init_router_targets(); 832 init_host_targets(); 833 } 834 } 835 836 /* 837 * Check test address configuration, and log warnings if appropriate. Note 838 * that this function only logs pre-existing conditions (e.g., that probe- 839 * based failure detection is disabled). 840 */ 841 static void 842 check_testconfig(void) 843 { 844 struct phyint *pi; 845 struct logint *li; 846 char abuf[INET6_ADDRSTRLEN]; 847 848 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 849 if (pi->pi_flags & IFF_OFFLINE) 850 continue; 851 852 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) { 853 if (pi->pi_taddrmsg_printed || 854 pi->pi_duptaddrmsg_printed) { 855 logerr("Test address now configured on " 856 "interface %s; enabling probe-based " 857 "failure detection on it\n", pi->pi_name); 858 pi->pi_taddrmsg_printed = 0; 859 pi->pi_duptaddrmsg_printed = 0; 860 } 861 continue; 862 } 863 864 li = NULL; 865 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL && 866 pi->pi_v4->pii_probe_logint->li_dupaddr) 867 li = pi->pi_v4->pii_probe_logint; 868 869 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL && 870 pi->pi_v6->pii_probe_logint->li_dupaddr) 871 li = pi->pi_v6->pii_probe_logint; 872 873 if (li != NULL) { 874 if (!pi->pi_duptaddrmsg_printed) { 875 (void) pr_addr(li->li_phyint_inst->pii_af, 876 li->li_addr, abuf, sizeof (abuf)); 877 logerr("Test address %s is not unique in " 878 "group; disabling probe-based failure " 879 "detection on %s\n", abuf, pi->pi_name); 880 pi->pi_duptaddrmsg_printed = 1; 881 } 882 continue; 883 } 884 885 if (getcurrentsec() < pi->pi_taddrthresh) 886 continue; 887 888 if (!pi->pi_taddrmsg_printed) { 889 logerr("No test address configured on interface %s; " 890 "disabling probe-based failure detection on it\n", 891 pi->pi_name); 892 pi->pi_taddrmsg_printed = 1; 893 } 894 } 895 } 896 897 /* 898 * Check phyint group configuration, to detect any inconsistencies, 899 * and log an error message. This is called from runtimeouts every 900 * 20 secs. But the error message is displayed once. If the 901 * consistency is resolved by the admin, a recovery message is displayed 902 * once. 903 */ 904 static void 905 check_config(void) 906 { 907 struct phyint_group *pg; 908 struct phyint *pi; 909 boolean_t v4_in_group; 910 boolean_t v6_in_group; 911 912 /* 913 * All phyints of a group must be homogenous to ensure that 914 * failover or failback can be done. If any phyint in a group 915 * has IPv4 plumbed, check that all phyints have IPv4 plumbed. 916 * Do a similar check for IPv6. 917 */ 918 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 919 if (pg == phyint_anongroup) 920 continue; 921 922 v4_in_group = _B_FALSE; 923 v6_in_group = _B_FALSE; 924 /* 925 * 1st pass. Determine if at least 1 phyint in the group 926 * has IPv4 plumbed and if so set v4_in_group to true. 927 * Repeat similarly for IPv6. 928 */ 929 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 930 if (pi->pi_v4 != NULL) 931 v4_in_group = _B_TRUE; 932 if (pi->pi_v6 != NULL) 933 v6_in_group = _B_TRUE; 934 } 935 936 /* 937 * 2nd pass. If v4_in_group is true, check that phyint 938 * has IPv4 plumbed. Repeat similarly for IPv6. Print 939 * out a message the 1st time only. 940 */ 941 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 942 if (pi->pi_flags & IFF_OFFLINE) 943 continue; 944 945 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 946 if (!pi->pi_cfgmsg_printed) { 947 logerr("NIC %s of group %s is" 948 " not plumbed for IPv4 and may" 949 " affect failover capability\n", 950 pi->pi_name, 951 pi->pi_group->pg_name); 952 pi->pi_cfgmsg_printed = 1; 953 } 954 } else if (v6_in_group == _B_TRUE && 955 pi->pi_v6 == NULL) { 956 if (!pi->pi_cfgmsg_printed) { 957 logerr("NIC %s of group %s is" 958 " not plumbed for IPv6 and may" 959 " affect failover capability\n", 960 pi->pi_name, 961 pi->pi_group->pg_name); 962 pi->pi_cfgmsg_printed = 1; 963 } 964 } else { 965 /* 966 * The phyint matches the group configuration, 967 * if we have reached this point. If it was 968 * improperly configured earlier, log an 969 * error recovery message 970 */ 971 if (pi->pi_cfgmsg_printed) { 972 logerr("NIC %s is now consistent with " 973 "group %s and failover capability " 974 "is restored\n", pi->pi_name, 975 pi->pi_group->pg_name); 976 pi->pi_cfgmsg_printed = 0; 977 } 978 } 979 980 } 981 } 982 } 983 984 /* 985 * Timer mechanism using relative time (in milliseconds) from the 986 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 987 * will fire after TIMER_INFINITY milliseconds. 988 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 989 * time values. Hence 2 consecutive timer events cannot be spaced farther 990 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 991 * that can be passed for the delay parameter of timer_schedule() 992 */ 993 static uint_t timer_next; /* Currently scheduled timeout */ 994 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 995 996 static void 997 timer_init(void) 998 { 999 timer_next = getcurrenttime() + TIMER_INFINITY; 1000 /* 1001 * The call to run_timeouts() will get the timer started 1002 * Since there are no phyints at this point, the timer will 1003 * be set for IF_SCAN_INTERVAL ms. 1004 */ 1005 run_timeouts(); 1006 } 1007 1008 /* 1009 * Make sure the next SIGALRM occurs delay milliseconds from the current 1010 * time if not earlier. We are interested only in time differences. 1011 */ 1012 void 1013 timer_schedule(uint_t delay) 1014 { 1015 uint_t now; 1016 struct itimerval itimerval; 1017 1018 if (debug & D_TIMER) 1019 logdebug("timer_schedule(%u)\n", delay); 1020 1021 assert(delay <= TIMER_INFINITY); 1022 1023 now = getcurrenttime(); 1024 if (delay == 0) { 1025 /* Minimum allowed delay */ 1026 delay = 1; 1027 } 1028 /* Will this timer occur before the currently scheduled SIGALRM? */ 1029 if (timer_active && TIME_GE(now + delay, timer_next)) { 1030 if (debug & D_TIMER) { 1031 logdebug("timer_schedule(%u) - no action: " 1032 "now %u next %u\n", delay, now, timer_next); 1033 } 1034 return; 1035 } 1036 timer_next = now + delay; 1037 1038 itimerval.it_value.tv_sec = delay / 1000; 1039 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1040 itimerval.it_interval.tv_sec = 0; 1041 itimerval.it_interval.tv_usec = 0; 1042 if (debug & D_TIMER) { 1043 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1044 delay, itimerval.it_value.tv_sec, 1045 itimerval.it_value.tv_usec); 1046 } 1047 timer_active = _B_TRUE; 1048 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1049 logperror("timer_schedule: setitimer"); 1050 exit(2); 1051 } 1052 } 1053 1054 /* 1055 * Timer has fired. Determine when the next timer event will occur by asking 1056 * all the timer routines. Should not be called from a timer routine. 1057 */ 1058 static void 1059 run_timeouts(void) 1060 { 1061 uint_t next; 1062 uint_t next_event_time; 1063 struct phyint_instance *pii; 1064 struct phyint_instance *next_pii; 1065 static boolean_t timeout_running; 1066 1067 /* assert that recursive timeouts don't happen. */ 1068 assert(!timeout_running); 1069 1070 timeout_running = _B_TRUE; 1071 1072 if (debug & D_TIMER) 1073 logdebug("run_timeouts()\n"); 1074 1075 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1076 initifs(); 1077 check_config(); 1078 } 1079 1080 next = TIMER_INFINITY; 1081 1082 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1083 next_pii = pii->pii_next; 1084 next_event_time = phyint_inst_timer(pii); 1085 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1086 next = next_event_time; 1087 1088 if (debug & D_TIMER) { 1089 logdebug("run_timeouts(%s %s): next scheduled for" 1090 " this phyint inst %u, next scheduled global" 1091 " %u ms\n", 1092 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1093 next_event_time, next); 1094 } 1095 } 1096 1097 /* 1098 * Make sure initifs() is called at least once every 1099 * IF_SCAN_INTERVAL, to make sure that we are in sync 1100 * with the kernel, in case we have missed any routing 1101 * socket messages. 1102 */ 1103 if (next > IF_SCAN_INTERVAL) 1104 next = IF_SCAN_INTERVAL; 1105 1106 if (debug & D_TIMER) 1107 logdebug("run_timeouts: %u ms\n", next); 1108 1109 timer_schedule(next); 1110 timeout_running = _B_FALSE; 1111 } 1112 1113 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1114 static int eventpipe_write = -1; 1115 static boolean_t cleanup_started = _B_FALSE; 1116 /* Don't write to eventpipe if in cleanup */ 1117 /* 1118 * Ensure that signals are processed synchronously with the rest of 1119 * the code by just writing a one character signal number on the pipe. 1120 * The poll loop will pick this up and process the signal event. 1121 */ 1122 static void 1123 sig_handler(int signo) 1124 { 1125 uchar_t buf = (uchar_t)signo; 1126 1127 /* 1128 * Don't write to pipe if cleanup has already begun. cleanup() 1129 * might have closed the pipe already 1130 */ 1131 if (cleanup_started) 1132 return; 1133 1134 if (eventpipe_write == -1) { 1135 logerr("sig_handler: no pipe found\n"); 1136 return; 1137 } 1138 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1139 logperror("sig_handler: write"); 1140 } 1141 1142 extern struct probes_missed probes_missed; 1143 1144 /* 1145 * Pick up a signal "byte" from the pipe and process it. 1146 */ 1147 static void 1148 in_signal(int fd) 1149 { 1150 uchar_t buf; 1151 uint64_t sent, acked, lost, unacked, unknown; 1152 struct phyint_instance *pii; 1153 int pr_ndx; 1154 1155 switch (read(fd, &buf, sizeof (buf))) { 1156 case -1: 1157 logperror("in_signal: read"); 1158 exit(1); 1159 /* NOTREACHED */ 1160 case 1: 1161 break; 1162 case 0: 1163 logerr("in_signal: read end of file\n"); 1164 exit(1); 1165 /* NOTREACHED */ 1166 default: 1167 logerr("in_signal: read > 1\n"); 1168 exit(1); 1169 } 1170 1171 if (debug & D_TIMER) 1172 logdebug("in_signal() got %d\n", buf); 1173 1174 switch (buf) { 1175 case SIGALRM: 1176 if (debug & D_TIMER) { 1177 uint_t now = getcurrenttime(); 1178 1179 logdebug("in_signal(SIGALRM) delta %u\n", 1180 now - timer_next); 1181 } 1182 timer_active = _B_FALSE; 1183 run_timeouts(); 1184 break; 1185 case SIGUSR1: 1186 logdebug("Printing configuration:\n"); 1187 /* Print out the internal tables */ 1188 phyint_inst_print_all(); 1189 1190 /* 1191 * Print out the accumulated statistics about missed 1192 * probes (happens due to scheduling delay). 1193 */ 1194 logerr("Missed sending total of %d probes spread over" 1195 " %d occurrences\n", probes_missed.pm_nprobes, 1196 probes_missed.pm_ntimes); 1197 1198 /* 1199 * Print out the accumulated statistics about probes 1200 * that were sent. 1201 */ 1202 for (pii = phyint_instances; pii != NULL; 1203 pii = pii->pii_next) { 1204 unacked = 0; 1205 acked = pii->pii_cum_stats.acked; 1206 lost = pii->pii_cum_stats.lost; 1207 sent = pii->pii_cum_stats.sent; 1208 unknown = pii->pii_cum_stats.unknown; 1209 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1210 switch (pii->pii_probes[pr_ndx].pr_status) { 1211 case PR_ACKED: 1212 acked++; 1213 break; 1214 case PR_LOST: 1215 lost++; 1216 break; 1217 case PR_UNACKED: 1218 unacked++; 1219 break; 1220 } 1221 } 1222 logerr("\nProbe stats on (%s %s)\n" 1223 "Number of probes sent %lld\n" 1224 "Number of probe acks received %lld\n" 1225 "Number of probes/acks lost %lld\n" 1226 "Number of valid unacknowled probes %lld\n" 1227 "Number of ambiguous probe acks received %lld\n", 1228 AF_STR(pii->pii_af), pii->pii_name, 1229 sent, acked, lost, unacked, unknown); 1230 } 1231 break; 1232 case SIGHUP: 1233 logerr("SIGHUP: restart and reread config file\n"); 1234 cleanup(); 1235 (void) execv(argv0[0], argv0); 1236 _exit(0177); 1237 /* NOTREACHED */ 1238 case SIGINT: 1239 case SIGTERM: 1240 case SIGQUIT: 1241 cleanup(); 1242 exit(0); 1243 /* NOTREACHED */ 1244 default: 1245 logerr("in_signal: unknown signal: %d\n", buf); 1246 } 1247 } 1248 1249 static void 1250 cleanup(void) 1251 { 1252 struct phyint_instance *pii; 1253 struct phyint_instance *next_pii; 1254 1255 /* 1256 * Make sure that we don't write to eventpipe in 1257 * sig_handler() if any signal notably SIGALRM, 1258 * occurs after we close the eventpipe descriptor below 1259 */ 1260 cleanup_started = _B_TRUE; 1261 1262 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1263 next_pii = pii->pii_next; 1264 phyint_inst_delete(pii); 1265 } 1266 1267 (void) close(ifsock_v4); 1268 (void) close(ifsock_v6); 1269 (void) close(rtsock_v4); 1270 (void) close(rtsock_v6); 1271 (void) close(lsock_v4); 1272 (void) close(lsock_v6); 1273 (void) close(0); 1274 (void) close(1); 1275 (void) close(2); 1276 (void) close(mibfd); 1277 (void) close(eventpipe_read); 1278 (void) close(eventpipe_write); 1279 } 1280 1281 /* 1282 * Create pipe for signal delivery and set up signal handlers. 1283 */ 1284 static void 1285 setup_eventpipe(void) 1286 { 1287 int fds[2]; 1288 struct sigaction act; 1289 1290 if ((pipe(fds)) < 0) { 1291 logperror("setup_eventpipe: pipe"); 1292 exit(1); 1293 } 1294 eventpipe_read = fds[0]; 1295 eventpipe_write = fds[1]; 1296 if (poll_add(eventpipe_read) == -1) { 1297 exit(1); 1298 } 1299 1300 act.sa_handler = sig_handler; 1301 act.sa_flags = SA_RESTART; 1302 (void) sigaction(SIGALRM, &act, NULL); 1303 1304 (void) sigset(SIGHUP, sig_handler); 1305 (void) sigset(SIGUSR1, sig_handler); 1306 (void) sigset(SIGTERM, sig_handler); 1307 (void) sigset(SIGINT, sig_handler); 1308 (void) sigset(SIGQUIT, sig_handler); 1309 } 1310 1311 /* 1312 * Create a routing socket for receiving RTM_IFINFO messages. 1313 */ 1314 static int 1315 setup_rtsock(int af) 1316 { 1317 int s; 1318 int flags; 1319 1320 s = socket(PF_ROUTE, SOCK_RAW, af); 1321 if (s == -1) { 1322 logperror("setup_rtsock: socket PF_ROUTE"); 1323 exit(1); 1324 } 1325 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1326 logperror("setup_rtsock: fcntl F_GETFL"); 1327 (void) close(s); 1328 exit(1); 1329 } 1330 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1331 logperror("setup_rtsock: fcntl F_SETFL"); 1332 (void) close(s); 1333 exit(1); 1334 } 1335 if (poll_add(s) == -1) { 1336 (void) close(s); 1337 exit(1); 1338 } 1339 return (s); 1340 } 1341 1342 /* 1343 * Process an RTM_IFINFO message received on a routing socket. 1344 * The return value indicates whether a full interface scan is required. 1345 * Link up/down notifications from the NICs are reflected in the 1346 * IFF_RUNNING flag. 1347 * If just the state of the IFF_RUNNING interface flag has changed, a 1348 * a full interface scan isn't required. 1349 */ 1350 static boolean_t 1351 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1352 { 1353 struct sockaddr_dl *sdl; 1354 struct phyint *pi; 1355 uint64_t old_flags; 1356 struct phyint_instance *pii; 1357 1358 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1359 1360 /* 1361 * Although the sockaddr_dl structure is directly after the 1362 * if_msghdr_t structure. At the time of writing, the size of the 1363 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1364 * to the presence of a timeval structure, which contains longs, 1365 * in the if_data structure. Anyway, we know where the message ends, 1366 * so we work backwards to get the start of the sockaddr_dl structure. 1367 */ 1368 /*LINTED*/ 1369 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1370 sizeof (struct sockaddr_dl)); 1371 1372 assert(sdl->sdl_family == AF_LINK); 1373 1374 /* 1375 * The interface name is in sdl_data. 1376 * RTM_IFINFO messages are only generated for logical interface 1377 * zero, so there is no colon and logical interface number to 1378 * strip from the name. The name is not null terminated, but 1379 * there should be enough space in sdl_data to add the null. 1380 */ 1381 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1382 if (debug & D_LINKNOTE) 1383 logdebug("process_rtm_ifinfo: phyint name too long\n"); 1384 return (_B_TRUE); 1385 } 1386 sdl->sdl_data[sdl->sdl_nlen] = 0; 1387 1388 pi = phyint_lookup(sdl->sdl_data); 1389 if (pi == NULL) { 1390 if (debug & D_LINKNOTE) 1391 logdebug("process_rtm_ifinfo: phyint lookup failed" 1392 " for %s\n", sdl->sdl_data); 1393 return (_B_TRUE); 1394 } 1395 1396 /* 1397 * We want to try and avoid doing a full interface scan for 1398 * link state notifications from the NICs, as indicated 1399 * by the state of the IFF_RUNNING flag. If just the 1400 * IFF_RUNNING flag has changed state, the link state changes 1401 * are processed without a full scan. 1402 * If there is both an IPv4 and IPv6 instance associated with 1403 * the physical interface, we will get an RTM_IFINFO message 1404 * for each instance. If we just maintained a single copy of 1405 * the physical interface flags, it would appear that no flags 1406 * had changed when the second message is processed, leading us 1407 * to believe that the message wasn't generated by a flags change, 1408 * and that a full interface scan is required. 1409 * To get around this problem, two additional copies of the flags 1410 * are kept, one copy for each instance. These are only used in 1411 * this routine. At any one time, all three copies of the flags 1412 * should be identical except for the IFF_RUNNING flag. The 1413 * copy of the flags in the "phyint" structure is always up to 1414 * date. 1415 */ 1416 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1417 if (pii == NULL) { 1418 if (debug & D_LINKNOTE) 1419 logdebug("process_rtm_ifinfo: no instance of address " 1420 "family %s for %s\n", AF_STR(type), pi->pi_name); 1421 return (_B_TRUE); 1422 } 1423 1424 old_flags = pii->pii_flags; 1425 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1426 pi->pi_flags = pii->pii_flags; 1427 1428 if (debug & D_LINKNOTE) { 1429 logdebug("process_rtm_ifinfo: %s address family: %s, " 1430 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1431 AF_STR(type), old_flags, pi->pi_flags); 1432 } 1433 1434 /* 1435 * If IFF_STANDBY has changed, indicate that the interface has changed 1436 * types. 1437 */ 1438 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1439 phyint_newtype(pi); 1440 1441 /* 1442 * If IFF_INACTIVE has been set, then no data addresses should be 1443 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1444 * move previously failed-over addresses back to it, provided it is 1445 * not failed. For details, see the state diagram in mpd_probe.c. 1446 */ 1447 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1448 if (pii->pii_flags & IFF_INACTIVE) { 1449 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 1450 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1451 } else { 1452 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1453 pi->pi_empty = 0; 1454 (void) try_failback(pi); 1455 } 1456 } 1457 } 1458 1459 /* Has just the IFF_RUNNING flag changed state ? */ 1460 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1461 struct phyint_instance *pii_other; 1462 /* 1463 * It wasn't just a link state change. Update 1464 * the other instance's copy of the flags. 1465 */ 1466 pii_other = phyint_inst_other(pii); 1467 if (pii_other != NULL) 1468 pii_other->pii_flags = pii->pii_flags; 1469 return (_B_TRUE); 1470 } 1471 1472 return (_B_FALSE); 1473 } 1474 1475 /* 1476 * Retrieve as many routing socket messages as possible, and try to 1477 * empty the routing sockets. Initiate full scan of targets or interfaces 1478 * as needed. 1479 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1480 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1481 */ 1482 static void 1483 process_rtsock(int rtsock_v4, int rtsock_v6) 1484 { 1485 int nbytes; 1486 int64_t msg[2048 / 8]; 1487 struct rt_msghdr *rtm; 1488 boolean_t need_if_scan = _B_FALSE; 1489 boolean_t need_rt_scan = _B_FALSE; 1490 boolean_t rtm_ifinfo_seen = _B_FALSE; 1491 int type; 1492 1493 /* Read as many messages as possible and try to empty the sockets */ 1494 for (type = AF_INET; ; type = AF_INET6) { 1495 for (;;) { 1496 nbytes = read((type == AF_INET) ? rtsock_v4 : 1497 rtsock_v6, msg, sizeof (msg)); 1498 if (nbytes <= 0) { 1499 /* No more messages */ 1500 break; 1501 } 1502 rtm = (struct rt_msghdr *)msg; 1503 if (rtm->rtm_version != RTM_VERSION) { 1504 logerr("process_rtsock: version %d " 1505 "not understood\n", rtm->rtm_version); 1506 break; 1507 } 1508 1509 if (debug & D_PHYINT) { 1510 logdebug("process_rtsock: message %d\n", 1511 rtm->rtm_type); 1512 } 1513 1514 switch (rtm->rtm_type) { 1515 case RTM_NEWADDR: 1516 case RTM_DELADDR: 1517 /* 1518 * Some logical interface has changed, 1519 * have to scan everything to determine 1520 * what actually changed. 1521 */ 1522 need_if_scan = _B_TRUE; 1523 break; 1524 1525 case RTM_IFINFO: 1526 rtm_ifinfo_seen = _B_TRUE; 1527 need_if_scan |= process_rtm_ifinfo( 1528 (if_msghdr_t *)rtm, type); 1529 break; 1530 1531 case RTM_ADD: 1532 case RTM_DELETE: 1533 case RTM_CHANGE: 1534 case RTM_OLDADD: 1535 case RTM_OLDDEL: 1536 need_rt_scan = _B_TRUE; 1537 break; 1538 1539 default: 1540 /* Not interesting */ 1541 break; 1542 } 1543 } 1544 if (type == AF_INET6) 1545 break; 1546 } 1547 1548 if (need_if_scan) { 1549 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1550 logdebug("process_rtsock: synchronizing with kernel\n"); 1551 initifs(); 1552 } else if (rtm_ifinfo_seen) { 1553 if (debug & D_LINKNOTE) 1554 logdebug("process_rtsock: " 1555 "link up/down notification(s) seen\n"); 1556 process_link_state_changes(); 1557 } 1558 1559 if (need_rt_scan) 1560 init_router_targets(); 1561 } 1562 1563 /* 1564 * Look if the phyint instance or one of its logints have been removed from 1565 * the kernel and take appropriate action. 1566 * Uses {pii,li}_in_use. 1567 */ 1568 static void 1569 check_if_removed(struct phyint_instance *pii) 1570 { 1571 struct logint *li; 1572 struct logint *next_li; 1573 1574 /* Detect phyints that have been removed from the kernel. */ 1575 if (!pii->pii_in_use) { 1576 logtrace("%s %s has been removed from kernel\n", 1577 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1578 phyint_inst_delete(pii); 1579 } else { 1580 /* Detect logints that have been removed. */ 1581 for (li = pii->pii_logint; li != NULL; li = next_li) { 1582 next_li = li->li_next; 1583 if (!li->li_in_use) { 1584 logint_delete(li); 1585 } 1586 } 1587 } 1588 } 1589 1590 /* 1591 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1592 * tables defined by mib2.h. Parse the returned data and extract 1593 * the 'routing' information table. Process the 'routing' table 1594 * to get the list of known onlink routers, and update our database. 1595 * These onlink routers will serve as our probe targets. 1596 * Returns false, if any system calls resulted in errors, true otherwise. 1597 */ 1598 static boolean_t 1599 update_router_list(int fd) 1600 { 1601 union { 1602 char ubuf[1024]; 1603 union T_primitives uprim; 1604 } buf; 1605 1606 int flags; 1607 struct strbuf ctlbuf; 1608 struct strbuf databuf; 1609 struct T_optmgmt_req *tor; 1610 struct T_optmgmt_ack *toa; 1611 struct T_error_ack *tea; 1612 struct opthdr *optp; 1613 struct opthdr *req; 1614 int status; 1615 t_scalar_t prim; 1616 1617 tor = (struct T_optmgmt_req *)&buf; 1618 1619 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1620 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1621 tor->OPT_length = sizeof (struct opthdr); 1622 tor->MGMT_flags = T_CURRENT; 1623 1624 req = (struct opthdr *)&tor[1]; 1625 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1626 req->name = 0; 1627 req->len = 0; 1628 1629 ctlbuf.buf = (char *)&buf; 1630 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1631 ctlbuf.maxlen = sizeof (buf); 1632 flags = 0; 1633 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1634 logperror("update_router_list: putmsg(ctl)"); 1635 return (_B_FALSE); 1636 } 1637 1638 /* 1639 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1640 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 1641 * a control and data part. The control part contains a struct 1642 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1643 * the level, name and length of the data in the data part. The 1644 * data part contains the actual table data. The last message 1645 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1646 * single option with zero optlen. 1647 */ 1648 1649 for (;;) { 1650 /* 1651 * Go around this loop once for each table. Ignore 1652 * all tables except the routing information table. 1653 */ 1654 flags = 0; 1655 status = getmsg(fd, &ctlbuf, NULL, &flags); 1656 if (status < 0) { 1657 if (errno == EINTR) 1658 continue; 1659 logperror("update_router_list: getmsg(ctl)"); 1660 return (_B_FALSE); 1661 } 1662 if (ctlbuf.len < sizeof (t_scalar_t)) { 1663 logerr("update_router_list: ctlbuf.len %d\n", 1664 ctlbuf.len); 1665 return (_B_FALSE); 1666 } 1667 1668 prim = buf.uprim.type; 1669 1670 switch (prim) { 1671 1672 case T_ERROR_ACK: 1673 tea = &buf.uprim.error_ack; 1674 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1675 logerr("update_router_list: T_ERROR_ACK" 1676 " ctlbuf.len %d\n", ctlbuf.len); 1677 return (_B_FALSE); 1678 } 1679 logerr("update_router_list: T_ERROR_ACK:" 1680 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1681 tea->TLI_error, tea->UNIX_error); 1682 return (_B_FALSE); 1683 1684 case T_OPTMGMT_ACK: 1685 toa = &buf.uprim.optmgmt_ack; 1686 optp = (struct opthdr *)&toa[1]; 1687 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1688 logerr("update_router_list: ctlbuf.len %d\n", 1689 ctlbuf.len); 1690 return (_B_FALSE); 1691 } 1692 if (toa->MGMT_flags != T_SUCCESS) { 1693 logerr("update_router_list: MGMT_flags 0x%lx\n", 1694 toa->MGMT_flags); 1695 return (_B_FALSE); 1696 } 1697 break; 1698 1699 default: 1700 logerr("update_router_list: unknown primitive %ld\n", 1701 prim); 1702 return (_B_FALSE); 1703 } 1704 1705 /* Process the T_OPGMGMT_ACK below */ 1706 assert(prim == T_OPTMGMT_ACK); 1707 1708 switch (status) { 1709 case 0: 1710 /* 1711 * We have reached the end of this T_OPTMGMT_ACK 1712 * message. If this is the last message i.e EOD, 1713 * return, else process the next T_OPTMGMT_ACK msg. 1714 */ 1715 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1716 sizeof (struct opthdr)) && optp->len == 0 && 1717 optp->name == 0 && optp->level == 0) { 1718 /* 1719 * This is the EOD message. Return 1720 */ 1721 return (_B_TRUE); 1722 } 1723 continue; 1724 1725 case MORECTL: 1726 case MORECTL | MOREDATA: 1727 /* 1728 * This should not happen. We should be able to read 1729 * the control portion in a single getmsg. 1730 */ 1731 logerr("update_router_list: MORECTL\n"); 1732 return (_B_FALSE); 1733 1734 case MOREDATA: 1735 databuf.maxlen = optp->len; 1736 /* malloc of 0 bytes is ok */ 1737 databuf.buf = malloc((size_t)optp->len); 1738 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1739 logperror("update_router_list: malloc"); 1740 return (_B_FALSE); 1741 } 1742 databuf.len = 0; 1743 flags = 0; 1744 for (;;) { 1745 status = getmsg(fd, NULL, &databuf, &flags); 1746 if (status >= 0) { 1747 break; 1748 } else if (errno == EINTR) { 1749 continue; 1750 } else { 1751 logperror("update_router_list:" 1752 " getmsg(data)"); 1753 free(databuf.buf); 1754 return (_B_FALSE); 1755 } 1756 } 1757 1758 if (optp->level == MIB2_IP && 1759 optp->name == MIB2_IP_ROUTE) { 1760 /* LINTED */ 1761 ire_process_v4((mib2_ipRouteEntry_t *) 1762 databuf.buf, databuf.len); 1763 } else if (optp->level == MIB2_IP6 && 1764 optp->name == MIB2_IP6_ROUTE) { 1765 /* LINTED */ 1766 ire_process_v6((mib2_ipv6RouteEntry_t *) 1767 databuf.buf, databuf.len); 1768 } 1769 free(databuf.buf); 1770 } 1771 } 1772 /* NOTREACHED */ 1773 } 1774 1775 /* 1776 * Examine the IPv4 routing table, for default routers. For each default 1777 * router, populate the list of targets of each phyint that is on the same 1778 * link as the default router 1779 */ 1780 static void 1781 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1782 { 1783 mib2_ipRouteEntry_t *rp; 1784 mib2_ipRouteEntry_t *rp1; 1785 struct in_addr nexthop_v4; 1786 mib2_ipRouteEntry_t *endp; 1787 1788 if (len == 0) 1789 return; 1790 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1791 1792 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1793 1794 /* 1795 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1796 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1797 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1798 * This is a potential target for probing, which we try to add 1799 * to the list of probe targets. 1800 */ 1801 for (rp = buf; rp < endp; rp++) { 1802 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1803 continue; 1804 1805 /* Get the nexthop address. */ 1806 nexthop_v4.s_addr = rp->ipRouteNextHop; 1807 1808 /* 1809 * Get the nexthop address. Then determine the outgoing 1810 * interface, by examining all interface IREs, and picking the 1811 * match. We don't look at the interface specified in the route 1812 * because we need to add the router target on all matching 1813 * interfaces anyway; the goal is to avoid falling back to 1814 * multicast when some interfaces are in the same subnet but 1815 * not in the same group. 1816 */ 1817 for (rp1 = buf; rp1 < endp; rp1++) { 1818 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1819 continue; 1820 } 1821 1822 /* 1823 * Determine the interface IRE that matches the nexthop. 1824 * i.e. (IRE addr & IRE mask) == (nexthop & IRE mask) 1825 */ 1826 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1827 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1828 /* 1829 * We found the interface ire 1830 */ 1831 router_add_v4(rp1, nexthop_v4); 1832 } 1833 } 1834 } 1835 } 1836 1837 void 1838 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1839 { 1840 char *cp; 1841 char ifname[LIFNAMSIZ + 1]; 1842 struct in6_addr nexthop; 1843 int len; 1844 1845 if (debug & D_TARGET) 1846 logdebug("router_add_v4()\n"); 1847 1848 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1849 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1850 ifname[len] = '\0'; 1851 1852 if (ifname[0] == '\0') 1853 return; 1854 1855 cp = strchr(ifname, IF_SEPARATOR); 1856 if (cp != NULL) 1857 *cp = '\0'; 1858 1859 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1860 router_add_common(AF_INET, ifname, nexthop); 1861 } 1862 1863 void 1864 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1865 { 1866 struct phyint_instance *pii; 1867 struct phyint *pi; 1868 1869 if (debug & D_TARGET) 1870 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1871 1872 /* 1873 * Retrieve the phyint instance; bail if it's not known to us yet. 1874 */ 1875 pii = phyint_inst_lookup(af, ifname); 1876 if (pii == NULL) 1877 return; 1878 1879 /* 1880 * Don't use our own addresses as targets. 1881 */ 1882 if (own_address(nexthop)) 1883 return; 1884 1885 /* 1886 * If the phyint is part a named group, then add the address to all 1887 * members of the group; note that this is suboptimal in the IPv4 case 1888 * as it has already been added to all matching interfaces in 1889 * ire_process_v4(). Otherwise, add the address only to the phyint 1890 * itself, since other phyints in the anongroup may not be on the same 1891 * subnet. 1892 */ 1893 pi = pii->pii_phyint; 1894 if (pi->pi_group == phyint_anongroup) { 1895 target_add(pii, nexthop, _B_TRUE); 1896 } else { 1897 pi = pi->pi_group->pg_phyint; 1898 for (; pi != NULL; pi = pi->pi_pgnext) 1899 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1900 } 1901 } 1902 1903 /* 1904 * Examine the IPv6 routing table, for default routers. For each default 1905 * router, populate the list of targets of each phyint that is on the same 1906 * link as the default router 1907 */ 1908 static void 1909 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1910 { 1911 mib2_ipv6RouteEntry_t *rp; 1912 mib2_ipv6RouteEntry_t *endp; 1913 struct in6_addr nexthop_v6; 1914 1915 if (debug & D_TARGET) 1916 logdebug("ire_process_v6(len %d)\n", len); 1917 1918 if (len == 0) 1919 return; 1920 1921 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1922 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1923 1924 /* 1925 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1926 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1927 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1928 * This is a potential target for probing, which we try to add 1929 * to the list of probe targets. 1930 */ 1931 for (rp = buf; rp < endp; rp++) { 1932 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1933 continue; 1934 1935 /* 1936 * We have the outgoing interface in ipv6RouteIfIndex 1937 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1938 * interface must be present for link-local addresses. Since 1939 * we use only link-local addreses for probing, we don't 1940 * consider the case when the outgoing interface is not 1941 * known and we need to scan interface ires 1942 */ 1943 nexthop_v6 = rp->ipv6RouteNextHop; 1944 if (rp->ipv6RouteIfIndex.o_length != 0) { 1945 /* 1946 * We already have the outgoing interface 1947 * in ipv6RouteIfIndex. 1948 */ 1949 router_add_v6(rp, nexthop_v6); 1950 } 1951 } 1952 } 1953 1954 1955 void 1956 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1957 { 1958 char ifname[LIFNAMSIZ + 1]; 1959 char *cp; 1960 int len; 1961 1962 if (debug & D_TARGET) 1963 logdebug("router_add_v6()\n"); 1964 1965 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1966 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1967 ifname[len] = '\0'; 1968 1969 if (ifname[0] == '\0') 1970 return; 1971 1972 cp = strchr(ifname, IF_SEPARATOR); 1973 if (cp != NULL) 1974 *cp = '\0'; 1975 1976 router_add_common(AF_INET6, ifname, nexthop_v6); 1977 } 1978 1979 1980 1981 /* 1982 * Build a list of target routers, by scanning the routing tables. 1983 * It is assumed that interface routes exist, to reach the routers. 1984 */ 1985 static void 1986 init_router_targets(void) 1987 { 1988 struct target *tg; 1989 struct target *next_tg; 1990 struct phyint_instance *pii; 1991 struct phyint *pi; 1992 1993 if (force_mcast) 1994 return; 1995 1996 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1997 pi = pii->pii_phyint; 1998 /* 1999 * Exclude ptp and host targets. Set tg_in_use to false, 2000 * only for router targets. 2001 */ 2002 if (!pii->pii_targets_are_routers || 2003 (pi->pi_flags & IFF_POINTOPOINT)) 2004 continue; 2005 2006 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 2007 tg->tg_in_use = 0; 2008 } 2009 2010 if (mibfd < 0) { 2011 mibfd = open("/dev/ip", O_RDWR); 2012 if (mibfd < 0) { 2013 logperror("mibopen: ip open"); 2014 exit(1); 2015 } 2016 } 2017 2018 if (!update_router_list(mibfd)) { 2019 (void) close(mibfd); 2020 mibfd = -1; 2021 } 2022 2023 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2024 if (!pii->pii_targets_are_routers || 2025 (pi->pi_flags & IFF_POINTOPOINT)) 2026 continue; 2027 2028 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 2029 next_tg = tg->tg_next; 2030 if (!tg->tg_in_use) { 2031 target_delete(tg); 2032 } 2033 } 2034 } 2035 } 2036 2037 /* 2038 * Attempt to assign host targets to any interfaces that do not currently 2039 * have probe targets by sharing targets with other interfaces in the group. 2040 */ 2041 static void 2042 init_host_targets(void) 2043 { 2044 struct phyint_instance *pii; 2045 struct phyint_group *pg; 2046 2047 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2048 pg = pii->pii_phyint->pi_group; 2049 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2050 dup_host_targets(pii); 2051 } 2052 } 2053 2054 /* 2055 * Duplicate host targets from other phyints of the group to 2056 * the phyint instance 'desired_pii'. 2057 */ 2058 static void 2059 dup_host_targets(struct phyint_instance *desired_pii) 2060 { 2061 int af; 2062 struct phyint *pi; 2063 struct phyint_instance *pii; 2064 struct target *tg; 2065 2066 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2067 2068 af = desired_pii->pii_af; 2069 2070 /* 2071 * For every phyint in the same group as desired_pii, check if 2072 * it has any host targets. If so add them to desired_pii. 2073 */ 2074 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2075 pii = PHYINT_INSTANCE(pi, af); 2076 /* 2077 * We know that we don't have targets on this phyint instance 2078 * since we have been called. But we still check for 2079 * pii_targets_are_routers because another phyint instance 2080 * could have router targets, since IFF_NOFAILOVER addresses 2081 * on different phyint instances may belong to different 2082 * subnets. 2083 */ 2084 if ((pii == NULL) || (pii == desired_pii) || 2085 pii->pii_targets_are_routers) 2086 continue; 2087 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2088 target_create(desired_pii, tg->tg_address, _B_FALSE); 2089 } 2090 } 2091 } 2092 2093 static void 2094 usage(char *cmd) 2095 { 2096 (void) fprintf(stderr, "usage: %s\n", cmd); 2097 } 2098 2099 2100 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2101 2102 /* Get an option from the /etc/default/mpathd file */ 2103 static char * 2104 getdefault(char *name) 2105 { 2106 char namebuf[BUFSIZ]; 2107 char *value = NULL; 2108 2109 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2110 char *cp; 2111 int flags; 2112 2113 /* 2114 * ignore case 2115 */ 2116 flags = defcntl(DC_GETFLAGS, 0); 2117 TURNOFF(flags, DC_CASE); 2118 (void) defcntl(DC_SETFLAGS, flags); 2119 2120 /* Add "=" to the name */ 2121 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2122 (void) strncat(namebuf, "=", 2); 2123 2124 if ((cp = defread(namebuf)) != NULL) 2125 value = strdup(cp); 2126 2127 /* close */ 2128 (void) defopen((char *)NULL); 2129 } 2130 return (value); 2131 } 2132 2133 2134 /* 2135 * Command line options below 2136 */ 2137 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2138 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2139 static boolean_t adopt = _B_FALSE; 2140 static boolean_t foreground = _B_FALSE; 2141 2142 int 2143 main(int argc, char *argv[]) 2144 { 2145 int i; 2146 int c; 2147 struct phyint_instance *pii; 2148 char *value; 2149 2150 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2151 srandom(gethostid()); /* Initialize the random number generator */ 2152 2153 /* 2154 * NOTE: The messages output by in.mpathd are not suitable for 2155 * translation, so we do not call textdomain(). 2156 */ 2157 (void) setlocale(LC_ALL, ""); 2158 2159 /* 2160 * Get the user specified value of 'failure detection time' 2161 * from /etc/default/mpathd 2162 */ 2163 value = getdefault("FAILURE_DETECTION_TIME"); 2164 if (value != NULL) { 2165 user_failure_detection_time = 2166 (int)strtol((char *)value, NULL, 0); 2167 2168 if (user_failure_detection_time <= 0) { 2169 user_failure_detection_time = FAILURE_DETECTION_TIME; 2170 logerr("Invalid failure detection time %s, assuming " 2171 "default %d\n", value, user_failure_detection_time); 2172 2173 } else if (user_failure_detection_time < 2174 MIN_FAILURE_DETECTION_TIME) { 2175 user_failure_detection_time = 2176 MIN_FAILURE_DETECTION_TIME; 2177 logerr("Too small failure detection time of %s, " 2178 "assuming minimum %d\n", value, 2179 user_failure_detection_time); 2180 } 2181 free(value); 2182 } else { 2183 /* User has not specified the parameter, Use default value */ 2184 user_failure_detection_time = FAILURE_DETECTION_TIME; 2185 } 2186 2187 /* 2188 * This gives the frequency at which probes will be sent. 2189 * When fdt ms elapses, we should be able to determine 2190 * whether 5 consecutive probes have failed or not. 2191 * 1 probe will be sent in every user_probe_interval ms, 2192 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2193 * user_probe_interval. Thus when we send out probe 'n' we 2194 * can be sure that probe 'n - 2' is lost, if we have not 2195 * got the ack. (since the probe interval is > crtt). But 2196 * probe 'n - 1' may be a valid unacked probe, since the 2197 * time between 2 successive probes could be as small as 2198 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 2199 */ 2200 user_probe_interval = user_failure_detection_time / 2201 (NUM_PROBE_FAILS + 2); 2202 2203 /* 2204 * Get the user specified value of failback_enabled from 2205 * /etc/default/mpathd 2206 */ 2207 value = getdefault("FAILBACK"); 2208 if (value != NULL) { 2209 if (strncasecmp(value, "yes", 3) == 0) 2210 failback_enabled = _B_TRUE; 2211 else if (strncasecmp(value, "no", 2) == 0) 2212 failback_enabled = _B_FALSE; 2213 else 2214 logerr("Invalid value for FAILBACK %s\n", value); 2215 free(value); 2216 } else { 2217 failback_enabled = _B_TRUE; 2218 } 2219 2220 /* 2221 * Get the user specified value of track_all_phyints from 2222 * /etc/default/mpathd. The sense is reversed in 2223 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2224 */ 2225 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2226 if (value != NULL) { 2227 if (strncasecmp(value, "yes", 3) == 0) 2228 track_all_phyints = _B_FALSE; 2229 else if (strncasecmp(value, "no", 2) == 0) 2230 track_all_phyints = _B_TRUE; 2231 else 2232 logerr("Invalid value for " 2233 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2234 free(value); 2235 } else { 2236 track_all_phyints = _B_FALSE; 2237 } 2238 2239 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2240 switch (c) { 2241 case 'a': 2242 adopt = _B_TRUE; 2243 break; 2244 case 'm': 2245 force_mcast = _B_TRUE; 2246 break; 2247 case 'd': 2248 debug = D_ALL; 2249 foreground = _B_TRUE; 2250 break; 2251 case 'D': 2252 i = (int)strtol(optarg, NULL, 0); 2253 if (i == 0) { 2254 (void) fprintf(stderr, "Bad debug flags: %s\n", 2255 optarg); 2256 exit(1); 2257 } 2258 debug |= i; 2259 foreground = _B_TRUE; 2260 break; 2261 case 'l': 2262 /* 2263 * Turn off link state notification handling. 2264 * Undocumented command line flag, for debugging 2265 * purposes. 2266 */ 2267 handle_link_notifications = _B_FALSE; 2268 break; 2269 default: 2270 usage(argv[0]); 2271 exit(1); 2272 } 2273 } 2274 2275 /* 2276 * The sockets for the loopback command interface should be listening 2277 * before we fork and exit in daemonize(). This way, whoever started us 2278 * can use the loopback interface as soon as they get a zero exit 2279 * status. 2280 */ 2281 lsock_v4 = setup_listener(AF_INET); 2282 lsock_v6 = setup_listener(AF_INET6); 2283 2284 if (lsock_v4 < 0 && lsock_v6 < 0) { 2285 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2286 exit(1); 2287 } 2288 2289 if (!foreground) { 2290 if (!daemonize()) { 2291 logerr("cannot daemonize\n"); 2292 exit(EXIT_FAILURE); 2293 } 2294 initlog(); 2295 } 2296 2297 /* 2298 * Initializations: 2299 * 1. Create ifsock* sockets. These are used for performing SIOC* 2300 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2301 * 2. Initialize a pipe for handling/recording signal events. 2302 * 3. Create the routing sockets, used for listening 2303 * to routing / interface changes. 2304 * 4. phyint_init() - Initialize physical interface state 2305 * (in mpd_tables.c). Must be done before creating interfaces, 2306 * which timer_init() does indirectly. 2307 * 5. timer_init() - Initialize timer related stuff 2308 * 6. initifs() - Initialize our database of all known interfaces 2309 * 7. init_router_targets() - Initialize our database of all known 2310 * router targets. 2311 */ 2312 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2313 if (ifsock_v4 < 0) { 2314 logperror("main: IPv4 socket open"); 2315 exit(1); 2316 } 2317 2318 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2319 if (ifsock_v6 < 0) { 2320 logperror("main: IPv6 socket open"); 2321 exit(1); 2322 } 2323 2324 setup_eventpipe(); 2325 2326 rtsock_v4 = setup_rtsock(AF_INET); 2327 rtsock_v6 = setup_rtsock(AF_INET6); 2328 2329 if (phyint_init() == -1) { 2330 logerr("cannot initialize physical interface structures"); 2331 exit(1); 2332 } 2333 2334 timer_init(); 2335 2336 initifs(); 2337 2338 /* Inform kernel whether failback is enabled or disabled */ 2339 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2340 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2341 exit(1); 2342 } 2343 2344 /* 2345 * If we're operating in "adopt" mode and no interfaces need to be 2346 * tracked, shut down (ifconfig(1M) will restart us on demand if 2347 * interfaces are subsequently put into multipathing groups). 2348 */ 2349 if (adopt && phyint_instances == NULL) 2350 exit(0); 2351 2352 /* 2353 * Main body. Keep listening for activity on any of the sockets 2354 * that we are monitoring and take appropriate action as necessary. 2355 * signals are also handled synchronously. 2356 */ 2357 for (;;) { 2358 if (poll(pollfds, pollfd_num, -1) < 0) { 2359 if (errno == EINTR) 2360 continue; 2361 logperror("main: poll"); 2362 exit(1); 2363 } 2364 for (i = 0; i < pollfd_num; i++) { 2365 if ((pollfds[i].fd == -1) || 2366 !(pollfds[i].revents & POLLIN)) 2367 continue; 2368 if (pollfds[i].fd == eventpipe_read) { 2369 in_signal(eventpipe_read); 2370 break; 2371 } 2372 if (pollfds[i].fd == rtsock_v4 || 2373 pollfds[i].fd == rtsock_v6) { 2374 process_rtsock(rtsock_v4, rtsock_v6); 2375 break; 2376 } 2377 for (pii = phyint_instances; pii != NULL; 2378 pii = pii->pii_next) { 2379 if (pollfds[i].fd == pii->pii_probe_sock) { 2380 if (pii->pii_af == AF_INET) 2381 in_data(pii); 2382 else 2383 in6_data(pii); 2384 break; 2385 } 2386 } 2387 if (pollfds[i].fd == lsock_v4) 2388 loopback_cmd(lsock_v4, AF_INET); 2389 else if (pollfds[i].fd == lsock_v6) 2390 loopback_cmd(lsock_v6, AF_INET6); 2391 } 2392 if (full_scan_required) { 2393 initifs(); 2394 full_scan_required = _B_FALSE; 2395 } 2396 } 2397 /* NOTREACHED */ 2398 return (EXIT_SUCCESS); 2399 } 2400 2401 static int 2402 setup_listener(int af) 2403 { 2404 int sock; 2405 int on; 2406 int len; 2407 int ret; 2408 struct sockaddr_storage laddr; 2409 struct sockaddr_in *sin; 2410 struct sockaddr_in6 *sin6; 2411 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2412 2413 assert(af == AF_INET || af == AF_INET6); 2414 2415 sock = socket(af, SOCK_STREAM, 0); 2416 if (sock < 0) { 2417 logperror("setup_listener: socket"); 2418 exit(1); 2419 } 2420 2421 on = 1; 2422 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2423 sizeof (on)) < 0) { 2424 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2425 exit(1); 2426 } 2427 2428 bzero(&laddr, sizeof (laddr)); 2429 laddr.ss_family = af; 2430 2431 if (af == AF_INET) { 2432 sin = (struct sockaddr_in *)&laddr; 2433 sin->sin_port = htons(MPATHD_PORT); 2434 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2435 len = sizeof (struct sockaddr_in); 2436 } else { 2437 sin6 = (struct sockaddr_in6 *)&laddr; 2438 sin6->sin6_port = htons(MPATHD_PORT); 2439 sin6->sin6_addr = loopback_addr; 2440 len = sizeof (struct sockaddr_in6); 2441 } 2442 2443 ret = bind(sock, (struct sockaddr *)&laddr, len); 2444 if (ret < 0) { 2445 if (errno == EADDRINUSE) { 2446 /* 2447 * Another instance of mpathd may be already active. 2448 */ 2449 logerr("main: is another instance of in.mpathd " 2450 "already active?\n"); 2451 exit(1); 2452 } else { 2453 (void) close(sock); 2454 return (-1); 2455 } 2456 } 2457 if (listen(sock, 30) < 0) { 2458 logperror("main: listen"); 2459 exit(1); 2460 } 2461 if (poll_add(sock) == -1) { 2462 (void) close(sock); 2463 exit(1); 2464 } 2465 2466 return (sock); 2467 } 2468 2469 /* 2470 * Table of commands and their expected size; used by loopback_cmd(). 2471 */ 2472 static struct { 2473 const char *name; 2474 unsigned int size; 2475 } commands[] = { 2476 { "MI_PING", sizeof (uint32_t) }, 2477 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2478 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2479 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2480 { "MI_QUERY", sizeof (mi_query_t) } 2481 }; 2482 2483 /* 2484 * Commands received over the loopback interface come here. Currently 2485 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2486 * module. ifconfig only makes a connection, and closes it to check if 2487 * in.mpathd is running. 2488 * if_mpadm sends commands in the format specified by the mpathd_interface 2489 * structure. 2490 */ 2491 static void 2492 loopback_cmd(int sock, int family) 2493 { 2494 int newfd; 2495 ssize_t len; 2496 struct sockaddr_storage peer; 2497 struct sockaddr_in *peer_sin; 2498 struct sockaddr_in6 *peer_sin6; 2499 socklen_t peerlen; 2500 union mi_commands mpi; 2501 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2502 char abuf[INET6_ADDRSTRLEN]; 2503 uint_t cmd; 2504 int retval; 2505 2506 peerlen = sizeof (peer); 2507 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2508 if (newfd < 0) { 2509 logperror("loopback_cmd: accept"); 2510 return; 2511 } 2512 2513 switch (family) { 2514 case AF_INET: 2515 /* 2516 * Validate the address and port to make sure that 2517 * non privileged processes don't connect and start 2518 * talking to us. 2519 */ 2520 if (peerlen != sizeof (struct sockaddr_in)) { 2521 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2522 (void) close(newfd); 2523 return; 2524 } 2525 peer_sin = (struct sockaddr_in *)&peer; 2526 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2527 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2528 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2529 abuf, sizeof (abuf)); 2530 logerr("Attempt to connect from addr %s port %d\n", 2531 abuf, ntohs(peer_sin->sin_port)); 2532 (void) close(newfd); 2533 return; 2534 } 2535 break; 2536 2537 case AF_INET6: 2538 if (peerlen != sizeof (struct sockaddr_in6)) { 2539 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2540 (void) close(newfd); 2541 return; 2542 } 2543 /* 2544 * Validate the address and port to make sure that 2545 * non privileged processes don't connect and start 2546 * talking to us. 2547 */ 2548 peer_sin6 = (struct sockaddr_in6 *)&peer; 2549 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2550 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2551 &loopback_addr))) { 2552 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2553 sizeof (abuf)); 2554 logerr("Attempt to connect from addr %s port %d\n", 2555 abuf, ntohs(peer_sin6->sin6_port)); 2556 (void) close(newfd); 2557 return; 2558 } 2559 2560 default: 2561 logdebug("loopback_cmd: family %d\n", family); 2562 (void) close(newfd); 2563 return; 2564 } 2565 2566 /* 2567 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2568 * all supported commands 2569 */ 2570 len = read(newfd, &mpi, sizeof (mpi)); 2571 2572 /* 2573 * ifconfig does not send any data. Just tests to see if mpathd 2574 * is already running. 2575 */ 2576 if (len <= 0) { 2577 (void) close(newfd); 2578 return; 2579 } 2580 2581 /* 2582 * In theory, we can receive any sized message for a stream socket, 2583 * but we don't expect that to happen for a small message over a 2584 * loopback connection. 2585 */ 2586 if (len < sizeof (uint32_t)) { 2587 logerr("loopback_cmd: bad command format or read returns " 2588 "partial data %d\n", len); 2589 } 2590 2591 cmd = mpi.mi_command; 2592 if (cmd >= MI_NCMD) { 2593 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2594 (void) close(newfd); 2595 return; 2596 } 2597 2598 if (len < commands[cmd].size) { 2599 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2600 commands[cmd].name, commands[cmd].size, len); 2601 (void) close(newfd); 2602 return; 2603 } 2604 2605 retval = process_cmd(newfd, &mpi); 2606 if (retval != IPMP_SUCCESS) { 2607 logerr("failed processing %s: %s\n", commands[cmd].name, 2608 ipmp_errmsg(retval)); 2609 } 2610 (void) close(newfd); 2611 } 2612 2613 extern int global_errno; /* set by failover() or failback() */ 2614 2615 /* 2616 * Process the offline, undo offline and set original index commands, 2617 * received from if_mpadm(1M) 2618 */ 2619 static unsigned int 2620 process_cmd(int newfd, union mi_commands *mpi) 2621 { 2622 uint_t nif = 0; 2623 uint32_t cmd; 2624 struct phyint *pi; 2625 struct phyint *pi2; 2626 struct phyint_group *pg; 2627 boolean_t success; 2628 int error; 2629 struct mi_offline *mio; 2630 struct mi_undo_offline *miu; 2631 struct lifreq lifr; 2632 int ifsock; 2633 struct mi_setoindex *mis; 2634 2635 cmd = mpi->mi_command; 2636 2637 switch (cmd) { 2638 case MI_OFFLINE: 2639 mio = &mpi->mi_ocmd; 2640 /* 2641 * Lookup the interface that needs to be offlined. 2642 * If it does not exist, return a suitable error. 2643 */ 2644 pi = phyint_lookup(mio->mio_ifname); 2645 if (pi == NULL) 2646 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2647 2648 /* 2649 * Verify that the minimum redundancy requirements are met. 2650 * The multipathing group must have at least the specified 2651 * number of functional interfaces after offlining the 2652 * requested interface. Otherwise return a suitable error. 2653 */ 2654 pg = pi->pi_group; 2655 nif = 0; 2656 if (pg != phyint_anongroup) { 2657 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2658 pi2 = pi2->pi_pgnext) { 2659 if ((pi2->pi_state == PI_RUNNING) || 2660 (pg->pg_groupfailed && 2661 !(pi2->pi_flags & IFF_OFFLINE))) 2662 nif++; 2663 } 2664 } 2665 if (nif < mio->mio_min_redundancy) 2666 return (send_result(newfd, IPMP_EMINRED, 0)); 2667 2668 /* 2669 * The order of operation is to set IFF_OFFLINE, followed by 2670 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2671 * can be created. Subsequent failover moves everything on 2672 * the OFFLINE interface to some other functional interface. 2673 */ 2674 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2675 if (success) { 2676 if (!pi->pi_empty) { 2677 error = try_failover(pi, FAILOVER_NORMAL); 2678 if (error != 0) { 2679 if (!change_lif_flags(pi, IFF_OFFLINE, 2680 _B_FALSE)) { 2681 logerr("process_cmd: couldn't" 2682 " clear OFFLINE flag on" 2683 " %s\n", pi->pi_name); 2684 /* 2685 * Offline interfaces should 2686 * not be probed. 2687 */ 2688 stop_probing(pi); 2689 } 2690 return (send_result(newfd, error, 2691 global_errno)); 2692 } 2693 } 2694 } else { 2695 return (send_result(newfd, IPMP_FAILURE, errno)); 2696 } 2697 2698 /* 2699 * The interface is now Offline, so stop probing it. 2700 * Note that if_mpadm(1M) will down the test addresses, 2701 * after receiving a success reply from us. The routing 2702 * socket message will then make us close the socket used 2703 * for sending probes. But it is more logical that an 2704 * offlined interface must not be probed, even if it has 2705 * test addresses. 2706 */ 2707 stop_probing(pi); 2708 return (send_result(newfd, IPMP_SUCCESS, 0)); 2709 2710 case MI_UNDO_OFFLINE: 2711 miu = &mpi->mi_ucmd; 2712 /* 2713 * Undo the offline command. As usual lookup the interface. 2714 * Send an error if it does not exist or is not offline. 2715 */ 2716 pi = phyint_lookup(miu->miu_ifname); 2717 if (pi == NULL || pi->pi_state != PI_OFFLINE) 2718 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2719 2720 /* 2721 * Reset the state of the interface based on the current link 2722 * state; if this phyint subsequently acquires a test address, 2723 * the state will be updated later as a result of the probes. 2724 */ 2725 if (LINK_UP(pi)) 2726 phyint_chstate(pi, PI_RUNNING); 2727 else 2728 phyint_chstate(pi, PI_FAILED); 2729 2730 if (pi->pi_state == PI_RUNNING) { 2731 /* 2732 * Note that the success of MI_UNDO_OFFLINE is not 2733 * contingent on actually failing back; in the odd 2734 * case where we cannot do it here, we will try again 2735 * in initifs() since pi->pi_full will still be zero. 2736 */ 2737 if (do_failback(pi) != IPMP_SUCCESS) { 2738 logdebug("process_cmd: cannot failback from " 2739 "%s during MI_UNDO_OFFLINE\n", pi->pi_name); 2740 } 2741 } 2742 2743 /* 2744 * Clear the IFF_OFFLINE flag. We have to do this last 2745 * because do_failback() relies on it being set to decide 2746 * when to display messages. 2747 */ 2748 (void) change_lif_flags(pi, IFF_OFFLINE, _B_FALSE); 2749 2750 /* 2751 * Give the requestor time to configure test addresses 2752 * before complaining that they're missing. 2753 */ 2754 pi->pi_taddrthresh = getcurrentsec() + TESTADDR_CONF_TIME; 2755 2756 return (send_result(newfd, IPMP_SUCCESS, 0)); 2757 2758 case MI_SETOINDEX: 2759 mis = &mpi->mi_scmd; 2760 2761 /* Get the socket for doing ioctls */ 2762 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2763 2764 /* 2765 * Get index of new original interface. 2766 * The index is returned in lifr.lifr_index. 2767 */ 2768 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2769 sizeof (lifr.lifr_name)); 2770 2771 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2772 return (send_result(newfd, IPMP_FAILURE, errno)); 2773 2774 /* 2775 * Set new original interface index. 2776 * The new index was put into lifr.lifr_index by the 2777 * SIOCGLIFINDEX ioctl. 2778 */ 2779 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2780 sizeof (lifr.lifr_name)); 2781 2782 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2783 return (send_result(newfd, IPMP_FAILURE, errno)); 2784 2785 return (send_result(newfd, IPMP_SUCCESS, 0)); 2786 2787 case MI_QUERY: 2788 return (process_query(newfd, &mpi->mi_qcmd)); 2789 2790 default: 2791 break; 2792 } 2793 2794 return (send_result(newfd, IPMP_EPROTO, 0)); 2795 } 2796 2797 /* 2798 * Process the query request pointed to by `miq' and send a reply on file 2799 * descriptor `fd'. Returns an IPMP error code. 2800 */ 2801 static unsigned int 2802 process_query(int fd, mi_query_t *miq) 2803 { 2804 ipmp_groupinfo_t *grinfop; 2805 ipmp_groupinfolist_t *grlp; 2806 ipmp_grouplist_t *grlistp; 2807 ipmp_ifinfo_t *ifinfop; 2808 ipmp_ifinfolist_t *iflp; 2809 ipmp_snap_t *snap; 2810 unsigned int retval; 2811 2812 switch (miq->miq_inforeq) { 2813 case IPMP_GROUPLIST: 2814 retval = getgrouplist(&grlistp); 2815 if (retval != IPMP_SUCCESS) 2816 return (send_result(fd, retval, errno)); 2817 2818 retval = send_result(fd, IPMP_SUCCESS, 0); 2819 if (retval == IPMP_SUCCESS) 2820 retval = send_grouplist(fd, grlistp); 2821 2822 ipmp_freegrouplist(grlistp); 2823 return (retval); 2824 2825 case IPMP_GROUPINFO: 2826 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2827 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2828 if (retval != IPMP_SUCCESS) 2829 return (send_result(fd, retval, errno)); 2830 2831 retval = send_result(fd, IPMP_SUCCESS, 0); 2832 if (retval == IPMP_SUCCESS) 2833 retval = send_groupinfo(fd, grinfop); 2834 2835 ipmp_freegroupinfo(grinfop); 2836 return (retval); 2837 2838 case IPMP_IFINFO: 2839 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2840 retval = getifinfo(miq->miq_ifname, &ifinfop); 2841 if (retval != IPMP_SUCCESS) 2842 return (send_result(fd, retval, errno)); 2843 2844 retval = send_result(fd, IPMP_SUCCESS, 0); 2845 if (retval == IPMP_SUCCESS) 2846 retval = send_ifinfo(fd, ifinfop); 2847 2848 ipmp_freeifinfo(ifinfop); 2849 return (retval); 2850 2851 case IPMP_SNAP: 2852 retval = getsnap(&snap); 2853 if (retval != IPMP_SUCCESS) 2854 return (send_result(fd, retval, errno)); 2855 2856 retval = send_result(fd, IPMP_SUCCESS, 0); 2857 if (retval != IPMP_SUCCESS) 2858 goto out; 2859 2860 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2861 if (retval != IPMP_SUCCESS) 2862 goto out; 2863 2864 retval = send_grouplist(fd, snap->sn_grlistp); 2865 if (retval != IPMP_SUCCESS) 2866 goto out; 2867 2868 iflp = snap->sn_ifinfolistp; 2869 for (; iflp != NULL; iflp = iflp->ifl_next) { 2870 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2871 if (retval != IPMP_SUCCESS) 2872 goto out; 2873 } 2874 2875 grlp = snap->sn_grinfolistp; 2876 for (; grlp != NULL; grlp = grlp->grl_next) { 2877 retval = send_groupinfo(fd, grlp->grl_grinfop); 2878 if (retval != IPMP_SUCCESS) 2879 goto out; 2880 } 2881 out: 2882 ipmp_snap_free(snap); 2883 return (retval); 2884 2885 default: 2886 break; 2887 2888 } 2889 return (send_result(fd, IPMP_EPROTO, 0)); 2890 } 2891 2892 /* 2893 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2894 * Returns an IPMP error code. 2895 */ 2896 static unsigned int 2897 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2898 { 2899 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2900 unsigned int retval; 2901 2902 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2903 if (retval != IPMP_SUCCESS) 2904 return (retval); 2905 2906 return (ipmp_writetlv(fd, IPMP_IFLIST, 2907 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2908 } 2909 2910 /* 2911 * Send the interface information pointed to by `ifinfop' on file descriptor 2912 * `fd'. Returns an IPMP error code. 2913 */ 2914 static unsigned int 2915 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2916 { 2917 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2918 } 2919 2920 /* 2921 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2922 * Returns an IPMP error code. 2923 */ 2924 static unsigned int 2925 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2926 { 2927 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2928 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2929 } 2930 2931 /* 2932 * Initialize an mi_result_t structure using `error' and `syserror' and 2933 * send it on file descriptor `fd'. Returns an IPMP error code. 2934 */ 2935 static unsigned int 2936 send_result(int fd, unsigned int error, int syserror) 2937 { 2938 mi_result_t me; 2939 2940 me.me_mpathd_error = error; 2941 if (error == IPMP_FAILURE) 2942 me.me_sys_error = syserror; 2943 else 2944 me.me_sys_error = 0; 2945 2946 return (ipmp_write(fd, &me, sizeof (me))); 2947 } 2948 2949 /* 2950 * Daemonize the process. 2951 */ 2952 static boolean_t 2953 daemonize(void) 2954 { 2955 switch (fork()) { 2956 case -1: 2957 return (_B_FALSE); 2958 2959 case 0: 2960 /* 2961 * Lose our controlling terminal, and become both a session 2962 * leader and a process group leader. 2963 */ 2964 if (setsid() == -1) 2965 return (_B_FALSE); 2966 2967 /* 2968 * Under POSIX, a session leader can accidentally (through 2969 * open(2)) acquire a controlling terminal if it does not 2970 * have one. Just to be safe, fork() again so we are not a 2971 * session leader. 2972 */ 2973 switch (fork()) { 2974 case -1: 2975 return (_B_FALSE); 2976 2977 case 0: 2978 (void) chdir("/"); 2979 (void) umask(022); 2980 (void) fdwalk(closefunc, NULL); 2981 break; 2982 2983 default: 2984 _exit(EXIT_SUCCESS); 2985 } 2986 break; 2987 2988 default: 2989 _exit(EXIT_SUCCESS); 2990 } 2991 2992 return (_B_TRUE); 2993 } 2994 2995 /* 2996 * The parent has created some fds before forking on purpose, keep them open. 2997 */ 2998 static int 2999 closefunc(void *not_used, int fd) 3000 /* ARGSUSED */ 3001 { 3002 if (fd != lsock_v4 && fd != lsock_v6) 3003 (void) close(fd); 3004 return (0); 3005 } 3006 3007 /* LOGGER */ 3008 3009 #include <syslog.h> 3010 3011 /* 3012 * Logging routines. All routines log to syslog, unless the daemon is 3013 * running in the foreground, in which case the logging goes to stderr. 3014 * 3015 * The following routines are available: 3016 * 3017 * logdebug(): A printf-like function for outputting debug messages 3018 * (messages at LOG_DEBUG) that are only of use to developers. 3019 * 3020 * logtrace(): A printf-like function for outputting tracing messages 3021 * (messages at LOG_INFO) from the daemon. This is typically used 3022 * to log the receipt of interesting network-related conditions. 3023 * 3024 * logerr(): A printf-like function for outputting error messages 3025 * (messages at LOG_ERR) from the daemon. 3026 * 3027 * logperror*(): A set of functions used to output error messages 3028 * (messages at LOG_ERR); these automatically append strerror(errno) 3029 * and a newline to the message passed to them. 3030 * 3031 * NOTE: since the logging functions write to syslog, the messages passed 3032 * to them are not eligible for localization. Thus, gettext() must 3033 * *not* be used. 3034 */ 3035 3036 static int logging = 0; 3037 3038 static void 3039 initlog(void) 3040 { 3041 logging++; 3042 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3043 } 3044 3045 /* PRINTFLIKE1 */ 3046 void 3047 logerr(char *fmt, ...) 3048 { 3049 va_list ap; 3050 3051 va_start(ap, fmt); 3052 3053 if (logging) 3054 vsyslog(LOG_ERR, fmt, ap); 3055 else 3056 (void) vfprintf(stderr, fmt, ap); 3057 va_end(ap); 3058 } 3059 3060 /* PRINTFLIKE1 */ 3061 void 3062 logtrace(char *fmt, ...) 3063 { 3064 va_list ap; 3065 3066 va_start(ap, fmt); 3067 3068 if (logging) 3069 vsyslog(LOG_INFO, fmt, ap); 3070 else 3071 (void) vfprintf(stderr, fmt, ap); 3072 va_end(ap); 3073 } 3074 3075 /* PRINTFLIKE1 */ 3076 void 3077 logdebug(char *fmt, ...) 3078 { 3079 va_list ap; 3080 3081 va_start(ap, fmt); 3082 3083 if (logging) 3084 vsyslog(LOG_DEBUG, fmt, ap); 3085 else 3086 (void) vfprintf(stderr, fmt, ap); 3087 va_end(ap); 3088 } 3089 3090 /* PRINTFLIKE1 */ 3091 void 3092 logperror(char *str) 3093 { 3094 if (logging) 3095 syslog(LOG_ERR, "%s: %m\n", str); 3096 else 3097 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3098 } 3099 3100 void 3101 logperror_pii(struct phyint_instance *pii, char *str) 3102 { 3103 if (logging) { 3104 syslog(LOG_ERR, "%s (%s %s): %m\n", 3105 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3106 } else { 3107 (void) fprintf(stderr, "%s (%s %s): %s\n", 3108 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3109 strerror(errno)); 3110 } 3111 } 3112 3113 void 3114 logperror_li(struct logint *li, char *str) 3115 { 3116 struct phyint_instance *pii = li->li_phyint_inst; 3117 3118 if (logging) { 3119 syslog(LOG_ERR, "%s (%s %s): %m\n", 3120 str, AF_STR(pii->pii_af), li->li_name); 3121 } else { 3122 (void) fprintf(stderr, "%s (%s %s): %s\n", 3123 str, AF_STR(pii->pii_af), li->li_name, 3124 strerror(errno)); 3125 } 3126 } 3127 3128 void 3129 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3130 { 3131 if (polled) 3132 (void) poll_remove(pii->pii_probe_sock); 3133 (void) close(pii->pii_probe_sock); 3134 pii->pii_probe_sock = -1; 3135 pii->pii_basetime_inited = 0; 3136 } 3137