1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include "mpd_defs.h" 30 #include "mpd_tables.h" 31 32 int debug = 0; /* Debug flag */ 33 static int pollfd_num = 0; /* Num. 
of poll descriptors */ 34 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 35 36 /* All times below in ms */ 37 int user_failure_detection_time; /* user specified failure detection */ 38 /* time (fdt) */ 39 int user_probe_interval; /* derived from user specified fdt */ 40 41 static int rtsock_v4; /* AF_INET routing socket */ 42 static int rtsock_v6; /* AF_INET6 routing socket */ 43 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 44 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 45 static int lsock_v4; /* Listen socket to detect mpathd */ 46 static int lsock_v6; /* Listen socket to detect mpathd */ 47 static int mibfd = -1; /* fd to get mib info */ 48 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 49 50 boolean_t full_scan_required = _B_FALSE; 51 static uint_t last_initifs_time; /* Time when initifs was last run */ 52 static char **argv0; /* Saved for re-exec on SIGHUP */ 53 boolean_t handle_link_notifications = _B_TRUE; 54 55 static void initlog(void); 56 static void run_timeouts(void); 57 static void initifs(void); 58 static void check_if_removed(struct phyint_instance *pii); 59 static void select_test_ifs(void); 60 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 61 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 62 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 63 struct in_addr nexthop_v4); 64 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 65 struct in6_addr nexthop_v6); 66 static void router_add_common(int af, char *ifname, 67 struct in6_addr nexthop); 68 static void init_router_targets(); 69 static void cleanup(void); 70 static int setup_listener(int af); 71 static void check_config(void); 72 static void check_addr_unique(int af, char *name); 73 static void init_host_targets(void); 74 static void dup_host_targets(struct phyint_instance *desired_pii); 75 static void loopback_cmd(int sock, int family); 76 static int poll_remove(int fd); 77 static boolean_t 
daemonize(void); 78 static int closefunc(void *, int); 79 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 80 static unsigned int process_query(int fd, mi_query_t *miq); 81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 84 static unsigned int send_result(int fd, unsigned int error, int syserror); 85 86 /* 87 * Return the current time in milliseconds (from an arbitrary reference) 88 * truncated to fit into an int. Truncation is ok since we are interested 89 * only in differences and not the absolute values. 90 */ 91 uint_t 92 getcurrenttime(void) 93 { 94 uint_t cur_time; /* In ms */ 95 96 /* 97 * Use of a non-user-adjustable source of time is 98 * required. However millisecond precision is sufficient. 99 * divide by 10^6 100 */ 101 cur_time = (uint_t)(gethrtime() / 1000000LL); 102 return (cur_time); 103 } 104 105 /* 106 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 107 */ 108 int 109 poll_add(int fd) 110 { 111 int i; 112 int new_num; 113 struct pollfd *newfds; 114 retry: 115 /* Check if already present */ 116 for (i = 0; i < pollfd_num; i++) { 117 if (pollfds[i].fd == fd) 118 return (0); 119 } 120 /* Check for empty spot already present */ 121 for (i = 0; i < pollfd_num; i++) { 122 if (pollfds[i].fd == -1) { 123 pollfds[i].fd = fd; 124 return (0); 125 } 126 } 127 128 /* Allocate space for 32 more fds and initialize to -1 */ 129 new_num = pollfd_num + 32; 130 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 131 if (newfds == NULL) { 132 logperror("poll_add: realloc"); 133 return (-1); 134 } 135 for (i = pollfd_num; i < new_num; i++) { 136 newfds[i].fd = -1; 137 newfds[i].events = POLLIN; 138 } 139 pollfd_num = new_num; 140 pollfds = newfds; 141 goto retry; 142 } 143 144 /* 145 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 
146 */ 147 static int 148 poll_remove(int fd) 149 { 150 int i; 151 152 /* Check if already present */ 153 for (i = 0; i < pollfd_num; i++) { 154 if (pollfds[i].fd == fd) { 155 pollfds[i].fd = -1; 156 return (0); 157 } 158 } 159 return (-1); 160 } 161 162 /* 163 * Extract information about the phyint instance. If the phyint instance still 164 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 165 * will use it to detect phyint instances that don't exist any longer and 166 * remove them, from our database of phyint instances. 167 * Return value: 168 * returns true if the phyint instance exists in the kernel, 169 * returns false otherwise 170 */ 171 static boolean_t 172 pii_process(int af, char *name, struct phyint_instance **pii_p) 173 { 174 int err; 175 struct phyint_instance *pii; 176 struct phyint_instance *pii_other; 177 178 if (debug & D_PHYINT) 179 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 180 181 pii = phyint_inst_lookup(af, name); 182 if (pii == NULL) { 183 /* 184 * Phyint instance does not exist in our tables, 185 * create new phyint instance 186 */ 187 pii = phyint_inst_init_from_k(af, name); 188 } else { 189 /* Phyint exists in our tables */ 190 err = phyint_inst_update_from_k(pii); 191 192 switch (err) { 193 case PI_IOCTL_ERROR: 194 /* Some ioctl error. don't change anything */ 195 pii->pii_in_use = 1; 196 break; 197 198 case PI_GROUP_CHANGED: 199 /* 200 * The phyint has changed group. 201 */ 202 restore_phyint(pii->pii_phyint); 203 /* FALLTHRU */ 204 205 case PI_IFINDEX_CHANGED: 206 /* 207 * Interface index has changed. Delete and 208 * recreate the phyint as it is quite likely 209 * the interface has been unplumbed and replumbed. 
210 */ 211 pii_other = phyint_inst_other(pii); 212 if (pii_other != NULL) 213 phyint_inst_delete(pii_other); 214 phyint_inst_delete(pii); 215 pii = phyint_inst_init_from_k(af, name); 216 break; 217 218 case PI_DELETED: 219 /* Phyint instance has disappeared from kernel */ 220 pii->pii_in_use = 0; 221 break; 222 223 case PI_OK: 224 /* Phyint instance exists and is fine */ 225 pii->pii_in_use = 1; 226 break; 227 228 default: 229 /* Unknown status */ 230 logerr("pii_process: Unknown status %d\n", err); 231 break; 232 } 233 } 234 235 *pii_p = pii; 236 if (pii != NULL) 237 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 238 else 239 return (_B_FALSE); 240 } 241 242 /* 243 * This phyint is leaving the group. Try to restore the phyint to its 244 * initial state. Return the addresses that belong to other group members, 245 * to the group, and take back any addresses owned by this phyint 246 */ 247 void 248 restore_phyint(struct phyint *pi) 249 { 250 if (pi->pi_group == phyint_anongroup) 251 return; 252 253 /* 254 * Move everthing to some other member in the group. 255 * The phyint has changed group in the kernel. But we 256 * have yet to do it in our tables. 
257 */ 258 if (!pi->pi_empty) 259 (void) try_failover(pi, FAILOVER_TO_ANY); 260 /* 261 * Move all addresses owned by 'pi' back to pi, from each 262 * of the other members of the group 263 */ 264 (void) try_failback(pi, _B_FALSE); 265 } 266 267 /* 268 * Scan all interfaces to detect changes as well as new and deleted interfaces 269 */ 270 static void 271 initifs() 272 { 273 int n; 274 int af; 275 char *cp; 276 char *buf; 277 int numifs; 278 struct lifnum lifn; 279 struct lifconf lifc; 280 struct lifreq *lifr; 281 struct logint *li; 282 struct phyint_instance *pii; 283 struct phyint_instance *next_pii; 284 char pi_name[LIFNAMSIZ + 1]; 285 boolean_t exists; 286 struct phyint *pi; 287 288 if (debug & D_PHYINT) 289 logdebug("initifs: Scanning interfaces\n"); 290 291 last_initifs_time = getcurrenttime(); 292 293 /* 294 * Mark the interfaces so that we can find phyints and logints 295 * which have disappeared from the kernel. pii_process() and 296 * logint_init_from_k() will set {pii,li}_in_use when they find 297 * the interface in the kernel. Also, clear dupaddr bit on probe 298 * logint. check_addr_unique() will set the dupaddr bit on the 299 * probe logint, if the testaddress is not unique. 
300 */ 301 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 302 pii->pii_in_use = 0; 303 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 304 li->li_in_use = 0; 305 if (pii->pii_probe_logint == li) 306 li->li_dupaddr = 0; 307 } 308 } 309 310 lifn.lifn_family = AF_UNSPEC; 311 lifn.lifn_flags = 0; 312 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 313 logperror("initifs: ioctl (get interface numbers)"); 314 return; 315 } 316 numifs = lifn.lifn_count; 317 318 buf = (char *)calloc(numifs, sizeof (struct lifreq)); 319 if (buf == NULL) { 320 logperror("initifs: calloc"); 321 return; 322 } 323 324 lifc.lifc_family = AF_UNSPEC; 325 lifc.lifc_flags = 0; 326 lifc.lifc_len = numifs * sizeof (struct lifreq); 327 lifc.lifc_buf = buf; 328 329 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 330 /* 331 * EINVAL is commonly encountered, when things change 332 * underneath us rapidly, (eg. at boot, when new interfaces 333 * are plumbed successively) and the kernel finds the buffer 334 * size we passed as too small. We will retry again 335 * when we see the next routing socket msg, or at worst after 336 * IF_SCAN_INTERVAL ms. 337 */ 338 if (errno != EINVAL) { 339 logperror("initifs: ioctl" 340 " (get interface configuration)"); 341 } 342 free(buf); 343 return; 344 } 345 346 lifr = (struct lifreq *)lifc.lifc_req; 347 348 /* 349 * For each lifreq returned by SIOGGLIFCONF, call pii_process() 350 * and get the state of the corresponding phyint_instance. If it is 351 * successful, then call logint_init_from_k() to get the state of the 352 * logint. 353 */ 354 for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) { 355 af = lifr->lifr_addr.ss_family; 356 357 /* 358 * Need to pass a phyint name to pii_process. Insert the 359 * null where the ':' IF_SEPARATOR is found in the logical 360 * name. 
361 */ 362 (void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 363 pi_name[sizeof (pi_name) - 1] = '\0'; 364 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 365 *cp = '\0'; 366 367 exists = pii_process(af, pi_name, &pii); 368 if (exists) { 369 /* The phyint is fine. So process the logint */ 370 logint_init_from_k(pii, lifr->lifr_name); 371 } 372 check_addr_unique(af, lifr->lifr_name); 373 } 374 375 free(buf); 376 377 /* 378 * If the test address is now unique, and if it was not unique 379 * previously, clear the li_dupaddrmsg_printed flag and log a 380 * recovery message 381 */ 382 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 383 struct logint *li; 384 char abuf[INET6_ADDRSTRLEN]; 385 386 li = pii->pii_probe_logint; 387 if ((li != NULL) && !li->li_dupaddr && 388 li->li_dupaddrmsg_printed) { 389 logerr("Test address %s is unique; enabling probe-" 390 "based failure detection\n", 391 pr_addr(pii->pii_af, li->li_addr, abuf, 392 sizeof (abuf))); 393 li->li_dupaddrmsg_printed = 0; 394 } 395 } 396 397 /* 398 * Scan for phyints and logints that have disappeared from the 399 * kernel, and delete them. 400 */ 401 pii = phyint_instances; 402 403 while (pii != NULL) { 404 next_pii = pii->pii_next; 405 check_if_removed(pii); 406 pii = next_pii; 407 } 408 409 /* 410 * Select a test address for sending probes on each phyint instance 411 */ 412 select_test_ifs(); 413 414 /* 415 * Handle link up/down notifications from the NICs. 416 */ 417 process_link_state_changes(); 418 419 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 420 /* 421 * If this is a case of group failure, we don't have much 422 * to do until the group recovers again. 423 */ 424 if (GROUP_FAILED(pi->pi_group)) 425 continue; 426 427 /* 428 * Try/Retry any pending failovers / failbacks, that did not 429 * not complete, or that could not be initiated previously. 
430 * This implements the 3 invariants described in the big block 431 * comment at the beginning of probe.c 432 */ 433 if (pi->pi_flags & IFF_INACTIVE) { 434 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 435 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 436 } else { 437 struct phyint_instance *pii; 438 439 pii = pi->pi_v4; 440 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 441 pii = pi->pi_v6; 442 if (LINK_UP(pi) && !PROBE_CAPABLE(pii)) 443 continue; 444 /* 445 * It is possible that the phyint has started 446 * receiving packets, after it has been marked 447 * PI_FAILED. Don't initiate failover, if the 448 * phyint has started recovering. failure_state() 449 * captures this check. A similar logic is used 450 * for failback/repair case. 451 */ 452 if (pi->pi_state == PI_FAILED && !pi->pi_empty && 453 (failure_state(pii) == PHYINT_FAILURE)) { 454 (void) try_failover(pi, FAILOVER_NORMAL); 455 } else if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 456 if (try_failback(pi, _B_FALSE) != 457 IPMP_FAILURE) { 458 (void) change_lif_flags(pi, IFF_FAILED, 459 _B_FALSE); 460 /* Per state diagram */ 461 pi->pi_empty = 0; 462 } 463 } 464 } 465 } 466 } 467 468 /* 469 * Check that test/probe addresses are always unique. link-locals and 470 * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER) 471 * address can produce unexpected results. Log an error and alert the user. 472 */ 473 static void 474 check_addr_unique(int af, char *name) 475 { 476 struct lifreq lifr; 477 struct phyint *pi; 478 struct in6_addr addr; 479 struct phyint_instance *pii; 480 struct sockaddr_in *sin; 481 struct sockaddr_in6 *sin6; 482 int ifsock; 483 char abuf[INET6_ADDRSTRLEN]; 484 485 /* Get the socket for doing ioctls */ 486 ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 487 488 (void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name)); 489 lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0'; 490 /* 491 * Get the address corresponding to 'name'. 
We cannot 492 * do a logint lookup in our tables, because, not all logints 493 * in the system are tracked by mpathd. (eg. things not in a group) 494 */ 495 if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) { 496 if (errno == ENXIO) { 497 /* Interface has vanished */ 498 return; 499 } else { 500 logperror("ioctl (get addr)"); 501 return; 502 } 503 } 504 505 if (af == AF_INET) { 506 sin = (struct sockaddr_in *)&lifr.lifr_addr; 507 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 508 } else { 509 sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr; 510 addr = sin6->sin6_addr; 511 } 512 513 /* 514 * Does the address 'addr' match any known test address ? If so 515 * it is a duplicate, unless we are looking at the same logint 516 */ 517 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 518 pii = PHYINT_INSTANCE(pi, af); 519 if (pii == NULL || pii->pii_probe_logint == NULL) 520 continue; 521 522 if (!IN6_ARE_ADDR_EQUAL(&addr, 523 &pii->pii_probe_logint->li_addr)) { 524 continue; 525 } 526 527 if (strncmp(pii->pii_probe_logint->li_name, name, 528 sizeof (pii->pii_probe_logint->li_name)) == 0) { 529 continue; 530 } 531 532 /* 533 * This test address is not unique. Set the dupaddr bit 534 */ 535 pii->pii_probe_logint->li_dupaddr = 1; 536 537 /* 538 * Log an error message if not already logged 539 */ 540 if (pii->pii_probe_logint->li_dupaddrmsg_printed) 541 continue; 542 543 logerr("Test address %s is not unique; disabling " 544 "probe-based failure detection\n", 545 pr_addr(af, addr, abuf, sizeof (abuf))); 546 547 pii->pii_probe_logint->li_dupaddrmsg_printed = 1; 548 } 549 } 550 551 /* 552 * The pii_probe_logint used for probing, must satisfy the following properties 553 * with respect to its li_flags. 
554 * IFF_NOFAILOVER - must be set (except in singleton group case) 555 * IFF_UP - must be set 556 * IFF_NOXMIT - must be clear 557 * IFF_NOLOCAL - must be clear 558 * IFF_DEPRECATED - preferably set (for IPv4) 559 */ 560 #define BEST_FLAG_SET (IFF_NOFAILOVER | IFF_UP | IFF_DEPRECATED) 561 #define CLEAR_FLAG_SET (IFF_NOXMIT | IFF_NOLOCAL) 562 #define TEST_CLEAR_FLAG_SET CLEAR_FLAG_SET 563 #define TEST_MINIMAL_FLAG_SET (IFF_UP | CLEAR_FLAG_SET) 564 #define TEST_BEST_FLAG_SET (BEST_FLAG_SET | CLEAR_FLAG_SET) 565 566 /* 567 * Stop probing an interface. Called when an interface is offlined. 568 * The probe socket is closed on each interface instance, and the 569 * interface state set to PI_OFFLINE. 570 */ 571 static void 572 stop_probing(struct phyint *pi) 573 { 574 struct phyint_instance *pii; 575 576 pii = pi->pi_v4; 577 if (pii != NULL) { 578 if (pii->pii_probe_sock != -1) 579 close_probe_socket(pii, _B_TRUE); 580 pii->pii_probe_logint = NULL; 581 } 582 583 pii = pi->pi_v6; 584 if (pii != NULL) { 585 if (pii->pii_probe_sock != -1) 586 close_probe_socket(pii, _B_TRUE); 587 pii->pii_probe_logint = NULL; 588 } 589 590 phyint_chstate(pi, PI_OFFLINE); 591 } 592 593 /* 594 * Do the test address selection for each phyint instance. Pick an 595 * IFF_NOFAILOVER address as test address. For singleton case, 596 * if user didn't configure an IFF_NOFAILOVER address, we will pick a 597 * normal address as test address. For (multiple adapter) groups, 598 * user is required to configure IFF_NOFAILOVER test address. Call 599 * phyint_inst_sockinit() to complete the initializations. 
/*
 * Do the test address selection for each phyint instance. Pick an
 * IFF_NOFAILOVER address as test address. For singleton case,
 * if user didn't configure an IFF_NOFAILOVER address, we will pick a
 * normal address as test address. For (multiple adapter) groups,
 * user is required to configure IFF_NOFAILOVER test address. Call
 * phyint_inst_sockinit() to complete the initializations.
 */
static void
select_test_ifs(void)
{
	struct phyint *pi;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	struct logint *li;
	struct logint *test_logint;
	boolean_t target_scan_reqd = _B_FALSE;
	struct target *tg;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * The successor is saved up front because the instance may be
	 * deleted below if phyint_inst_sockinit() fails.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		test_logint = pii->pii_probe_logint;

		if (test_logint != NULL) {
			/* Current test address is already the best kind */
			if ((test_logint->li_flags & TEST_BEST_FLAG_SET)
			    == BEST_FLAG_SET)
				continue;

			/*
			 * If user configures IFF_NOXMIT or IFF_NOLOCAL
			 * flags on test addresses after in.mpathd has
			 * started, the daemon aborts. In future this
			 * can be handled better, i.e. instead of
			 * aborting the daemon, a more appropriate
			 * action may be issuing a warning and choosing
			 * a different test address.
			 */
			assert((test_logint->li_flags & TEST_CLEAR_FLAG_SET)
			    == 0);
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/* IFF_UP set, IFF_NOXMIT and IFF_NOLOCAL clear */
			if ((li->li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP) {
				/*
				 * Now we have a testaddress, that satisfies
				 * the minimal properties.
				 */
				if ((li->li_flags & TEST_BEST_FLAG_SET)
				    == BEST_FLAG_SET) {
					/*
					 * This is the best possible address.
					 * So break, and continue to the
					 * next phyint
					 */
					test_logint = li;
					break;
				}
				if ((test_logint == NULL) ||
				    (!(test_logint->li_flags &
				    IFF_NOFAILOVER) &&
				    (li->li_flags & IFF_NOFAILOVER)))
					/*
					 * This is a possible candidate,
					 * unless we find a better one.
					 */
					test_logint = li;
			}
		}

		/*
		 * If we've gone from a singleton group to a multiple adapter
		 * group, and we haven't found an IFF_NOFAILOVER test address
		 * by now, the old test address is no longer valid. If we are
		 * not dealing with a singleton group, and the above test
		 * address selection loop has selected a non IFF_NOFAILOVER
		 * address as a candidate, we will correct that here.
		 */
		if ((test_logint != NULL) &&
		    !SINGLETON_GROUP(pii->pii_phyint) &&
		    !(test_logint->li_flags & IFF_NOFAILOVER)) {
			test_logint = NULL;
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (test_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (test_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 */
		pii->pii_probe_logint = test_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface, drop the existing target
		 * (if any) and create one from the test address's li_dstaddr.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, test_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired. Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}

/*
 * Check phyint group configuration, to detect any inconsistencies,
 * and log an error message. This is called from run_timeouts() every
 * 20 secs. But the error message is displayed once. If the
 * inconsistency is resolved by the admin, a recovery message is displayed
 * once.
 */
static void
check_config(void)
{
	struct phyint_group *pg;
	struct phyint *pi;
	boolean_t v4_in_group;
	boolean_t v6_in_group;

	/*
	 * All phyints of a group must be homogeneous to ensure that
	 * failover or failback can be done. If any phyint in a group
	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
	 * Do a similar check for IPv6.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		if (pg == phyint_anongroup)
			continue;

		v4_in_group = _B_FALSE;
		v6_in_group = _B_FALSE;
		/*
		 * 1st pass. Determine if at least 1 phyint in the group
		 * has IPv4 plumbed and if so set v4_in_group to true.
		 * Repeat similarly for IPv6.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_v4 != NULL)
				v4_in_group = _B_TRUE;
			if (pi->pi_v6 != NULL)
				v6_in_group = _B_TRUE;
		}

		/*
		 * 2nd pass. If v4_in_group is true, check that phyint
		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
		 * out a message the 1st time only.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_flags & IFF_OFFLINE)
				continue;

			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv4 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else if (v6_in_group == _B_TRUE &&
			    pi->pi_v6 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv6 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else {
				/*
				 * The phyint matches the group configuration,
				 * if we have reached this point. If it was
				 * improperly configured earlier, log an
				 * error recovery message
				 */
				if (pi->pi_cfgmsg_printed) {
					logerr("NIC %s is now consistent with "
					    "group %s and failover capability "
					    "is restored\n", pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 0;
				}
			}

		}
	}

	/*
	 * In order to perform probe-based failure detection, a phyint must
	 * have at least 1 test/probe address for sending and receiving probes
	 * (either on IPv4 or IPv6 instance or both). If no test address has
	 * been configured, notify the administrator, but continue on since we
	 * can still perform load spreading, along with "link up/down" based
	 * failure detection.
	 *
	 * Note: In the singleton group case, when user didn't configure
	 * a test address, the probe address is picked by this daemon.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_flags & IFF_OFFLINE)
			continue;

		if ((pi->pi_v4 == NULL ||
		    pi->pi_v4->pii_probe_logint == NULL) &&
		    (pi->pi_v6 == NULL ||
		    pi->pi_v6->pii_probe_logint == NULL)) {
			if (!pi->pi_taddrmsg_printed) {
				logerr("No test address configured on "
				    "interface %s; disabling probe-based "
				    "failure detection on it\n", pi->pi_name);
				pi->pi_taddrmsg_printed = 1;
			}
		} else if (pi->pi_taddrmsg_printed) {
			logerr("Test address now configured on interface %s; "
			    "enabling probe-based failure detection on it\n",
			    pi->pi_name);
			pi->pi_taddrmsg_printed = 0;
		}

	}
}

/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule()
 */
static uint_t timer_next;	/* Currently scheduled timeout */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */

/*
 * Initialize the timer state: push timer_next as far into the future as
 * possible, then let run_timeouts() schedule the first real timeout.
 */
static void
timer_init(void)
{
	timer_next = getcurrenttime() + TIMER_INFINITY;
	/*
	 * The call to run_timeouts() will get the timer started
	 * Since there are no phyints at this point, the timer will
	 * be set for IF_SCAN_INTERVAL ms.
	 */
	run_timeouts();
}
979 */ 980 void 981 timer_schedule(uint_t delay) 982 { 983 uint_t now; 984 struct itimerval itimerval; 985 986 if (debug & D_TIMER) 987 logdebug("timer_schedule(%u)\n", delay); 988 989 assert(delay <= TIMER_INFINITY); 990 991 now = getcurrenttime(); 992 if (delay == 0) { 993 /* Minimum allowed delay */ 994 delay = 1; 995 } 996 /* Will this timer occur before the currently scheduled SIGALRM? */ 997 if (timer_active && TIME_GE(now + delay, timer_next)) { 998 if (debug & D_TIMER) { 999 logdebug("timer_schedule(%u) - no action: " 1000 "now %u next %u\n", delay, now, timer_next); 1001 } 1002 return; 1003 } 1004 timer_next = now + delay; 1005 1006 itimerval.it_value.tv_sec = delay / 1000; 1007 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1008 itimerval.it_interval.tv_sec = 0; 1009 itimerval.it_interval.tv_usec = 0; 1010 if (debug & D_TIMER) { 1011 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1012 delay, itimerval.it_value.tv_sec, 1013 itimerval.it_value.tv_usec); 1014 } 1015 timer_active = _B_TRUE; 1016 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1017 logperror("timer_schedule: setitimer"); 1018 exit(2); 1019 } 1020 } 1021 1022 /* 1023 * Timer has fired. Determine when the next timer event will occur by asking 1024 * all the timer routines. Should not be called from a timer routine. 1025 */ 1026 static void 1027 run_timeouts(void) 1028 { 1029 uint_t next; 1030 uint_t next_event_time; 1031 struct phyint_instance *pii; 1032 struct phyint_instance *next_pii; 1033 static boolean_t timeout_running; 1034 1035 /* assert that recursive timeouts don't happen. 
*/ 1036 assert(!timeout_running); 1037 1038 timeout_running = _B_TRUE; 1039 1040 if (debug & D_TIMER) 1041 logdebug("run_timeouts()\n"); 1042 1043 next = TIMER_INFINITY; 1044 1045 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1046 next_pii = pii->pii_next; 1047 next_event_time = phyint_inst_timer(pii); 1048 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1049 next = next_event_time; 1050 1051 if (debug & D_TIMER) { 1052 logdebug("run_timeouts(%s %s): next scheduled for" 1053 " this phyint inst %u, next scheduled global" 1054 " %u ms\n", 1055 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1056 next_event_time, next); 1057 } 1058 } 1059 1060 /* 1061 * Make sure initifs() is called at least once every 1062 * IF_SCAN_INTERVAL, to make sure that we are in sync 1063 * with the kernel, in case we have missed any routing 1064 * socket messages. 1065 */ 1066 if (next > IF_SCAN_INTERVAL) 1067 next = IF_SCAN_INTERVAL; 1068 1069 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1070 initifs(); 1071 check_config(); 1072 } 1073 1074 if (debug & D_TIMER) 1075 logdebug("run_timeouts: %u ms\n", next); 1076 1077 timer_schedule(next); 1078 timeout_running = _B_FALSE; 1079 } 1080 1081 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1082 static int eventpipe_write = -1; 1083 static boolean_t cleanup_started = _B_FALSE; 1084 /* Don't write to eventpipe if in cleanup */ 1085 /* 1086 * Ensure that signals are processed synchronously with the rest of 1087 * the code by just writing a one character signal number on the pipe. 1088 * The poll loop will pick this up and process the signal event. 1089 */ 1090 static void 1091 sig_handler(int signo) 1092 { 1093 uchar_t buf = (uchar_t)signo; 1094 1095 /* 1096 * Don't write to pipe if cleanup has already begun. 
cleanup() 1097 * might have closed the pipe already 1098 */ 1099 if (cleanup_started) 1100 return; 1101 1102 if (eventpipe_write == -1) { 1103 logerr("sig_handler: no pipe found\n"); 1104 return; 1105 } 1106 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1107 logperror("sig_handler: write"); 1108 } 1109 1110 extern struct probes_missed probes_missed; 1111 1112 /* 1113 * Pick up a signal "byte" from the pipe and process it. 1114 */ 1115 static void 1116 in_signal(int fd) 1117 { 1118 uchar_t buf; 1119 uint64_t sent, acked, lost, unacked, unknown; 1120 struct phyint_instance *pii; 1121 int pr_ndx; 1122 1123 switch (read(fd, &buf, sizeof (buf))) { 1124 case -1: 1125 logperror("in_signal: read"); 1126 exit(1); 1127 /* NOTREACHED */ 1128 case 1: 1129 break; 1130 case 0: 1131 logerr("in_signal: read end of file\n"); 1132 exit(1); 1133 /* NOTREACHED */ 1134 default: 1135 logerr("in_signal: read > 1\n"); 1136 exit(1); 1137 } 1138 1139 if (debug & D_TIMER) 1140 logdebug("in_signal() got %d\n", buf); 1141 1142 switch (buf) { 1143 case SIGALRM: 1144 if (debug & D_TIMER) { 1145 uint_t now = getcurrenttime(); 1146 1147 logdebug("in_signal(SIGALRM) delta %u\n", 1148 now - timer_next); 1149 } 1150 timer_active = _B_FALSE; 1151 run_timeouts(); 1152 break; 1153 case SIGUSR1: 1154 logdebug("Printing configuration:\n"); 1155 /* Print out the internal tables */ 1156 phyint_inst_print_all(); 1157 1158 /* 1159 * Print out the accumulated statistics about missed 1160 * probes (happens due to scheduling delay). 1161 */ 1162 logerr("Missed sending total of %d probes spread over" 1163 " %d occurrences\n", probes_missed.pm_nprobes, 1164 probes_missed.pm_ntimes); 1165 1166 /* 1167 * Print out the accumulated statistics about probes 1168 * that were sent. 
1169 */ 1170 for (pii = phyint_instances; pii != NULL; 1171 pii = pii->pii_next) { 1172 unacked = 0; 1173 acked = pii->pii_cum_stats.acked; 1174 lost = pii->pii_cum_stats.lost; 1175 sent = pii->pii_cum_stats.sent; 1176 unknown = pii->pii_cum_stats.unknown; 1177 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1178 switch (pii->pii_probes[pr_ndx].pr_status) { 1179 case PR_ACKED: 1180 acked++; 1181 break; 1182 case PR_LOST: 1183 lost++; 1184 break; 1185 case PR_UNACKED: 1186 unacked++; 1187 break; 1188 } 1189 } 1190 logerr("\nProbe stats on (%s %s)\n" 1191 "Number of probes sent %lld\n" 1192 "Number of probe acks received %lld\n" 1193 "Number of probes/acks lost %lld\n" 1194 "Number of valid unacknowled probes %lld\n" 1195 "Number of ambiguous probe acks received %lld\n", 1196 AF_STR(pii->pii_af), pii->pii_name, 1197 sent, acked, lost, unacked, unknown); 1198 } 1199 break; 1200 case SIGHUP: 1201 logerr("SIGHUP: restart and reread config file\n"); 1202 cleanup(); 1203 (void) execv(argv0[0], argv0); 1204 _exit(0177); 1205 /* NOTREACHED */ 1206 case SIGINT: 1207 case SIGTERM: 1208 case SIGQUIT: 1209 cleanup(); 1210 exit(0); 1211 /* NOTREACHED */ 1212 default: 1213 logerr("in_signal: unknown signal: %d\n", buf); 1214 } 1215 } 1216 1217 static void 1218 cleanup(void) 1219 { 1220 struct phyint_instance *pii; 1221 struct phyint_instance *next_pii; 1222 1223 /* 1224 * Make sure that we don't write to eventpipe in 1225 * sig_handler() if any signal notably SIGALRM, 1226 * occurs after we close the eventpipe descriptor below 1227 */ 1228 cleanup_started = _B_TRUE; 1229 1230 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1231 next_pii = pii->pii_next; 1232 phyint_inst_delete(pii); 1233 } 1234 1235 (void) close(ifsock_v4); 1236 (void) close(ifsock_v6); 1237 (void) close(rtsock_v4); 1238 (void) close(rtsock_v6); 1239 (void) close(lsock_v4); 1240 (void) close(lsock_v6); 1241 (void) close(0); 1242 (void) close(1); 1243 (void) close(2); 1244 (void) close(mibfd); 
1245 (void) close(eventpipe_read); 1246 (void) close(eventpipe_write); 1247 } 1248 1249 /* 1250 * Create pipe for signal delivery and set up signal handlers. 1251 */ 1252 static void 1253 setup_eventpipe(void) 1254 { 1255 int fds[2]; 1256 struct sigaction act; 1257 1258 if ((pipe(fds)) < 0) { 1259 logperror("setup_eventpipe: pipe"); 1260 exit(1); 1261 } 1262 eventpipe_read = fds[0]; 1263 eventpipe_write = fds[1]; 1264 if (poll_add(eventpipe_read) == -1) { 1265 exit(1); 1266 } 1267 1268 act.sa_handler = sig_handler; 1269 act.sa_flags = SA_RESTART; 1270 (void) sigaction(SIGALRM, &act, NULL); 1271 1272 (void) sigset(SIGHUP, sig_handler); 1273 (void) sigset(SIGUSR1, sig_handler); 1274 (void) sigset(SIGTERM, sig_handler); 1275 (void) sigset(SIGINT, sig_handler); 1276 (void) sigset(SIGQUIT, sig_handler); 1277 } 1278 1279 /* 1280 * Create a routing socket for receiving RTM_IFINFO messages. 1281 */ 1282 static int 1283 setup_rtsock(int af) 1284 { 1285 int s; 1286 int flags; 1287 1288 s = socket(PF_ROUTE, SOCK_RAW, af); 1289 if (s == -1) { 1290 logperror("setup_rtsock: socket PF_ROUTE"); 1291 exit(1); 1292 } 1293 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1294 logperror("setup_rtsock: fcntl F_GETFL"); 1295 (void) close(s); 1296 exit(1); 1297 } 1298 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1299 logperror("setup_rtsock: fcntl F_SETFL"); 1300 (void) close(s); 1301 exit(1); 1302 } 1303 if (poll_add(s) == -1) { 1304 (void) close(s); 1305 exit(1); 1306 } 1307 return (s); 1308 } 1309 1310 /* 1311 * Process an RTM_IFINFO message received on a routing socket. 1312 * The return value indicates whether a full interface scan is required. 1313 * Link up/down notifications from the NICs are reflected in the 1314 * IFF_RUNNING flag. 1315 * If just the state of the IFF_RUNNING interface flag has changed, a 1316 * a full interface scan isn't required. 
1317 */ 1318 static boolean_t 1319 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1320 { 1321 struct sockaddr_dl *sdl; 1322 struct phyint *pi; 1323 uint64_t old_flags; 1324 struct phyint_instance *pii; 1325 1326 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1327 1328 /* 1329 * Although the sockaddr_dl structure is directly after the 1330 * if_msghdr_t structure. At the time of writing, the size of the 1331 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1332 * to the presence of a timeval structure, which contains longs, 1333 * in the if_data structure. Anyway, we know where the message ends, 1334 * so we work backwards to get the start of the sockaddr_dl structure. 1335 */ 1336 /*LINTED*/ 1337 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1338 sizeof (struct sockaddr_dl)); 1339 1340 assert(sdl->sdl_family == AF_LINK); 1341 1342 /* 1343 * The interface name is in sdl_data. 1344 * RTM_IFINFO messages are only generated for logical interface 1345 * zero, so there is no colon and logical interface number to 1346 * strip from the name. The name is not null terminated, but 1347 * there should be enough space in sdl_data to add the null. 1348 */ 1349 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1350 if (debug & D_LINKNOTE) 1351 logdebug("process_rtm_ifinfo: " 1352 "phyint name too long\n"); 1353 return (_B_TRUE); 1354 } 1355 sdl->sdl_data[sdl->sdl_nlen] = 0; 1356 1357 pi = phyint_lookup(sdl->sdl_data); 1358 if (pi == NULL) { 1359 if (debug & D_LINKNOTE) 1360 logdebug("process_rtm_ifinfo: phyint lookup failed" 1361 " for %s\n", sdl->sdl_data); 1362 return (_B_TRUE); 1363 } 1364 1365 /* 1366 * We want to try and avoid doing a full interface scan for 1367 * link state notifications from the NICs, as indicated 1368 * by the state of the IFF_RUNNING flag. If just the 1369 * IFF_RUNNING flag has changed state, the link state changes 1370 * are processed without a full scan. 
1371 * If there is both an IPv4 and IPv6 instance associated with 1372 * the physical interface, we will get an RTM_IFINFO message 1373 * for each instance. If we just maintained a single copy of 1374 * the physical interface flags, it would appear that no flags 1375 * had changed when the second message is processed, leading us 1376 * to believe that the message wasn't generated by a flags change, 1377 * and that a full interface scan is required. 1378 * To get around this problem, two additional copies of the flags 1379 * are kept, one copy for each instance. These are only used in 1380 * this routine. At any one time, all three copies of the flags 1381 * should be identical except for the IFF_RUNNING flag. The 1382 * copy of the flags in the "phyint" structure is always up to 1383 * date. 1384 */ 1385 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1386 if (pii == NULL) { 1387 if (debug & D_LINKNOTE) 1388 logdebug("process_rtm_ifinfo: no instance of address " 1389 "family %s for %s\n", AF_STR(type), pi->pi_name); 1390 return (_B_TRUE); 1391 } 1392 1393 old_flags = pii->pii_flags; 1394 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1395 pi->pi_flags = pii->pii_flags; 1396 1397 if (debug & D_LINKNOTE) { 1398 logdebug("process_rtm_ifinfo: %s address family: %s, " 1399 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1400 AF_STR(type), old_flags, pi->pi_flags); 1401 } 1402 1403 /* 1404 * If IFF_STANDBY has changed, indicate that the interface has changed 1405 * types. 1406 */ 1407 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1408 phyint_newtype(pi); 1409 1410 /* 1411 * If IFF_INACTIVE has been set, then no data addresses should be 1412 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1413 * move previously failed-over addresses back to it, provided it is 1414 * not failed. For details, see the state diagram in mpd_probe.c. 
1415 */ 1416 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1417 if (pii->pii_flags & IFF_INACTIVE) { 1418 if (!pi->pi_empty && (pi->pi_flags & IFF_STANDBY)) 1419 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1420 } else { 1421 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1422 pi->pi_empty = 0; 1423 (void) try_failback(pi, _B_FALSE); 1424 } 1425 } 1426 } 1427 1428 /* Has just the IFF_RUNNING flag changed state ? */ 1429 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1430 struct phyint_instance *pii_other; 1431 /* 1432 * It wasn't just a link state change. Update 1433 * the other instance's copy of the flags. 1434 */ 1435 pii_other = phyint_inst_other(pii); 1436 if (pii_other != NULL) 1437 pii_other->pii_flags = pii->pii_flags; 1438 return (_B_TRUE); 1439 } 1440 1441 return (_B_FALSE); 1442 } 1443 1444 /* 1445 * Retrieve as many routing socket messages as possible, and try to 1446 * empty the routing sockets. Initiate full scan of targets or interfaces 1447 * as needed. 1448 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1449 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1450 */ 1451 static void 1452 process_rtsock(int rtsock_v4, int rtsock_v6) 1453 { 1454 int nbytes; 1455 int64_t msg[2048 / 8]; 1456 struct rt_msghdr *rtm; 1457 boolean_t need_if_scan = _B_FALSE; 1458 boolean_t need_rt_scan = _B_FALSE; 1459 boolean_t rtm_ifinfo_seen = _B_FALSE; 1460 int type; 1461 1462 /* Read as many messages as possible and try to empty the sockets */ 1463 for (type = AF_INET; ; type = AF_INET6) { 1464 for (;;) { 1465 nbytes = read((type == AF_INET) ? 
rtsock_v4 : 1466 rtsock_v6, msg, sizeof (msg)); 1467 if (nbytes <= 0) { 1468 /* No more messages */ 1469 break; 1470 } 1471 rtm = (struct rt_msghdr *)msg; 1472 if (rtm->rtm_version != RTM_VERSION) { 1473 logerr("process_rtsock: version %d " 1474 "not understood\n", rtm->rtm_version); 1475 break; 1476 } 1477 1478 if (debug & D_PHYINT) { 1479 logdebug("process_rtsock: message %d\n", 1480 rtm->rtm_type); 1481 } 1482 1483 switch (rtm->rtm_type) { 1484 case RTM_NEWADDR: 1485 case RTM_DELADDR: 1486 /* 1487 * Some logical interface has changed, 1488 * have to scan everything to determine 1489 * what actually changed. 1490 */ 1491 need_if_scan = _B_TRUE; 1492 break; 1493 1494 case RTM_IFINFO: 1495 rtm_ifinfo_seen = _B_TRUE; 1496 need_if_scan |= 1497 process_rtm_ifinfo((if_msghdr_t *)rtm, 1498 type); 1499 break; 1500 1501 case RTM_ADD: 1502 case RTM_DELETE: 1503 case RTM_CHANGE: 1504 case RTM_OLDADD: 1505 case RTM_OLDDEL: 1506 need_rt_scan = _B_TRUE; 1507 break; 1508 1509 default: 1510 /* Not interesting */ 1511 break; 1512 } 1513 } 1514 if (type == AF_INET6) 1515 break; 1516 } 1517 1518 if (need_if_scan) { 1519 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1520 logdebug("process_rtsock: synchronizing with kernel\n"); 1521 initifs(); 1522 } else if (rtm_ifinfo_seen) { 1523 if (debug & D_LINKNOTE) 1524 logdebug("process_rtsock: " 1525 "link up/down notification(s) seen\n"); 1526 process_link_state_changes(); 1527 } 1528 1529 if (need_rt_scan) 1530 init_router_targets(); 1531 } 1532 1533 /* 1534 * Look if the phyint instance or one of its logints have been removed from 1535 * the kernel and take appropriate action. 1536 * Uses {pii,li}_in_use. 1537 */ 1538 static void 1539 check_if_removed(struct phyint_instance *pii) 1540 { 1541 struct logint *li; 1542 struct logint *next_li; 1543 1544 /* Detect phyints that have been removed from the kernel. 
*/ 1545 if (!pii->pii_in_use) { 1546 logtrace("%s %s has been removed from kernel\n", 1547 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1548 phyint_inst_delete(pii); 1549 } else { 1550 /* Detect logints that have been removed. */ 1551 for (li = pii->pii_logint; li != NULL; li = next_li) { 1552 next_li = li->li_next; 1553 if (!li->li_in_use) { 1554 logint_delete(li); 1555 } 1556 } 1557 } 1558 } 1559 1560 /* 1561 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1562 * tables defined by mib2.h. Parse the returned data and extract 1563 * the 'routing' information table. Process the 'routing' table 1564 * to get the list of known onlink routers, and update our database. 1565 * These onlink routers will serve as our probe targets. 1566 * Returns false, if any system calls resulted in errors, true otherwise. 1567 */ 1568 static boolean_t 1569 update_router_list(int fd) 1570 { 1571 union { 1572 char ubuf[1024]; 1573 union T_primitives uprim; 1574 } buf; 1575 1576 int flags; 1577 struct strbuf ctlbuf; 1578 struct strbuf databuf; 1579 struct T_optmgmt_req *tor; 1580 struct T_optmgmt_ack *toa; 1581 struct T_error_ack *tea; 1582 struct opthdr *optp; 1583 struct opthdr *req; 1584 int status; 1585 t_scalar_t prim; 1586 1587 tor = (struct T_optmgmt_req *)&buf; 1588 1589 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1590 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1591 tor->OPT_length = sizeof (struct opthdr); 1592 tor->MGMT_flags = T_CURRENT; 1593 1594 req = (struct opthdr *)&tor[1]; 1595 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1596 req->name = 0; 1597 req->len = 0; 1598 1599 ctlbuf.buf = (char *)&buf; 1600 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1601 ctlbuf.maxlen = sizeof (buf); 1602 flags = 0; 1603 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1604 logperror("update_router_list: putmsg(ctl)"); 1605 return (_B_FALSE); 1606 } 1607 1608 /* 1609 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1610 * each table defined in 
mib2.h. Each T_OPTMGMT_ACK msg contains 1611 * a control and data part. The control part contains a struct 1612 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1613 * the level, name and length of the data in the data part. The 1614 * data part contains the actual table data. The last message 1615 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1616 * single option with zero optlen. 1617 */ 1618 1619 for (;;) { 1620 /* 1621 * Go around this loop once for each table. Ignore 1622 * all tables except the routing information table. 1623 */ 1624 flags = 0; 1625 status = getmsg(fd, &ctlbuf, NULL, &flags); 1626 if (status < 0) { 1627 if (errno == EINTR) 1628 continue; 1629 logperror("update_router_list: getmsg(ctl)"); 1630 return (_B_FALSE); 1631 } 1632 if (ctlbuf.len < sizeof (t_scalar_t)) { 1633 logerr("update_router_list: ctlbuf.len %d\n", 1634 ctlbuf.len); 1635 return (_B_FALSE); 1636 } 1637 1638 prim = buf.uprim.type; 1639 1640 switch (prim) { 1641 1642 case T_ERROR_ACK: 1643 tea = &buf.uprim.error_ack; 1644 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1645 logerr("update_router_list: T_ERROR_ACK" 1646 " ctlbuf.len %d\n", ctlbuf.len); 1647 return (_B_FALSE); 1648 } 1649 logerr("update_router_list: T_ERROR_ACK:" 1650 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1651 tea->TLI_error, tea->UNIX_error); 1652 return (_B_FALSE); 1653 1654 case T_OPTMGMT_ACK: 1655 toa = &buf.uprim.optmgmt_ack; 1656 optp = (struct opthdr *)&toa[1]; 1657 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1658 logerr("update_router_list: ctlbuf.len %d\n", 1659 ctlbuf.len); 1660 return (_B_FALSE); 1661 } 1662 if (toa->MGMT_flags != T_SUCCESS) { 1663 logerr("update_router_list: MGMT_flags 0x%lx\n", 1664 toa->MGMT_flags); 1665 return (_B_FALSE); 1666 } 1667 break; 1668 1669 default: 1670 logerr("update_router_list: unknown primitive %ld\n", 1671 prim); 1672 return (_B_FALSE); 1673 } 1674 1675 /* Process the T_OPGMGMT_ACK below */ 1676 assert(prim == 
T_OPTMGMT_ACK); 1677 1678 switch (status) { 1679 case 0: 1680 /* 1681 * We have reached the end of this T_OPTMGMT_ACK 1682 * message. If this is the last message i.e EOD, 1683 * return, else process the next T_OPTMGMT_ACK msg. 1684 */ 1685 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1686 sizeof (struct opthdr)) && optp->len == 0 && 1687 optp->name == 0 && optp->level == 0) { 1688 /* 1689 * This is the EOD message. Return 1690 */ 1691 return (_B_TRUE); 1692 } 1693 continue; 1694 1695 case MORECTL: 1696 case MORECTL | MOREDATA: 1697 /* 1698 * This should not happen. We should be able to read 1699 * the control portion in a single getmsg. 1700 */ 1701 logerr("update_router_list: MORECTL\n"); 1702 return (_B_FALSE); 1703 1704 case MOREDATA: 1705 databuf.maxlen = optp->len; 1706 /* malloc of 0 bytes is ok */ 1707 databuf.buf = malloc((size_t)optp->len); 1708 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1709 logperror("update_router_list: malloc"); 1710 return (_B_FALSE); 1711 } 1712 databuf.len = 0; 1713 flags = 0; 1714 for (;;) { 1715 status = getmsg(fd, NULL, &databuf, &flags); 1716 if (status >= 0) { 1717 break; 1718 } else if (errno == EINTR) { 1719 continue; 1720 } else { 1721 logperror("update_router_list:" 1722 " getmsg(data)"); 1723 free(databuf.buf); 1724 return (_B_FALSE); 1725 } 1726 } 1727 1728 if (optp->level == MIB2_IP && 1729 optp->name == MIB2_IP_ROUTE) { 1730 /* LINTED */ 1731 ire_process_v4((mib2_ipRouteEntry_t *) 1732 databuf.buf, databuf.len); 1733 } else if (optp->level == MIB2_IP6 && 1734 optp->name == MIB2_IP6_ROUTE) { 1735 /* LINTED */ 1736 ire_process_v6((mib2_ipv6RouteEntry_t *) 1737 databuf.buf, databuf.len); 1738 } 1739 free(databuf.buf); 1740 } 1741 } 1742 /* NOTREACHED */ 1743 } 1744 1745 /* 1746 * Examine the IPv4 routing table, for default routers. 
For each default 1747 * router, populate the list of targets of each phyint that is on the same 1748 * link as the default router 1749 */ 1750 static void 1751 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1752 { 1753 mib2_ipRouteEntry_t *rp; 1754 mib2_ipRouteEntry_t *rp1; 1755 struct in_addr nexthop_v4; 1756 mib2_ipRouteEntry_t *endp; 1757 1758 if (len == 0) 1759 return; 1760 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1761 1762 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1763 1764 /* 1765 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1766 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1767 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1768 * This is a potential target for probing, which we try to add 1769 * to the list of probe targets. 1770 */ 1771 for (rp = buf; rp < endp; rp++) { 1772 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1773 continue; 1774 1775 /* Get the nexthop address. */ 1776 nexthop_v4.s_addr = rp->ipRouteNextHop; 1777 1778 /* 1779 * Get the nexthop address. Then determine the outgoing 1780 * interface, by examining all interface IREs, and picking the 1781 * match. We don't look at the interface specified in the route 1782 * because we need to add the router target on all matching 1783 * interfaces anyway; the goal is to avoid falling back to 1784 * multicast when some interfaces are in the same subnet but 1785 * not in the same group. 1786 */ 1787 for (rp1 = buf; rp1 < endp; rp1++) { 1788 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1789 continue; 1790 } 1791 1792 /* 1793 * Determine the interface IRE that matches the nexthop. 1794 * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) 1795 */ 1796 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1797 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1798 /* 1799 * We found the interface ire 1800 */ 1801 router_add_v4(rp1, nexthop_v4); 1802 } 1803 } 1804 } 1805 } 1806 1807 void 1808 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1809 { 1810 char *cp; 1811 char ifname[LIFNAMSIZ + 1]; 1812 struct in6_addr nexthop; 1813 int len; 1814 1815 if (debug & D_TARGET) 1816 logdebug("router_add_v4()\n"); 1817 1818 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1819 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1820 ifname[len] = '\0'; 1821 1822 if (ifname[0] == '\0') 1823 return; 1824 1825 cp = strchr(ifname, IF_SEPARATOR); 1826 if (cp != NULL) 1827 *cp = '\0'; 1828 1829 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1830 router_add_common(AF_INET, ifname, nexthop); 1831 } 1832 1833 void 1834 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1835 { 1836 struct phyint_instance *pii; 1837 struct phyint *pi; 1838 1839 if (debug & D_TARGET) 1840 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1841 1842 /* 1843 * Retrieve the phyint instance; bail if it's not known to us yet. 1844 */ 1845 pii = phyint_inst_lookup(af, ifname); 1846 if (pii == NULL) 1847 return; 1848 1849 /* 1850 * Don't use our own addresses as targets. 1851 */ 1852 if (own_address(pii->pii_af, nexthop)) 1853 return; 1854 1855 /* 1856 * If the phyint is part a named group, then add the address to all 1857 * members of the group; note that this is suboptimal in the IPv4 case 1858 * as it has already been added to all matching interfaces in 1859 * ire_process_v4(). Otherwise, add the address only to the phyint 1860 * itself, since other phyints in the anongroup may not be on the same 1861 * subnet. 
1862 */ 1863 pi = pii->pii_phyint; 1864 if (pi->pi_group == phyint_anongroup) { 1865 target_add(pii, nexthop, _B_TRUE); 1866 } else { 1867 pi = pi->pi_group->pg_phyint; 1868 for (; pi != NULL; pi = pi->pi_pgnext) 1869 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1870 } 1871 } 1872 1873 /* 1874 * Examine the IPv6 routing table, for default routers. For each default 1875 * router, populate the list of targets of each phyint that is on the same 1876 * link as the default router 1877 */ 1878 static void 1879 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1880 { 1881 mib2_ipv6RouteEntry_t *rp; 1882 mib2_ipv6RouteEntry_t *endp; 1883 struct in6_addr nexthop_v6; 1884 1885 if (debug & D_TARGET) 1886 logdebug("ire_process_v6(len %d)\n", len); 1887 1888 if (len == 0) 1889 return; 1890 1891 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1892 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1893 1894 /* 1895 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1896 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1897 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1898 * This is a potential target for probing, which we try to add 1899 * to the list of probe targets. 1900 */ 1901 for (rp = buf; rp < endp; rp++) { 1902 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1903 continue; 1904 1905 /* 1906 * We have the outgoing interface in ipv6RouteIfIndex 1907 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1908 * interface must be present for link-local addresses. Since 1909 * we use only link-local addreses for probing, we don't 1910 * consider the case when the outgoing interface is not 1911 * known and we need to scan interface ires 1912 */ 1913 nexthop_v6 = rp->ipv6RouteNextHop; 1914 if (rp->ipv6RouteIfIndex.o_length != 0) { 1915 /* 1916 * We already have the outgoing interface 1917 * in ipv6RouteIfIndex. 
1918 */ 1919 router_add_v6(rp, nexthop_v6); 1920 } 1921 } 1922 } 1923 1924 1925 void 1926 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1927 { 1928 char ifname[LIFNAMSIZ + 1]; 1929 char *cp; 1930 int len; 1931 1932 if (debug & D_TARGET) 1933 logdebug("router_add_v6()\n"); 1934 1935 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1936 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1937 ifname[len] = '\0'; 1938 1939 if (ifname[0] == '\0') 1940 return; 1941 1942 cp = strchr(ifname, IF_SEPARATOR); 1943 if (cp != NULL) 1944 *cp = '\0'; 1945 1946 router_add_common(AF_INET6, ifname, nexthop_v6); 1947 } 1948 1949 1950 1951 /* 1952 * Build a list of target routers, by scanning the routing tables. 1953 * It is assumed that interface routes exist, to reach the routers. 1954 */ 1955 static void 1956 init_router_targets(void) 1957 { 1958 struct target *tg; 1959 struct target *next_tg; 1960 struct phyint_instance *pii; 1961 struct phyint *pi; 1962 1963 if (force_mcast) 1964 return; 1965 1966 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1967 pi = pii->pii_phyint; 1968 /* 1969 * Exclude ptp and host targets. Set tg_in_use to false, 1970 * only for router targets. 
1971 */ 1972 if (!pii->pii_targets_are_routers || 1973 (pi->pi_flags & IFF_POINTOPOINT)) 1974 continue; 1975 1976 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1977 tg->tg_in_use = 0; 1978 } 1979 1980 if (mibfd < 0) { 1981 mibfd = open("/dev/ip", O_RDWR); 1982 if (mibfd < 0) { 1983 logperror("mibopen: ip open"); 1984 exit(1); 1985 } 1986 } 1987 1988 if (!update_router_list(mibfd)) { 1989 (void) close(mibfd); 1990 mibfd = -1; 1991 } 1992 1993 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1994 if (!pii->pii_targets_are_routers || 1995 (pi->pi_flags & IFF_POINTOPOINT)) 1996 continue; 1997 1998 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1999 next_tg = tg->tg_next; 2000 if (!tg->tg_in_use) { 2001 target_delete(tg); 2002 } 2003 } 2004 } 2005 } 2006 2007 /* 2008 * Attempt to assign host targets to any interfaces that do not currently 2009 * have probe targets by sharing targets with other interfaces in the group. 2010 */ 2011 static void 2012 init_host_targets(void) 2013 { 2014 struct phyint_instance *pii; 2015 struct phyint_group *pg; 2016 2017 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2018 pg = pii->pii_phyint->pi_group; 2019 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2020 dup_host_targets(pii); 2021 } 2022 } 2023 2024 /* 2025 * Duplicate host targets from other phyints of the group to 2026 * the phyint instance 'desired_pii'. 2027 */ 2028 static void 2029 dup_host_targets(struct phyint_instance *desired_pii) 2030 { 2031 int af; 2032 struct phyint *pi; 2033 struct phyint_instance *pii; 2034 struct target *tg; 2035 2036 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2037 2038 af = desired_pii->pii_af; 2039 2040 /* 2041 * For every phyint in the same group as desired_pii, check if 2042 * it has any host targets. If so add them to desired_pii. 
2043 */ 2044 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2045 pii = PHYINT_INSTANCE(pi, af); 2046 /* 2047 * We know that we don't have targets on this phyint instance 2048 * since we have been called. But we still check for 2049 * pii_targets_are_routers because another phyint instance 2050 * could have router targets, since IFF_NOFAILOVER addresses 2051 * on different phyint instances may belong to different 2052 * subnets. 2053 */ 2054 if ((pii == NULL) || (pii == desired_pii) || 2055 pii->pii_targets_are_routers) 2056 continue; 2057 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2058 target_create(desired_pii, tg->tg_address, _B_FALSE); 2059 } 2060 } 2061 } 2062 2063 static void 2064 usage(char *cmd) 2065 { 2066 (void) fprintf(stderr, "usage: %s\n", cmd); 2067 } 2068 2069 2070 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2071 2072 /* Get an option from the /etc/default/mpathd file */ 2073 static char * 2074 getdefault(char *name) 2075 { 2076 char namebuf[BUFSIZ]; 2077 char *value = NULL; 2078 2079 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2080 char *cp; 2081 int flags; 2082 2083 /* 2084 * ignore case 2085 */ 2086 flags = defcntl(DC_GETFLAGS, 0); 2087 TURNOFF(flags, DC_CASE); 2088 (void) defcntl(DC_SETFLAGS, flags); 2089 2090 /* Add "=" to the name */ 2091 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2092 (void) strncat(namebuf, "=", 2); 2093 2094 if ((cp = defread(namebuf)) != NULL) 2095 value = strdup(cp); 2096 2097 /* close */ 2098 (void) defopen((char *)NULL); 2099 } 2100 return (value); 2101 } 2102 2103 2104 /* 2105 * Command line options below 2106 */ 2107 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2108 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2109 static boolean_t adopt = _B_FALSE; 2110 static boolean_t foreground = _B_FALSE; 2111 2112 int 2113 main(int argc, char *argv[]) 2114 { 2115 int i; 2116 int c; 2117 struct phyint_instance *pii; 2118 char 
*value; 2119 2120 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2121 srandom(gethostid()); /* Initialize the random number generator */ 2122 2123 /* 2124 * NOTE: The messages output by in.mpathd are not suitable for 2125 * translation, so we do not call textdomain(). 2126 */ 2127 (void) setlocale(LC_ALL, ""); 2128 2129 /* 2130 * Get the user specified value of 'failure detection time' 2131 * from /etc/default/mpathd 2132 */ 2133 value = getdefault("FAILURE_DETECTION_TIME"); 2134 if (value != NULL) { 2135 user_failure_detection_time = 2136 (int)strtol((char *)value, NULL, 0); 2137 2138 if (user_failure_detection_time <= 0) { 2139 user_failure_detection_time = FAILURE_DETECTION_TIME; 2140 logerr("Invalid failure detection time %s, assuming " 2141 "default %d\n", value, user_failure_detection_time); 2142 2143 } else if (user_failure_detection_time < 2144 MIN_FAILURE_DETECTION_TIME) { 2145 user_failure_detection_time = 2146 MIN_FAILURE_DETECTION_TIME; 2147 logerr("Too small failure detection time of %s, " 2148 "assuming minimum %d\n", value, 2149 user_failure_detection_time); 2150 } 2151 free(value); 2152 } else { 2153 /* User has not specified the parameter, Use default value */ 2154 user_failure_detection_time = FAILURE_DETECTION_TIME; 2155 } 2156 2157 /* 2158 * This gives the frequency at which probes will be sent. 2159 * When fdt ms elapses, we should be able to determine 2160 * whether 5 consecutive probes have failed or not. 2161 * 1 probe will be sent in every user_probe_interval ms, 2162 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2163 * user_probe_interval. Thus when we send out probe 'n' we 2164 * can be sure that probe 'n - 2' is lost, if we have not 2165 * got the ack. (since the probe interval is > crtt). But 2166 * probe 'n - 1' may be a valid unacked probe, since the 2167 * time between 2 successive probes could be as small as 2168 * 0.5 * user_probe_interval. 
Hence the NUM_PROBE_FAILS + 2 2169 */ 2170 user_probe_interval = user_failure_detection_time / 2171 (NUM_PROBE_FAILS + 2); 2172 2173 /* 2174 * Get the user specified value of failback_enabled from 2175 * /etc/default/mpathd 2176 */ 2177 value = getdefault("FAILBACK"); 2178 if (value != NULL) { 2179 if (strncasecmp(value, "yes", 3) == 0) 2180 failback_enabled = _B_TRUE; 2181 else if (strncasecmp(value, "no", 2) == 0) 2182 failback_enabled = _B_FALSE; 2183 else 2184 logerr("Invalid value for FAILBACK %s\n", value); 2185 free(value); 2186 } else { 2187 failback_enabled = _B_TRUE; 2188 } 2189 2190 /* 2191 * Get the user specified value of track_all_phyints from 2192 * /etc/default/mpathd. The sense is reversed in 2193 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2194 */ 2195 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2196 if (value != NULL) { 2197 if (strncasecmp(value, "yes", 3) == 0) 2198 track_all_phyints = _B_FALSE; 2199 else if (strncasecmp(value, "no", 2) == 0) 2200 track_all_phyints = _B_TRUE; 2201 else 2202 logerr("Invalid value for " 2203 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2204 free(value); 2205 } else { 2206 track_all_phyints = _B_FALSE; 2207 } 2208 2209 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2210 switch (c) { 2211 case 'a': 2212 adopt = _B_TRUE; 2213 break; 2214 case 'm': 2215 force_mcast = _B_TRUE; 2216 break; 2217 case 'd': 2218 debug = D_ALL; 2219 foreground = _B_TRUE; 2220 break; 2221 case 'D': 2222 i = (int)strtol(optarg, NULL, 0); 2223 if (i == 0) { 2224 (void) fprintf(stderr, "Bad debug flags: %s\n", 2225 optarg); 2226 exit(1); 2227 } 2228 debug |= i; 2229 foreground = _B_TRUE; 2230 break; 2231 case 'l': 2232 /* 2233 * Turn off link state notification handling. 2234 * Undocumented command line flag, for debugging 2235 * purposes. 
2236 */ 2237 handle_link_notifications = _B_FALSE; 2238 break; 2239 default: 2240 usage(argv[0]); 2241 exit(1); 2242 } 2243 } 2244 2245 /* 2246 * The sockets for the loopback command interface should be listening 2247 * before we fork and exit in daemonize(). This way, whoever started us 2248 * can use the loopback interface as soon as they get a zero exit 2249 * status. 2250 */ 2251 lsock_v4 = setup_listener(AF_INET); 2252 lsock_v6 = setup_listener(AF_INET6); 2253 2254 if (lsock_v4 < 0 && lsock_v6 < 0) { 2255 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2256 exit(1); 2257 } 2258 2259 if (!foreground) { 2260 if (!daemonize()) { 2261 logerr("cannot daemonize\n"); 2262 exit(EXIT_FAILURE); 2263 } 2264 initlog(); 2265 } 2266 2267 /* 2268 * Initializations: 2269 * 1. Create ifsock* sockets. These are used for performing SIOC* 2270 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2271 * 2. Initialize a pipe for handling/recording signal events. 2272 * 3. Create the routing sockets, used for listening 2273 * to routing / interface changes. 2274 * 4. phyint_init() - Initialize physical interface state 2275 * (in mpd_tables.c). Must be done before creating interfaces, 2276 * which timer_init() does indirectly. 2277 * 5. timer_init() - Initialize timer related stuff 2278 * 6. initifs() - Initialize our database of all known interfaces 2279 * 7. init_router_targets() - Initialize our database of all known 2280 * router targets. 
2281 */ 2282 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2283 if (ifsock_v4 < 0) { 2284 logperror("main: IPv4 socket open"); 2285 exit(1); 2286 } 2287 2288 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2289 if (ifsock_v6 < 0) { 2290 logperror("main: IPv6 socket open"); 2291 exit(1); 2292 } 2293 2294 setup_eventpipe(); 2295 2296 rtsock_v4 = setup_rtsock(AF_INET); 2297 rtsock_v6 = setup_rtsock(AF_INET6); 2298 2299 if (phyint_init() == -1) { 2300 logerr("cannot initialize physical interface structures"); 2301 exit(1); 2302 } 2303 2304 timer_init(); 2305 2306 initifs(); 2307 2308 /* Inform kernel whether failback is enabled or disabled */ 2309 if (ioctl(ifsock_v4, SIOCSIPMPFAILBACK, (int *)&failback_enabled) < 0) { 2310 logperror("main: ioctl (SIOCSIPMPFAILBACK)"); 2311 exit(1); 2312 } 2313 2314 /* 2315 * If we're operating in "adopt" mode and no interfaces need to be 2316 * tracked, shut down (ifconfig(1M) will restart us on demand if 2317 * interfaces are subsequently put into multipathing groups). 2318 */ 2319 if (adopt && phyint_instances == NULL) 2320 exit(0); 2321 2322 /* 2323 * Main body. Keep listening for activity on any of the sockets 2324 * that we are monitoring and take appropriate action as necessary. 2325 * signals are also handled synchronously. 
2326 */ 2327 for (;;) { 2328 if (poll(pollfds, pollfd_num, -1) < 0) { 2329 if (errno == EINTR) 2330 continue; 2331 logperror("main: poll"); 2332 exit(1); 2333 } 2334 for (i = 0; i < pollfd_num; i++) { 2335 if ((pollfds[i].fd == -1) || 2336 !(pollfds[i].revents & POLLIN)) 2337 continue; 2338 if (pollfds[i].fd == eventpipe_read) { 2339 in_signal(eventpipe_read); 2340 break; 2341 } 2342 if (pollfds[i].fd == rtsock_v4 || 2343 pollfds[i].fd == rtsock_v6) { 2344 process_rtsock(rtsock_v4, rtsock_v6); 2345 break; 2346 } 2347 for (pii = phyint_instances; pii != NULL; 2348 pii = pii->pii_next) { 2349 if (pollfds[i].fd == pii->pii_probe_sock) { 2350 if (pii->pii_af == AF_INET) 2351 in_data(pii); 2352 else 2353 in6_data(pii); 2354 break; 2355 } 2356 } 2357 if (pollfds[i].fd == lsock_v4) 2358 loopback_cmd(lsock_v4, AF_INET); 2359 else if (pollfds[i].fd == lsock_v6) 2360 loopback_cmd(lsock_v6, AF_INET6); 2361 } 2362 if (full_scan_required) { 2363 initifs(); 2364 full_scan_required = _B_FALSE; 2365 } 2366 } 2367 /* NOTREACHED */ 2368 return (EXIT_SUCCESS); 2369 } 2370 2371 static int 2372 setup_listener(int af) 2373 { 2374 int sock; 2375 int on; 2376 int len; 2377 int ret; 2378 struct sockaddr_storage laddr; 2379 struct sockaddr_in *sin; 2380 struct sockaddr_in6 *sin6; 2381 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2382 2383 assert(af == AF_INET || af == AF_INET6); 2384 2385 sock = socket(af, SOCK_STREAM, 0); 2386 if (sock < 0) { 2387 logperror("setup_listener: socket"); 2388 exit(1); 2389 } 2390 2391 on = 1; 2392 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2393 sizeof (on)) < 0) { 2394 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2395 exit(1); 2396 } 2397 2398 bzero(&laddr, sizeof (laddr)); 2399 laddr.ss_family = af; 2400 2401 if (af == AF_INET) { 2402 sin = (struct sockaddr_in *)&laddr; 2403 sin->sin_port = htons(MPATHD_PORT); 2404 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2405 len = sizeof (struct sockaddr_in); 2406 } else { 2407 
sin6 = (struct sockaddr_in6 *)&laddr; 2408 sin6->sin6_port = htons(MPATHD_PORT); 2409 sin6->sin6_addr = loopback_addr; 2410 len = sizeof (struct sockaddr_in6); 2411 } 2412 2413 ret = bind(sock, (struct sockaddr *)&laddr, len); 2414 if (ret < 0) { 2415 if (errno == EADDRINUSE) { 2416 /* 2417 * Another instance of mpathd may be already active. 2418 */ 2419 logerr("main: is another instance of in.mpathd " 2420 "already active?\n"); 2421 exit(1); 2422 } else { 2423 (void) close(sock); 2424 return (-1); 2425 } 2426 } 2427 if (listen(sock, 30) < 0) { 2428 logperror("main: listen"); 2429 exit(1); 2430 } 2431 if (poll_add(sock) == -1) { 2432 (void) close(sock); 2433 exit(1); 2434 } 2435 2436 return (sock); 2437 } 2438 2439 /* 2440 * Table of commands and their expected size; used by loopback_cmd(). 2441 */ 2442 static struct { 2443 const char *name; 2444 unsigned int size; 2445 } commands[] = { 2446 { "MI_PING", sizeof (uint32_t) }, 2447 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2448 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2449 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2450 { "MI_QUERY", sizeof (mi_query_t) } 2451 }; 2452 2453 /* 2454 * Commands received over the loopback interface come here. Currently 2455 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2456 * module. ifconfig only makes a connection, and closes it to check if 2457 * in.mpathd is running. 2458 * if_mpadm sends commands in the format specified by the mpathd_interface 2459 * structure. 
2460 */ 2461 static void 2462 loopback_cmd(int sock, int family) 2463 { 2464 int newfd; 2465 ssize_t len; 2466 struct sockaddr_storage peer; 2467 struct sockaddr_in *peer_sin; 2468 struct sockaddr_in6 *peer_sin6; 2469 socklen_t peerlen; 2470 union mi_commands mpi; 2471 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2472 char abuf[INET6_ADDRSTRLEN]; 2473 uint_t cmd; 2474 int retval; 2475 2476 peerlen = sizeof (peer); 2477 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2478 if (newfd < 0) { 2479 logperror("loopback_cmd: accept"); 2480 return; 2481 } 2482 2483 switch (family) { 2484 case AF_INET: 2485 /* 2486 * Validate the address and port to make sure that 2487 * non privileged processes don't connect and start 2488 * talking to us. 2489 */ 2490 if (peerlen != sizeof (struct sockaddr_in)) { 2491 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2492 (void) close(newfd); 2493 return; 2494 } 2495 peer_sin = (struct sockaddr_in *)&peer; 2496 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2497 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2498 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2499 abuf, sizeof (abuf)); 2500 logerr("Attempt to connect from addr %s port %d\n", 2501 abuf, ntohs(peer_sin->sin_port)); 2502 (void) close(newfd); 2503 return; 2504 } 2505 break; 2506 2507 case AF_INET6: 2508 if (peerlen != sizeof (struct sockaddr_in6)) { 2509 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2510 (void) close(newfd); 2511 return; 2512 } 2513 /* 2514 * Validate the address and port to make sure that 2515 * non privileged processes don't connect and start 2516 * talking to us. 
2517 */ 2518 peer_sin6 = (struct sockaddr_in6 *)&peer; 2519 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2520 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2521 &loopback_addr))) { 2522 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2523 sizeof (abuf)); 2524 logerr("Attempt to connect from addr %s port %d\n", 2525 abuf, ntohs(peer_sin6->sin6_port)); 2526 (void) close(newfd); 2527 return; 2528 } 2529 2530 default: 2531 logdebug("loopback_cmd: family %d\n", family); 2532 (void) close(newfd); 2533 return; 2534 } 2535 2536 /* 2537 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2538 * all supported commands 2539 */ 2540 len = read(newfd, &mpi, sizeof (mpi)); 2541 2542 /* 2543 * ifconfig does not send any data. Just tests to see if mpathd 2544 * is already running. 2545 */ 2546 if (len <= 0) { 2547 (void) close(newfd); 2548 return; 2549 } 2550 2551 /* 2552 * In theory, we can receive any sized message for a stream socket, 2553 * but we don't expect that to happen for a small message over a 2554 * loopback connection. 
2555 */ 2556 if (len < sizeof (uint32_t)) { 2557 logerr("loopback_cmd: bad command format or read returns " 2558 "partial data %d\n", len); 2559 } 2560 2561 cmd = mpi.mi_command; 2562 if (cmd >= MI_NCMD) { 2563 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2564 (void) close(newfd); 2565 return; 2566 } 2567 2568 if (len < commands[cmd].size) { 2569 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2570 commands[cmd].name, commands[cmd].size, len); 2571 (void) close(newfd); 2572 return; 2573 } 2574 2575 retval = process_cmd(newfd, &mpi); 2576 if (retval != IPMP_SUCCESS) { 2577 logerr("failed processing %s: %s\n", commands[cmd].name, 2578 ipmp_errmsg(retval)); 2579 } 2580 (void) close(newfd); 2581 } 2582 2583 extern int global_errno; /* set by failover() or failback() */ 2584 2585 /* 2586 * Process the offline, undo offline and set original index commands, 2587 * received from if_mpadm(1M) 2588 */ 2589 static unsigned int 2590 process_cmd(int newfd, union mi_commands *mpi) 2591 { 2592 uint_t nif = 0; 2593 uint32_t cmd; 2594 struct phyint *pi; 2595 struct phyint *pi2; 2596 struct phyint_group *pg; 2597 boolean_t success; 2598 int error; 2599 struct mi_offline *mio; 2600 struct mi_undo_offline *miu; 2601 struct lifreq lifr; 2602 int ifsock; 2603 struct mi_setoindex *mis; 2604 2605 cmd = mpi->mi_command; 2606 2607 switch (cmd) { 2608 case MI_OFFLINE: 2609 mio = &mpi->mi_ocmd; 2610 /* 2611 * Lookup the interface that needs to be offlined. 2612 * If it does not exist, return a suitable error. 2613 */ 2614 pi = phyint_lookup(mio->mio_ifname); 2615 if (pi == NULL) 2616 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2617 2618 /* 2619 * Verify that the minimum redundancy requirements are met. 2620 * The multipathing group must have at least the specified 2621 * number of functional interfaces after offlining the 2622 * requested interface. Otherwise return a suitable error. 
2623 */ 2624 pg = pi->pi_group; 2625 nif = 0; 2626 if (pg != phyint_anongroup) { 2627 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2628 pi2 = pi2->pi_pgnext) { 2629 if ((pi2->pi_state == PI_RUNNING) || 2630 (pg->pg_groupfailed && 2631 !(pi2->pi_flags & IFF_OFFLINE))) 2632 nif++; 2633 } 2634 } 2635 if (nif < mio->mio_min_redundancy) 2636 return (send_result(newfd, IPMP_EMINRED, 0)); 2637 2638 /* 2639 * The order of operation is to set IFF_OFFLINE, followed by 2640 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2641 * can be created. Subsequent failover moves everything on 2642 * the OFFLINE interface to some other functional interface. 2643 */ 2644 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2645 if (success) { 2646 if (!pi->pi_empty) { 2647 error = try_failover(pi, FAILOVER_NORMAL); 2648 if (error != 0) { 2649 if (!change_lif_flags(pi, IFF_OFFLINE, 2650 _B_FALSE)) { 2651 logerr("process_cmd: couldn't" 2652 " clear OFFLINE flag on" 2653 " %s\n", pi->pi_name); 2654 /* 2655 * Offline interfaces should 2656 * not be probed. 2657 */ 2658 stop_probing(pi); 2659 } 2660 return (send_result(newfd, error, 2661 global_errno)); 2662 } 2663 } 2664 } else { 2665 return (send_result(newfd, IPMP_FAILURE, errno)); 2666 } 2667 2668 /* 2669 * The interface is now Offline, so stop probing it. 2670 * Note that if_mpadm(1M) will down the test addresses, 2671 * after receiving a success reply from us. The routing 2672 * socket message will then make us close the socket used 2673 * for sending probes. But it is more logical that an 2674 * offlined interface must not be probed, even if it has 2675 * test addresses. 2676 */ 2677 stop_probing(pi); 2678 return (send_result(newfd, IPMP_SUCCESS, 0)); 2679 2680 case MI_UNDO_OFFLINE: 2681 miu = &mpi->mi_ucmd; 2682 /* 2683 * Undo the offline command. As usual lookup the interface. 2684 * Send an error if it does not exist. 
2685 */ 2686 pi = phyint_lookup(miu->miu_ifname); 2687 if (pi == NULL) 2688 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2689 2690 /* 2691 * Inverse of the offline operation. Do a failback, and then 2692 * clear the IFF_OFFLINE flag. 2693 */ 2694 error = do_failback(pi, _B_TRUE); 2695 if (error == IPMP_EFBPARTIAL) 2696 return (send_result(newfd, IPMP_EFBPARTIAL, 0)); 2697 error = do_failback(pi, _B_FALSE); 2698 2699 switch (error) { 2700 case IPMP_SUCCESS: 2701 if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) { 2702 logdebug("undo error %X\n", global_errno); 2703 error = IPMP_FAILURE; 2704 break; 2705 } 2706 /* FALLTHROUGH */ 2707 2708 case IPMP_EFBPARTIAL: 2709 /* 2710 * Reset the state of the interface based on the 2711 * current link state; if this phyint subsequently 2712 * acquires a test address, the state will be changed 2713 * again later as a result of the probes. 2714 */ 2715 if (LINK_UP(pi)) 2716 phyint_chstate(pi, PI_RUNNING); 2717 else 2718 phyint_chstate(pi, PI_FAILED); 2719 break; 2720 2721 case IPMP_FAILURE: 2722 break; 2723 2724 default: 2725 logdebug("do_failback: unexpected return value\n"); 2726 break; 2727 } 2728 return (send_result(newfd, error, global_errno)); 2729 2730 case MI_SETOINDEX: 2731 mis = &mpi->mi_scmd; 2732 2733 /* Get the socket for doing ioctls */ 2734 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2735 2736 /* 2737 * Get index of new original interface. 2738 * The index is returned in lifr.lifr_index. 2739 */ 2740 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2741 sizeof (lifr.lifr_name)); 2742 2743 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2744 return (send_result(newfd, IPMP_FAILURE, errno)); 2745 2746 /* 2747 * Set new original interface index. 2748 * The new index was put into lifr.lifr_index by the 2749 * SIOCGLIFINDEX ioctl. 
2750 */ 2751 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2752 sizeof (lifr.lifr_name)); 2753 2754 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2755 return (send_result(newfd, IPMP_FAILURE, errno)); 2756 2757 return (send_result(newfd, IPMP_SUCCESS, 0)); 2758 2759 case MI_QUERY: 2760 return (process_query(newfd, &mpi->mi_qcmd)); 2761 2762 default: 2763 break; 2764 } 2765 2766 return (send_result(newfd, IPMP_EPROTO, 0)); 2767 } 2768 2769 /* 2770 * Process the query request pointed to by `miq' and send a reply on file 2771 * descriptor `fd'. Returns an IPMP error code. 2772 */ 2773 static unsigned int 2774 process_query(int fd, mi_query_t *miq) 2775 { 2776 ipmp_groupinfo_t *grinfop; 2777 ipmp_groupinfolist_t *grlp; 2778 ipmp_grouplist_t *grlistp; 2779 ipmp_ifinfo_t *ifinfop; 2780 ipmp_ifinfolist_t *iflp; 2781 ipmp_snap_t *snap; 2782 unsigned int retval; 2783 2784 switch (miq->miq_inforeq) { 2785 case IPMP_GROUPLIST: 2786 retval = getgrouplist(&grlistp); 2787 if (retval != IPMP_SUCCESS) 2788 return (send_result(fd, retval, errno)); 2789 2790 retval = send_result(fd, IPMP_SUCCESS, 0); 2791 if (retval == IPMP_SUCCESS) 2792 retval = send_grouplist(fd, grlistp); 2793 2794 ipmp_freegrouplist(grlistp); 2795 return (retval); 2796 2797 case IPMP_GROUPINFO: 2798 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2799 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2800 if (retval != IPMP_SUCCESS) 2801 return (send_result(fd, retval, errno)); 2802 2803 retval = send_result(fd, IPMP_SUCCESS, 0); 2804 if (retval == IPMP_SUCCESS) 2805 retval = send_groupinfo(fd, grinfop); 2806 2807 ipmp_freegroupinfo(grinfop); 2808 return (retval); 2809 2810 case IPMP_IFINFO: 2811 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2812 retval = getifinfo(miq->miq_ifname, &ifinfop); 2813 if (retval != IPMP_SUCCESS) 2814 return (send_result(fd, retval, errno)); 2815 2816 retval = send_result(fd, IPMP_SUCCESS, 0); 2817 if (retval == IPMP_SUCCESS) 2818 retval = send_ifinfo(fd, ifinfop); 2819 2820 
ipmp_freeifinfo(ifinfop); 2821 return (retval); 2822 2823 case IPMP_SNAP: 2824 retval = getsnap(&snap); 2825 if (retval != IPMP_SUCCESS) 2826 return (send_result(fd, retval, errno)); 2827 2828 retval = send_result(fd, IPMP_SUCCESS, 0); 2829 if (retval != IPMP_SUCCESS) 2830 goto out; 2831 2832 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2833 if (retval != IPMP_SUCCESS) 2834 goto out; 2835 2836 retval = send_grouplist(fd, snap->sn_grlistp); 2837 if (retval != IPMP_SUCCESS) 2838 goto out; 2839 2840 iflp = snap->sn_ifinfolistp; 2841 for (; iflp != NULL; iflp = iflp->ifl_next) { 2842 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2843 if (retval != IPMP_SUCCESS) 2844 goto out; 2845 } 2846 2847 grlp = snap->sn_grinfolistp; 2848 for (; grlp != NULL; grlp = grlp->grl_next) { 2849 retval = send_groupinfo(fd, grlp->grl_grinfop); 2850 if (retval != IPMP_SUCCESS) 2851 goto out; 2852 } 2853 out: 2854 ipmp_snap_free(snap); 2855 return (retval); 2856 2857 default: 2858 break; 2859 2860 } 2861 return (send_result(fd, IPMP_EPROTO, 0)); 2862 } 2863 2864 /* 2865 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2866 * Returns an IPMP error code. 2867 */ 2868 static unsigned int 2869 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2870 { 2871 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2872 unsigned int retval; 2873 2874 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2875 if (retval != IPMP_SUCCESS) 2876 return (retval); 2877 2878 return (ipmp_writetlv(fd, IPMP_IFLIST, 2879 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2880 } 2881 2882 /* 2883 * Send the interface information pointed to by `ifinfop' on file descriptor 2884 * `fd'. Returns an IPMP error code. 2885 */ 2886 static unsigned int 2887 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2888 { 2889 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2890 } 2891 2892 /* 2893 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 
2894 * Returns an IPMP error code. 2895 */ 2896 static unsigned int 2897 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2898 { 2899 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2900 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2901 } 2902 2903 /* 2904 * Initialize an mi_result_t structure using `error' and `syserror' and 2905 * send it on file descriptor `fd'. Returns an IPMP error code. 2906 */ 2907 static unsigned int 2908 send_result(int fd, unsigned int error, int syserror) 2909 { 2910 mi_result_t me; 2911 2912 me.me_mpathd_error = error; 2913 if (error == IPMP_FAILURE) 2914 me.me_sys_error = syserror; 2915 else 2916 me.me_sys_error = 0; 2917 2918 return (ipmp_write(fd, &me, sizeof (me))); 2919 } 2920 2921 /* 2922 * Daemonize the process. 2923 */ 2924 static boolean_t 2925 daemonize(void) 2926 { 2927 switch (fork()) { 2928 case -1: 2929 return (_B_FALSE); 2930 2931 case 0: 2932 /* 2933 * Lose our controlling terminal, and become both a session 2934 * leader and a process group leader. 2935 */ 2936 if (setsid() == -1) 2937 return (_B_FALSE); 2938 2939 /* 2940 * Under POSIX, a session leader can accidentally (through 2941 * open(2)) acquire a controlling terminal if it does not 2942 * have one. Just to be safe, fork() again so we are not a 2943 * session leader. 2944 */ 2945 switch (fork()) { 2946 case -1: 2947 return (_B_FALSE); 2948 2949 case 0: 2950 (void) chdir("/"); 2951 (void) umask(022); 2952 (void) fdwalk(closefunc, NULL); 2953 break; 2954 2955 default: 2956 _exit(EXIT_SUCCESS); 2957 } 2958 break; 2959 2960 default: 2961 _exit(EXIT_SUCCESS); 2962 } 2963 2964 return (_B_TRUE); 2965 } 2966 2967 /* 2968 * The parent has created some fds before forking on purpose, keep them open. 2969 */ 2970 static int 2971 closefunc(void *not_used, int fd) 2972 /* ARGSUSED */ 2973 { 2974 if (fd != lsock_v4 && fd != lsock_v6) 2975 (void) close(fd); 2976 return (0); 2977 } 2978 2979 /* LOGGER */ 2980 2981 #include <syslog.h> 2982 2983 /* 2984 * Logging routines. 
All routines log to syslog, unless the daemon is 2985 * running in the foreground, in which case the logging goes to stderr. 2986 * 2987 * The following routines are available: 2988 * 2989 * logdebug(): A printf-like function for outputting debug messages 2990 * (messages at LOG_DEBUG) that are only of use to developers. 2991 * 2992 * logtrace(): A printf-like function for outputting tracing messages 2993 * (messages at LOG_INFO) from the daemon. This is typically used 2994 * to log the receipt of interesting network-related conditions. 2995 * 2996 * logerr(): A printf-like function for outputting error messages 2997 * (messages at LOG_ERR) from the daemon. 2998 * 2999 * logperror*(): A set of functions used to output error messages 3000 * (messages at LOG_ERR); these automatically append strerror(errno) 3001 * and a newline to the message passed to them. 3002 * 3003 * NOTE: since the logging functions write to syslog, the messages passed 3004 * to them are not eligible for localization. Thus, gettext() must 3005 * *not* be used. 3006 */ 3007 3008 static int logging = 0; 3009 3010 static void 3011 initlog(void) 3012 { 3013 logging++; 3014 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3015 } 3016 3017 /* PRINTFLIKE1 */ 3018 void 3019 logerr(char *fmt, ...) 3020 { 3021 va_list ap; 3022 3023 va_start(ap, fmt); 3024 3025 if (logging) 3026 vsyslog(LOG_ERR, fmt, ap); 3027 else 3028 (void) vfprintf(stderr, fmt, ap); 3029 va_end(ap); 3030 } 3031 3032 /* PRINTFLIKE1 */ 3033 void 3034 logtrace(char *fmt, ...) 3035 { 3036 va_list ap; 3037 3038 va_start(ap, fmt); 3039 3040 if (logging) 3041 vsyslog(LOG_INFO, fmt, ap); 3042 else 3043 (void) vfprintf(stderr, fmt, ap); 3044 va_end(ap); 3045 } 3046 3047 /* PRINTFLIKE1 */ 3048 void 3049 logdebug(char *fmt, ...) 
3050 { 3051 va_list ap; 3052 3053 va_start(ap, fmt); 3054 3055 if (logging) 3056 vsyslog(LOG_DEBUG, fmt, ap); 3057 else 3058 (void) vfprintf(stderr, fmt, ap); 3059 va_end(ap); 3060 } 3061 3062 /* PRINTFLIKE1 */ 3063 void 3064 logperror(char *str) 3065 { 3066 if (logging) 3067 syslog(LOG_ERR, "%s: %m\n", str); 3068 else 3069 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3070 } 3071 3072 void 3073 logperror_pii(struct phyint_instance *pii, char *str) 3074 { 3075 if (logging) { 3076 syslog(LOG_ERR, "%s (%s %s): %m\n", 3077 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3078 } else { 3079 (void) fprintf(stderr, "%s (%s %s): %s\n", 3080 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3081 strerror(errno)); 3082 } 3083 } 3084 3085 void 3086 logperror_li(struct logint *li, char *str) 3087 { 3088 struct phyint_instance *pii = li->li_phyint_inst; 3089 3090 if (logging) { 3091 syslog(LOG_ERR, "%s (%s %s): %m\n", 3092 str, AF_STR(pii->pii_af), li->li_name); 3093 } else { 3094 (void) fprintf(stderr, "%s (%s %s): %s\n", 3095 str, AF_STR(pii->pii_af), li->li_name, 3096 strerror(errno)); 3097 } 3098 } 3099 3100 void 3101 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3102 { 3103 if (polled) 3104 (void) poll_remove(pii->pii_probe_sock); 3105 (void) close(pii->pii_probe_sock); 3106 pii->pii_probe_sock = -1; 3107 pii->pii_basetime_inited = 0; 3108 } 3109