1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License, Version 1.0 only 6 * (the "License"). You may not use this file except in compliance 7 * with the License. 8 * 9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 10 * or http://www.opensolaris.org/os/licensing. 11 * See the License for the specific language governing permissions 12 * and limitations under the License. 13 * 14 * When distributing Covered Code, include this CDDL HEADER in each 15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 16 * If applicable, add the following below this CDDL HEADER, with the 17 * fields enclosed by brackets "[]" replaced with your own identifying 18 * information: Portions Copyright [yyyy] [name of copyright owner] 19 * 20 * CDDL HEADER END 21 */ 22 /* 23 * Copyright 2004 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 #include "mpd_defs.h" 30 #include "mpd_tables.h" 31 32 int debug = 0; /* Debug flag */ 33 static int pollfd_num = 0; /* Num. 
of poll descriptors */ 34 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 35 36 /* All times below in ms */ 37 int user_failure_detection_time; /* user specified failure detection */ 38 /* time (fdt) */ 39 int user_probe_interval; /* derived from user specified fdt */ 40 41 static int rtsock_v4; /* AF_INET routing socket */ 42 static int rtsock_v6; /* AF_INET6 routing socket */ 43 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 44 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 45 static int lsock_v4; /* Listen socket to detect mpathd */ 46 static int lsock_v6; /* Listen socket to detect mpathd */ 47 static int mibfd = -1; /* fd to get mib info */ 48 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 49 50 boolean_t full_scan_required = _B_FALSE; 51 static uint_t last_initifs_time; /* Time when initifs was last run */ 52 static char **argv0; /* Saved for re-exec on SIGHUP */ 53 boolean_t handle_link_notifications = _B_TRUE; 54 55 static void initlog(void); 56 static void run_timeouts(void); 57 static void initifs(void); 58 static void check_if_removed(struct phyint_instance *pii); 59 static void select_test_ifs(void); 60 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 61 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 62 static void router_add_v4(mib2_ipRouteEntry_t *rp1, 63 struct in_addr nexthop_v4); 64 static void router_add_v6(mib2_ipv6RouteEntry_t *rp1, 65 struct in6_addr nexthop_v6); 66 static void router_add_common(int af, char *ifname, 67 struct in6_addr nexthop); 68 static void init_router_targets(); 69 static void cleanup(void); 70 static int setup_listener(int af); 71 static void check_config(void); 72 static void check_addr_unique(int af, char *name); 73 static void init_host_targets(void); 74 static void dup_host_targets(struct phyint_instance *desired_pii); 75 static void loopback_cmd(int sock, int family); 76 static int poll_remove(int fd); 77 static boolean_t 
daemonize(void); 78 static int closefunc(void *, int); 79 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 80 static unsigned int process_query(int fd, mi_query_t *miq); 81 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 82 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 83 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 84 static unsigned int send_result(int fd, unsigned int error, int syserror); 85 86 /* 87 * Return the current time in milliseconds (from an arbitrary reference) 88 * truncated to fit into an int. Truncation is ok since we are interested 89 * only in differences and not the absolute values. 90 */ 91 uint_t 92 getcurrenttime(void) 93 { 94 uint_t cur_time; /* In ms */ 95 96 /* 97 * Use of a non-user-adjustable source of time is 98 * required. However millisecond precision is sufficient. 99 * divide by 10^6 100 */ 101 cur_time = (uint_t)(gethrtime() / 1000000LL); 102 return (cur_time); 103 } 104 105 /* 106 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 107 */ 108 int 109 poll_add(int fd) 110 { 111 int i; 112 int new_num; 113 struct pollfd *newfds; 114 retry: 115 /* Check if already present */ 116 for (i = 0; i < pollfd_num; i++) { 117 if (pollfds[i].fd == fd) 118 return (0); 119 } 120 /* Check for empty spot already present */ 121 for (i = 0; i < pollfd_num; i++) { 122 if (pollfds[i].fd == -1) { 123 pollfds[i].fd = fd; 124 return (0); 125 } 126 } 127 128 /* Allocate space for 32 more fds and initialize to -1 */ 129 new_num = pollfd_num + 32; 130 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 131 if (newfds == NULL) { 132 logperror("poll_add: realloc"); 133 return (-1); 134 } 135 for (i = pollfd_num; i < new_num; i++) { 136 newfds[i].fd = -1; 137 newfds[i].events = POLLIN; 138 } 139 pollfd_num = new_num; 140 pollfds = newfds; 141 goto retry; 142 } 143 144 /* 145 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 
 */
static int
poll_remove(int fd)
{
	int i;

	/* Find the slot holding fd and mark it free (-1) for later reuse */
	for (i = 0; i < pollfd_num; i++) {
		if (pollfds[i].fd == fd) {
			pollfds[i].fd = -1;
			return (0);
		}
	}
	/* fd was not in the poll set */
	return (-1);
}

/*
 * Extract information about the phyint instance. If the phyint instance still
 * exists in the kernel then set pii_in_use, else clear it. check_if_removed()
 * will use it to detect phyint instances that don't exist any longer and
 * remove them, from our database of phyint instances.
 *
 * On return *pii_p points to the instance that was looked up or (re)created;
 * it is NULL if no instance could be obtained.
 *
 * Return value:
 *	returns true if the phyint instance exists in the kernel,
 *	returns false otherwise
 */
static boolean_t
pii_process(int af, char *name, struct phyint_instance **pii_p)
{
	int err;
	struct phyint_instance *pii;
	struct phyint_instance *pii_other;

	if (debug & D_PHYINT)
		logdebug("pii_process(%s %s)\n", AF_STR(af), name);

	pii = phyint_inst_lookup(af, name);
	if (pii == NULL) {
		/*
		 * Phyint instance does not exist in our tables,
		 * create new phyint instance
		 */
		pii = phyint_inst_init_from_k(af, name);
	} else {
		/* Phyint exists in our tables */
		err = phyint_inst_update_from_k(pii);

		switch (err) {
		case PI_IOCTL_ERROR:
			/* Some ioctl error. don't change anything */
			pii->pii_in_use = 1;
			break;

		case PI_GROUP_CHANGED:
			/*
			 * The phyint has changed group.
			 */
			restore_phyint(pii->pii_phyint);
			/* FALLTHRU */

		case PI_IFINDEX_CHANGED:
			/*
			 * Interface index has changed. Delete and
			 * recreate the phyint as it is quite likely
			 * the interface has been unplumbed and replumbed.
			 * Both address-family instances are deleted before
			 * the instance for 'af' is recreated.
			 */
			pii_other = phyint_inst_other(pii);
			if (pii_other != NULL)
				phyint_inst_delete(pii_other);
			phyint_inst_delete(pii);
			pii = phyint_inst_init_from_k(af, name);
			break;

		case PI_DELETED:
			/* Phyint instance has disappeared from kernel */
			pii->pii_in_use = 0;
			break;

		case PI_OK:
			/* Phyint instance exists and is fine */
			pii->pii_in_use = 1;
			break;

		default:
			/* Unknown status; pii_in_use is left untouched */
			logerr("pii_process: Unknown status %d\n", err);
			break;
		}
	}

	*pii_p = pii;
	if (pii != NULL)
		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
	else
		return (_B_FALSE);
}

/*
 * This phyint is leaving the group. Try to restore the phyint to its
 * initial state. Return the addresses that belong to other group members,
 * to the group, and take back any addresses owned by this phyint.
 * Nothing is done for a phyint in the anonymous group.
 */
void
restore_phyint(struct phyint *pi)
{
	if (pi->pi_group == phyint_anongroup)
		return;

	/*
	 * Move everything to some other member in the group.
	 * The phyint has changed group in the kernel. But we
	 * have yet to do it in our tables.
	 */
	if (!pi->pi_empty)
		(void) try_failover(pi, FAILOVER_TO_ANY);
	/*
	 * Move all addresses owned by 'pi' back to pi, from each
	 * of the other members of the group
	 */
	(void) try_failback(pi, _B_FALSE);
}

/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces.
 *
 * Note that the in_use marks are cleared before the kernel is queried; if
 * either ioctl or the buffer allocation fails, the function returns early
 * without running the removal pass, test address selection or the
 * failover/failback retries below.
 */
static void
initifs()
{
	int	n;
	int	af;
	char	*cp;
	char	*buf;
	int	numifs;
	struct lifnum	lifn;
	struct lifconf	lifc;
	struct lifreq	*lifr;
	struct logint	*li;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	char	pi_name[LIFNAMSIZ + 1];
	boolean_t	exists;
	struct phyint	*pi;

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	/* Recorded so run_timeouts() can tell when the next scan is due */
	last_initifs_time = getcurrenttime();

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/* Ask the kernel how many logical interfaces exist (all families) */
	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = 0;
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface numbers)");
		return;
	}
	numifs = lifn.lifn_count;

	/* One struct lifreq per interface reported above */
	buf = (char *)calloc(numifs, sizeof (struct lifreq));
	if (buf == NULL) {
		logperror("initifs: calloc");
		return;
	}

	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = 0;
	lifc.lifc_len = numifs * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		/*
		 * EINVAL is commonly encountered, when things change
		 * underneath us rapidly, (eg. at boot, when new interfaces
		 * are plumbed successively) and the kernel finds the buffer
		 * size we passed as too small. We will retry again
		 * when we see the next routing socket msg, or at worst after
		 * IF_SCAN_INTERVAL ms.
		 */
		if (errno != EINVAL) {
			logperror("initifs: ioctl"
			    " (get interface configuration)");
		}
		free(buf);
		return;
	}

	lifr = (struct lifreq *)lifc.lifc_req;

	/*
	 * For each lifreq returned by SIOCGLIFCONF, call pii_process()
	 * and get the state of the corresponding phyint_instance. If it is
	 * successful, then call logint_init_from_k() to get the state of the
	 * logint.
	 */
	for (n = lifc.lifc_len / sizeof (struct lifreq); n > 0; n--, lifr++) {
		af = lifr->lifr_addr.ss_family;

		/*
		 * Need to pass a phyint name to pii_process. Insert the
		 * null where the ':' IF_SEPARATOR is found in the logical
		 * name.
		 */
		(void) strncpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		pi_name[sizeof (pi_name) - 1] = '\0';
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		exists = pii_process(af, pi_name, &pii);
		if (exists) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
		}
		/* Uniqueness is checked using the full logical name */
		check_addr_unique(af, lifr->lifr_name);
	}

	free(buf);

	/*
	 * If the test address is now unique, and if it was not unique
	 * previously, clear the li_dupaddrmsg_printed flag and log a
	 * recovery message
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		/* NOTE: this 'li' shadows the function-scope 'li' above */
		struct logint *li;
		char abuf[INET6_ADDRSTRLEN];

		li = pii->pii_probe_logint;
		if ((li != NULL) && !li->li_dupaddr &&
		    li->li_dupaddrmsg_printed) {
			logerr("Test address %s is unique; enabling probe-"
			    "based failure detection\n",
			    pr_addr(pii->pii_af, li->li_addr, abuf,
			    sizeof (abuf)));
			li->li_dupaddrmsg_printed = 0;
		}
	}

	/*
	 * Scan for phyints and logints that have disappeared from the
	 * kernel, and delete them. The next pointer is saved before the
	 * call since check_if_removed() may remove 'pii' from the list.
	 */
	pii = phyint_instances;

	while (pii != NULL) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
		pii = next_pii;
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications from the NICs.
	 */
	process_link_state_changes();

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		/*
		 * If this is a case of group failure, we don't have much
		 * to do until the group recovers again.
		 */
		if (GROUP_FAILED(pi->pi_group))
			continue;

		/*
		 * Try/Retry any pending failovers / failbacks, that did
		 * not complete, or that could not be initiated previously.
		 * This implements the 3 invariants described in the big block
		 * comment at the beginning of probe.c
		 */
		if (pi->pi_flags & IFF_INACTIVE) {
			if (!pi->pi_empty)
				(void) try_failover(pi, FAILOVER_TO_NONSTANDBY);
		} else {
			/* NOTE: this 'pii' shadows the function-scope 'pii' */
			struct phyint_instance *pii;

			/*
			 * Prefer the IPv4 instance; fall back to IPv6 when
			 * the link is up but IPv4 is not probe capable. If
			 * neither is probe capable while the link is up,
			 * there is nothing to decide for this phyint.
			 */
			pii = pi->pi_v4;
			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
				pii = pi->pi_v6;
			if (LINK_UP(pi) && !PROBE_CAPABLE(pii))
				continue;
			/*
			 * It is possible that the phyint has started
			 * receiving packets, after it has been marked
			 * PI_FAILED. Don't initiate failover, if the
			 * phyint has started recovering. failure_state()
			 * captures this check. A similar logic is used
			 * for failback/repair case.
			 */
			if (pi->pi_state == PI_FAILED && !pi->pi_empty &&
			    (failure_state(pii) == PHYINT_FAILURE)) {
				(void) try_failover(pi, FAILOVER_NORMAL);
			} else if (pi->pi_state == PI_RUNNING && !pi->pi_full) {
				if (try_failback(pi, _B_FALSE) !=
				    IPMP_FAILURE) {
					(void) change_lif_flags(pi, IFF_FAILED,
					    _B_FALSE);
					/* Per state diagram */
					pi->pi_empty = 0;
				}
			}
		}
	}
}

/*
 * Check that test/probe addresses are always unique. link-locals and
 * ptp unnumbered may not be unique, and bind to such an (IFF_NOFAILOVER)
 * address can produce unexpected results. Log an error and alert the user.
 *
 * 'name' is a logical interface name; its address is fetched from the
 * kernel and compared against the probe address of every known phyint
 * instance of family 'af'.
 */
static void
check_addr_unique(int af, char *name)
{
	struct lifreq	lifr;
	struct phyint	*pi;
	struct in6_addr	addr;
	struct phyint_instance	*pii;
	struct sockaddr_in	*sin;
	struct sockaddr_in6	*sin6;
	int	ifsock;
	char	abuf[INET6_ADDRSTRLEN];

	/* Get the socket for doing ioctls */
	ifsock = (af == AF_INET) ? ifsock_v4 : ifsock_v6;

	(void) strncpy(lifr.lifr_name, name, sizeof (lifr.lifr_name));
	lifr.lifr_name[sizeof (lifr.lifr_name) - 1] = '\0';
	/*
	 * Get the address corresponding to 'name'. We cannot
	 * do a logint lookup in our tables, because, not all logints
	 * in the system are tracked by mpathd. (eg. things not in a group)
	 */
	if (ioctl(ifsock, SIOCGLIFADDR, (char *)&lifr) < 0) {
		if (errno == ENXIO) {
			/* Interface has vanished */
			return;
		} else {
			logperror("ioctl (get addr)");
			return;
		}
	}

	/* Normalize to an in6_addr; IPv4 addresses become v4-mapped */
	if (af == AF_INET) {
		sin = (struct sockaddr_in *)&lifr.lifr_addr;
		IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr);
	} else {
		sin6 = (struct sockaddr_in6 *)&lifr.lifr_addr;
		addr = sin6->sin6_addr;
	}

	/*
	 * Does the address 'addr' match any known test address ? If so
	 * it is a duplicate, unless we are looking at the same logint
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		pii = PHYINT_INSTANCE(pi, af);
		if (pii == NULL || pii->pii_probe_logint == NULL)
			continue;

		if (!IN6_ARE_ADDR_EQUAL(&addr,
		    &pii->pii_probe_logint->li_addr)) {
			continue;
		}

		/* Same logint name: we are comparing the address to itself */
		if (strncmp(pii->pii_probe_logint->li_name, name,
		    sizeof (pii->pii_probe_logint->li_name)) == 0) {
			continue;
		}

		/*
		 * This test address is not unique. Set the dupaddr bit
		 */
		pii->pii_probe_logint->li_dupaddr = 1;

		/*
		 * Log an error message if not already logged
		 */
		if (pii->pii_probe_logint->li_dupaddrmsg_printed)
			continue;

		logerr("Test address %s is not unique; disabling "
		    "probe-based failure detection\n",
		    pr_addr(af, addr, abuf, sizeof (abuf)));

		pii->pii_probe_logint->li_dupaddrmsg_printed = 1;
	}
}

/*
 * The pii_probe_logint used for probing, must satisfy the following properties
 * with respect to its li_flags.
554 * IFF_NOFAILOVER - must be set (except in singleton group case) 555 * IFF_UP - must be set 556 * IFF_NOXMIT - must be clear 557 * IFF_NOLOCAL - must be clear 558 * IFF_DEPRECATED - preferably set (for IPv4) 559 */ 560 #define BEST_FLAG_SET (IFF_NOFAILOVER | IFF_UP | IFF_DEPRECATED) 561 #define CLEAR_FLAG_SET (IFF_NOXMIT | IFF_NOLOCAL) 562 #define TEST_CLEAR_FLAG_SET CLEAR_FLAG_SET 563 #define TEST_MINIMAL_FLAG_SET (IFF_UP | CLEAR_FLAG_SET) 564 #define TEST_BEST_FLAG_SET (BEST_FLAG_SET | CLEAR_FLAG_SET) 565 566 /* 567 * Stop probing an interface. Called when an interface is offlined. 568 * The probe socket is closed on each interface instance, and the 569 * interface state set to PI_OFFLINE. 570 */ 571 static void 572 stop_probing(struct phyint *pi) 573 { 574 struct phyint_instance *pii; 575 576 pii = pi->pi_v4; 577 if (pii != NULL) { 578 if (pii->pii_probe_sock != -1) 579 close_probe_socket(pii, _B_TRUE); 580 pii->pii_probe_logint = NULL; 581 } 582 583 pii = pi->pi_v6; 584 if (pii != NULL) { 585 if (pii->pii_probe_sock != -1) 586 close_probe_socket(pii, _B_TRUE); 587 pii->pii_probe_logint = NULL; 588 } 589 590 phyint_chstate(pi, PI_OFFLINE); 591 } 592 593 /* 594 * Do the test address selection for each phyint instance. Pick an 595 * IFF_NOFAILOVER address as test address. For singleton case, 596 * if user didn't configure an IFF_NOFAILOVER address, we will pick a 597 * normal address as test address. For (multiple adapter) groups, 598 * user is required to configure IFF_NOFAILOVER test address. Call 599 * phyint_inst_sockinit() to complete the initializations. 
 */
static void
select_test_ifs(void)
{
	struct phyint		*pi;
	struct phyint_instance	*pii;
	struct phyint_instance	*next_pii;
	struct logint		*li;
	struct logint		*test_logint;
	boolean_t		target_scan_reqd = _B_FALSE;
	struct target		*tg;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection.
	 * next_pii is saved up front because the body may delete 'pii'
	 * (see the phyint_inst_delete() call below).
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		/*
		 * An interface that is offline, should not be probed.
		 * Offline interfaces should always be in PI_OFFLINE state,
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		}

		test_logint = pii->pii_probe_logint;

		if (test_logint != NULL) {
			/*
			 * The current test address already has all of the
			 * preferred flags set and none of the forbidden
			 * ones; nothing better can be found.
			 */
			if ((test_logint->li_flags & TEST_BEST_FLAG_SET)
			    == BEST_FLAG_SET)
				continue;

			/*
			 * If user configures IFF_NOXMIT or IFF_NOLOCAL
			 * flags on test addresses after in.mpathd
			 * has started, the daemon aborts. In future
			 * this can be better handling, i.e. instead
			 * of abort the daemon, a more appropriate
			 * action may be issuing a warning and choose
			 * a different test address.
			 */
			assert((test_logint->li_flags & TEST_CLEAR_FLAG_SET)
			    == 0);
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			if ((li->li_flags & TEST_MINIMAL_FLAG_SET) == IFF_UP) {
				/*
				 * Now we have a testaddress, that satisfies
				 * the minimal properties.
				 */
				if ((li->li_flags & TEST_BEST_FLAG_SET)
				    == BEST_FLAG_SET) {
					/*
					 * This is the best possible address.
					 * So break, and continue to the
					 * next phyint
					 */
					test_logint = li;
					break;
				}
				if ((test_logint == NULL) ||
				    (!(test_logint->li_flags &
				    IFF_NOFAILOVER) &&
				    (li->li_flags & IFF_NOFAILOVER)))
					/*
					 * This is a possible candidate,
					 * unless we find a better one.
					 */
					test_logint = li;
			}
		}

		/*
		 * If we've gone from a singleton group to a multiple adapter
		 * group, and we haven't found an IFF_NOFAILOVER test address
		 * by now, the old test address is no longer valid. If we are
		 * not dealing with a singleton group, and the above test
		 * address selection loop has selected a non IFF_NOFAILOVER
		 * address as a candidate, we will correct that here.
		 */
		if ((test_logint != NULL) &&
		    !SINGLETON_GROUP(pii->pii_phyint) &&
		    !(test_logint->li_flags & IFF_NOFAILOVER)) {
			test_logint = NULL;
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (test_logint == NULL) {
			/*
			 * We don't have a test address. Don't print an
			 * error message immediately. check_config() will
			 * take care of it. Zero out the probe stats array
			 * since it is no longer relevant. Optimize by
			 * checking if it is already zeroed out.
			 */
			int	pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (test_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket. If that fails the
		 * instance is deleted and we move on.
		 */
		pii->pii_probe_logint = test_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * On a point-to-point interface the only target is the
		 * destination address of the test logint: drop any existing
		 * target and create one for li_dstaddr.
		 */
		if (pii->pii_phyint->pi_flags & IFF_POINTOPOINT) {
			tg = pii->pii_targets;
			if (tg != NULL)
				target_delete(tg);
			assert(pii->pii_targets == NULL);
			assert(pii->pii_target_next == NULL);
			assert(pii->pii_ntargets == 0);
			target_create(pii, test_logint->li_dstaddr,
			    _B_TRUE);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && pii->pii_probe_sock != -1) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Check the interface list for any interfaces that are marked
	 * PI_FAILED but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link now indicates that the
	 * interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_state == PI_FAILED &&
		    !PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) {
			phyint_check_for_repair(pi);
		}
	}

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}

/*
 * Check phyint group configuration, to detect any inconsistencies,
 * and log an error message. This is called from run_timeouts() right
 * after each periodic initifs() scan. The error message is displayed
 * once. If the inconsistency is resolved by the admin, a recovery
 * message is displayed once.
 */
static void
check_config(void)
{
	struct phyint_group	*pg;
	struct phyint		*pi;
	boolean_t		v4_in_group;
	boolean_t		v6_in_group;

	/*
	 * All phyints of a group must be homogeneous to ensure that
	 * failover or failback can be done. If any phyint in a group
	 * has IPv4 plumbed, check that all phyints have IPv4 plumbed.
	 * Do a similar check for IPv6.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		if (pg == phyint_anongroup)
			continue;

		v4_in_group = _B_FALSE;
		v6_in_group = _B_FALSE;
		/*
		 * 1st pass. Determine if at least 1 phyint in the group
		 * has IPv4 plumbed and if so set v4_in_group to true.
		 * Repeat similarly for IPv6.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_v4 != NULL)
				v4_in_group = _B_TRUE;
			if (pi->pi_v6 != NULL)
				v6_in_group = _B_TRUE;
		}

		/*
		 * 2nd pass. If v4_in_group is true, check that phyint
		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
		 * out a message the 1st time only. Offline phyints are
		 * not checked.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_flags & IFF_OFFLINE)
				continue;

			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv4 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else if (v6_in_group == _B_TRUE &&
			    pi->pi_v6 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("NIC %s of group %s is"
					    " not plumbed for IPv6 and may"
					    " affect failover capability\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else {
				/*
				 * The phyint matches the group configuration,
				 * if we have reached this point. If it was
				 * improperly configured earlier, log an
				 * error recovery message
				 */
				if (pi->pi_cfgmsg_printed) {
					logerr("NIC %s is now consistent with "
					    "group %s and failover capability "
					    "is restored\n", pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 0;
				}
			}

		}
	}

	/*
	 * In order to perform probe-based failure detection, a phyint must
	 * have at least 1 test/probe address for sending and receiving probes
	 * (either on IPv4 or IPv6 instance or both). If no test address has
	 * been configured, notify the administrator, but continue on since we
	 * can still perform load spreading, along with "link up/down" based
	 * failure detection.
	 *
	 * Note: In the singleton group case, when user didn't configure
	 * a test address, the probe address is picked by this daemon.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_flags & IFF_OFFLINE)
			continue;

		if ((pi->pi_v4 == NULL ||
		    pi->pi_v4->pii_probe_logint == NULL) &&
		    (pi->pi_v6 == NULL ||
		    pi->pi_v6->pii_probe_logint == NULL)) {
			if (!pi->pi_taddrmsg_printed) {
				logerr("No test address configured on "
				    "interface %s; disabling probe-based "
				    "failure detection on it\n", pi->pi_name);
				pi->pi_taddrmsg_printed = 1;
			}
		} else if (pi->pi_taddrmsg_printed) {
			logerr("Test address now configured on interface %s; "
			    "enabling probe-based failure detection on it\n",
			    pi->pi_name);
			pi->pi_taddrmsg_printed = 0;
		}

	}
}

/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value
 * that can be passed for the delay parameter of timer_schedule()
 */
static uint_t timer_next;	/* Currently scheduled timeout */
static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */

/*
 * Initialize the timer state and arm the first timeout.
 */
static void
timer_init(void)
{
	timer_next = getcurrenttime() + TIMER_INFINITY;
	/*
	 * The call to run_timeouts() will get the timer started
	 * Since there are no phyints at this point, the timer will
	 * be set for IF_SCAN_INTERVAL ms.
	 */
	run_timeouts();
}

/*
 * Make sure the next SIGALRM occurs delay milliseconds from the current
 * time if not earlier. We are interested only in time differences.
979 */ 980 void 981 timer_schedule(uint_t delay) 982 { 983 uint_t now; 984 struct itimerval itimerval; 985 986 if (debug & D_TIMER) 987 logdebug("timer_schedule(%u)\n", delay); 988 989 assert(delay <= TIMER_INFINITY); 990 991 now = getcurrenttime(); 992 if (delay == 0) { 993 /* Minimum allowed delay */ 994 delay = 1; 995 } 996 /* Will this timer occur before the currently scheduled SIGALRM? */ 997 if (timer_active && TIME_GE(now + delay, timer_next)) { 998 if (debug & D_TIMER) { 999 logdebug("timer_schedule(%u) - no action: " 1000 "now %u next %u\n", delay, now, timer_next); 1001 } 1002 return; 1003 } 1004 timer_next = now + delay; 1005 1006 itimerval.it_value.tv_sec = delay / 1000; 1007 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 1008 itimerval.it_interval.tv_sec = 0; 1009 itimerval.it_interval.tv_usec = 0; 1010 if (debug & D_TIMER) { 1011 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1012 delay, itimerval.it_value.tv_sec, 1013 itimerval.it_value.tv_usec); 1014 } 1015 timer_active = _B_TRUE; 1016 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1017 logperror("timer_schedule: setitimer"); 1018 exit(2); 1019 } 1020 } 1021 1022 /* 1023 * Timer has fired. Determine when the next timer event will occur by asking 1024 * all the timer routines. Should not be called from a timer routine. 1025 */ 1026 static void 1027 run_timeouts(void) 1028 { 1029 uint_t next; 1030 uint_t next_event_time; 1031 struct phyint_instance *pii; 1032 struct phyint_instance *next_pii; 1033 static boolean_t timeout_running; 1034 1035 /* assert that recursive timeouts don't happen. 
*/ 1036 assert(!timeout_running); 1037 1038 timeout_running = _B_TRUE; 1039 1040 if (debug & D_TIMER) 1041 logdebug("run_timeouts()\n"); 1042 1043 next = TIMER_INFINITY; 1044 1045 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1046 next_pii = pii->pii_next; 1047 next_event_time = phyint_inst_timer(pii); 1048 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1049 next = next_event_time; 1050 1051 if (debug & D_TIMER) { 1052 logdebug("run_timeouts(%s %s): next scheduled for" 1053 " this phyint inst %u, next scheduled global" 1054 " %u ms\n", 1055 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1056 next_event_time, next); 1057 } 1058 } 1059 1060 /* 1061 * Make sure initifs() is called at least once every 1062 * IF_SCAN_INTERVAL, to make sure that we are in sync 1063 * with the kernel, in case we have missed any routing 1064 * socket messages. 1065 */ 1066 if (next > IF_SCAN_INTERVAL) 1067 next = IF_SCAN_INTERVAL; 1068 1069 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1070 initifs(); 1071 check_config(); 1072 } 1073 1074 if (debug & D_TIMER) 1075 logdebug("run_timeouts: %u ms\n", next); 1076 1077 timer_schedule(next); 1078 timeout_running = _B_FALSE; 1079 } 1080 1081 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1082 static int eventpipe_write = -1; 1083 static boolean_t cleanup_started = _B_FALSE; 1084 /* Don't write to eventpipe if in cleanup */ 1085 /* 1086 * Ensure that signals are processed synchronously with the rest of 1087 * the code by just writing a one character signal number on the pipe. 1088 * The poll loop will pick this up and process the signal event. 1089 */ 1090 static void 1091 sig_handler(int signo) 1092 { 1093 uchar_t buf = (uchar_t)signo; 1094 1095 /* 1096 * Don't write to pipe if cleanup has already begun. 
cleanup() 1097 * might have closed the pipe already 1098 */ 1099 if (cleanup_started) 1100 return; 1101 1102 if (eventpipe_write == -1) { 1103 logerr("sig_handler: no pipe found\n"); 1104 return; 1105 } 1106 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1107 logperror("sig_handler: write"); 1108 } 1109 1110 extern struct probes_missed probes_missed; 1111 1112 /* 1113 * Pick up a signal "byte" from the pipe and process it. 1114 */ 1115 static void 1116 in_signal(int fd) 1117 { 1118 uchar_t buf; 1119 uint64_t sent, acked, lost, unacked, unknown; 1120 struct phyint_instance *pii; 1121 int pr_ndx; 1122 1123 switch (read(fd, &buf, sizeof (buf))) { 1124 case -1: 1125 logperror("in_signal: read"); 1126 exit(1); 1127 /* NOTREACHED */ 1128 case 1: 1129 break; 1130 case 0: 1131 logerr("in_signal: read end of file\n"); 1132 exit(1); 1133 /* NOTREACHED */ 1134 default: 1135 logerr("in_signal: read > 1\n"); 1136 exit(1); 1137 } 1138 1139 if (debug & D_TIMER) 1140 logdebug("in_signal() got %d\n", buf); 1141 1142 switch (buf) { 1143 case SIGALRM: 1144 if (debug & D_TIMER) { 1145 uint_t now = getcurrenttime(); 1146 1147 logdebug("in_signal(SIGALRM) delta %u\n", 1148 now - timer_next); 1149 } 1150 timer_active = _B_FALSE; 1151 run_timeouts(); 1152 break; 1153 case SIGUSR1: 1154 logdebug("Printing configuration:\n"); 1155 /* Print out the internal tables */ 1156 phyint_inst_print_all(); 1157 1158 /* 1159 * Print out the accumulated statistics about missed 1160 * probes (happens due to scheduling delay). 1161 */ 1162 logerr("Missed sending total of %d probes spread over" 1163 " %d occurrences\n", probes_missed.pm_nprobes, 1164 probes_missed.pm_ntimes); 1165 1166 /* 1167 * Print out the accumulated statistics about probes 1168 * that were sent. 
1169 */ 1170 for (pii = phyint_instances; pii != NULL; 1171 pii = pii->pii_next) { 1172 unacked = 0; 1173 acked = pii->pii_cum_stats.acked; 1174 lost = pii->pii_cum_stats.lost; 1175 sent = pii->pii_cum_stats.sent; 1176 unknown = pii->pii_cum_stats.unknown; 1177 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1178 switch (pii->pii_probes[pr_ndx].pr_status) { 1179 case PR_ACKED: 1180 acked++; 1181 break; 1182 case PR_LOST: 1183 lost++; 1184 break; 1185 case PR_UNACKED: 1186 unacked++; 1187 break; 1188 } 1189 } 1190 logerr("\nProbe stats on (%s %s)\n" 1191 "Number of probes sent %lld\n" 1192 "Number of probe acks received %lld\n" 1193 "Number of probes/acks lost %lld\n" 1194 "Number of valid unacknowled probes %lld\n" 1195 "Number of ambiguous probe acks received %lld\n", 1196 AF_STR(pii->pii_af), pii->pii_name, 1197 sent, acked, lost, unacked, unknown); 1198 } 1199 break; 1200 case SIGHUP: 1201 logerr("SIGHUP: restart and reread config file\n"); 1202 cleanup(); 1203 (void) execv(argv0[0], argv0); 1204 _exit(0177); 1205 /* NOTREACHED */ 1206 case SIGINT: 1207 case SIGTERM: 1208 case SIGQUIT: 1209 cleanup(); 1210 exit(0); 1211 /* NOTREACHED */ 1212 default: 1213 logerr("in_signal: unknown signal: %d\n", buf); 1214 } 1215 } 1216 1217 static void 1218 cleanup(void) 1219 { 1220 struct phyint_instance *pii; 1221 struct phyint_instance *next_pii; 1222 1223 /* 1224 * Make sure that we don't write to eventpipe in 1225 * sig_handler() if any signal notably SIGALRM, 1226 * occurs after we close the eventpipe descriptor below 1227 */ 1228 cleanup_started = _B_TRUE; 1229 1230 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1231 next_pii = pii->pii_next; 1232 phyint_inst_delete(pii); 1233 } 1234 1235 (void) close(ifsock_v4); 1236 (void) close(ifsock_v6); 1237 (void) close(rtsock_v4); 1238 (void) close(rtsock_v6); 1239 (void) close(lsock_v4); 1240 (void) close(lsock_v6); 1241 (void) close(0); 1242 (void) close(1); 1243 (void) close(2); 1244 (void) close(mibfd); 
1245 (void) close(eventpipe_read); 1246 (void) close(eventpipe_write); 1247 } 1248 1249 /* 1250 * Create pipe for signal delivery and set up signal handlers. 1251 */ 1252 static void 1253 setup_eventpipe(void) 1254 { 1255 int fds[2]; 1256 struct sigaction act; 1257 1258 if ((pipe(fds)) < 0) { 1259 logperror("setup_eventpipe: pipe"); 1260 exit(1); 1261 } 1262 eventpipe_read = fds[0]; 1263 eventpipe_write = fds[1]; 1264 if (poll_add(eventpipe_read) == -1) { 1265 exit(1); 1266 } 1267 1268 act.sa_handler = sig_handler; 1269 act.sa_flags = SA_RESTART; 1270 (void) sigaction(SIGALRM, &act, NULL); 1271 1272 (void) sigset(SIGHUP, sig_handler); 1273 (void) sigset(SIGUSR1, sig_handler); 1274 (void) sigset(SIGTERM, sig_handler); 1275 (void) sigset(SIGINT, sig_handler); 1276 (void) sigset(SIGQUIT, sig_handler); 1277 } 1278 1279 /* 1280 * Create a routing socket for receiving RTM_IFINFO messages. 1281 */ 1282 static int 1283 setup_rtsock(int af) 1284 { 1285 int s; 1286 int flags; 1287 1288 s = socket(PF_ROUTE, SOCK_RAW, af); 1289 if (s == -1) { 1290 logperror("setup_rtsock: socket PF_ROUTE"); 1291 exit(1); 1292 } 1293 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1294 logperror("setup_rtsock: fcntl F_GETFL"); 1295 (void) close(s); 1296 exit(1); 1297 } 1298 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1299 logperror("setup_rtsock: fcntl F_SETFL"); 1300 (void) close(s); 1301 exit(1); 1302 } 1303 if (poll_add(s) == -1) { 1304 (void) close(s); 1305 exit(1); 1306 } 1307 return (s); 1308 } 1309 1310 /* 1311 * Process an RTM_IFINFO message received on a routing socket. 1312 * The return value indicates whether a full interface scan is required. 1313 * Link up/down notifications from the NICs are reflected in the 1314 * IFF_RUNNING flag. 1315 * If just the state of the IFF_RUNNING interface flag has changed, a 1316 * a full interface scan isn't required. 
1317 */ 1318 static boolean_t 1319 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1320 { 1321 struct sockaddr_dl *sdl; 1322 struct phyint *pi; 1323 uint64_t old_flags; 1324 struct phyint_instance *pii; 1325 1326 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1327 1328 /* 1329 * Although the sockaddr_dl structure is directly after the 1330 * if_msghdr_t structure. At the time of writing, the size of the 1331 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1332 * to the presence of a timeval structure, which contains longs, 1333 * in the if_data structure. Anyway, we know where the message ends, 1334 * so we work backwards to get the start of the sockaddr_dl structure. 1335 */ 1336 /*LINTED*/ 1337 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1338 sizeof (struct sockaddr_dl)); 1339 1340 assert(sdl->sdl_family == AF_LINK); 1341 1342 /* 1343 * The interface name is in sdl_data. 1344 * RTM_IFINFO messages are only generated for logical interface 1345 * zero, so there is no colon and logical interface number to 1346 * strip from the name. The name is not null terminated, but 1347 * there should be enough space in sdl_data to add the null. 1348 */ 1349 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1350 if (debug & D_LINKNOTE) 1351 logdebug("process_rtm_ifinfo: " 1352 "phyint name too long\n"); 1353 return (_B_TRUE); 1354 } 1355 sdl->sdl_data[sdl->sdl_nlen] = 0; 1356 1357 pi = phyint_lookup(sdl->sdl_data); 1358 if (pi == NULL) { 1359 if (debug & D_LINKNOTE) 1360 logdebug("process_rtm_ifinfo: phyint lookup failed" 1361 " for %s\n", sdl->sdl_data); 1362 return (_B_TRUE); 1363 } 1364 1365 /* 1366 * We want to try and avoid doing a full interface scan for 1367 * link state notifications from the NICs, as indicated 1368 * by the state of the IFF_RUNNING flag. If just the 1369 * IFF_RUNNING flag has changed state, the link state changes 1370 * are processed without a full scan. 
1371 * If there is both an IPv4 and IPv6 instance associated with 1372 * the physical interface, we will get an RTM_IFINFO message 1373 * for each instance. If we just maintained a single copy of 1374 * the physical interface flags, it would appear that no flags 1375 * had changed when the second message is processed, leading us 1376 * to believe that the message wasn't generated by a flags change, 1377 * and that a full interface scan is required. 1378 * To get around this problem, two additional copies of the flags 1379 * are kept, one copy for each instance. These are only used in 1380 * this routine. At any one time, all three copies of the flags 1381 * should be identical except for the IFF_RUNNING flag. The 1382 * copy of the flags in the "phyint" structure is always up to 1383 * date. 1384 */ 1385 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1386 if (pii == NULL) { 1387 if (debug & D_LINKNOTE) 1388 logdebug("process_rtm_ifinfo: no instance of address " 1389 "family %s for %s\n", AF_STR(type), pi->pi_name); 1390 return (_B_TRUE); 1391 } 1392 1393 old_flags = pii->pii_flags; 1394 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1395 pi->pi_flags = pii->pii_flags; 1396 1397 if (debug & D_LINKNOTE) { 1398 logdebug("process_rtm_ifinfo: %s address family: %s, " 1399 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1400 AF_STR(type), old_flags, pi->pi_flags); 1401 } 1402 1403 /* 1404 * If IFF_STANDBY has changed, indicate that the interface has changed 1405 * types. 1406 */ 1407 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1408 phyint_newtype(pi); 1409 1410 /* 1411 * If IFF_INACTIVE has been set, then no data addresses should be 1412 * hosted on the interface. If IFF_INACTIVE has been cleared, then 1413 * move previously failed-over addresses back to it, provided it is 1414 * not failed. For details, see the state diagram in mpd_probe.c. 
1415 */ 1416 if ((old_flags ^ pii->pii_flags) & IFF_INACTIVE) { 1417 if (pii->pii_flags & IFF_INACTIVE) { 1418 assert(pii->pii_flags & IFF_STANDBY); 1419 if (!pi->pi_empty) { 1420 (void) try_failover(pi, FAILOVER_TO_NONSTANDBY); 1421 } 1422 } else { 1423 if (pi->pi_state == PI_RUNNING && !pi->pi_full) { 1424 pi->pi_empty = 0; 1425 (void) try_failback(pi, _B_FALSE); 1426 } 1427 } 1428 } 1429 1430 /* Has just the IFF_RUNNING flag changed state ? */ 1431 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1432 struct phyint_instance *pii_other; 1433 /* 1434 * It wasn't just a link state change. Update 1435 * the other instance's copy of the flags. 1436 */ 1437 pii_other = phyint_inst_other(pii); 1438 if (pii_other != NULL) 1439 pii_other->pii_flags = pii->pii_flags; 1440 return (_B_TRUE); 1441 } 1442 1443 return (_B_FALSE); 1444 } 1445 1446 /* 1447 * Retrieve as many routing socket messages as possible, and try to 1448 * empty the routing sockets. Initiate full scan of targets or interfaces 1449 * as needed. 1450 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1451 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1452 */ 1453 static void 1454 process_rtsock(int rtsock_v4, int rtsock_v6) 1455 { 1456 int nbytes; 1457 int64_t msg[2048 / 8]; 1458 struct rt_msghdr *rtm; 1459 boolean_t need_if_scan = _B_FALSE; 1460 boolean_t need_rt_scan = _B_FALSE; 1461 boolean_t rtm_ifinfo_seen = _B_FALSE; 1462 int type; 1463 1464 /* Read as many messages as possible and try to empty the sockets */ 1465 for (type = AF_INET; ; type = AF_INET6) { 1466 for (;;) { 1467 nbytes = read((type == AF_INET) ? 
rtsock_v4 : 1468 rtsock_v6, msg, sizeof (msg)); 1469 if (nbytes <= 0) { 1470 /* No more messages */ 1471 break; 1472 } 1473 rtm = (struct rt_msghdr *)msg; 1474 if (rtm->rtm_version != RTM_VERSION) { 1475 logerr("process_rtsock: version %d " 1476 "not understood\n", rtm->rtm_version); 1477 break; 1478 } 1479 1480 if (debug & D_PHYINT) { 1481 logdebug("process_rtsock: message %d\n", 1482 rtm->rtm_type); 1483 } 1484 1485 switch (rtm->rtm_type) { 1486 case RTM_NEWADDR: 1487 case RTM_DELADDR: 1488 /* 1489 * Some logical interface has changed, 1490 * have to scan everything to determine 1491 * what actually changed. 1492 */ 1493 need_if_scan = _B_TRUE; 1494 break; 1495 1496 case RTM_IFINFO: 1497 rtm_ifinfo_seen = _B_TRUE; 1498 need_if_scan |= 1499 process_rtm_ifinfo((if_msghdr_t *)rtm, 1500 type); 1501 break; 1502 1503 case RTM_ADD: 1504 case RTM_DELETE: 1505 case RTM_CHANGE: 1506 case RTM_OLDADD: 1507 case RTM_OLDDEL: 1508 need_rt_scan = _B_TRUE; 1509 break; 1510 1511 default: 1512 /* Not interesting */ 1513 break; 1514 } 1515 } 1516 if (type == AF_INET6) 1517 break; 1518 } 1519 1520 if (need_if_scan) { 1521 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1522 logdebug("process_rtsock: synchronizing with kernel\n"); 1523 initifs(); 1524 } else if (rtm_ifinfo_seen) { 1525 if (debug & D_LINKNOTE) 1526 logdebug("process_rtsock: " 1527 "link up/down notification(s) seen\n"); 1528 process_link_state_changes(); 1529 } 1530 1531 if (need_rt_scan) 1532 init_router_targets(); 1533 } 1534 1535 /* 1536 * Look if the phyint instance or one of its logints have been removed from 1537 * the kernel and take appropriate action. 1538 * Uses {pii,li}_in_use. 1539 */ 1540 static void 1541 check_if_removed(struct phyint_instance *pii) 1542 { 1543 struct logint *li; 1544 struct logint *next_li; 1545 1546 /* Detect phyints that have been removed from the kernel. 
*/ 1547 if (!pii->pii_in_use) { 1548 logtrace("%s %s has been removed from kernel\n", 1549 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1550 phyint_inst_delete(pii); 1551 } else { 1552 /* Detect logints that have been removed. */ 1553 for (li = pii->pii_logint; li != NULL; li = next_li) { 1554 next_li = li->li_next; 1555 if (!li->li_in_use) { 1556 logint_delete(li); 1557 } 1558 } 1559 } 1560 } 1561 1562 /* 1563 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 1564 * tables defined by mib2.h. Parse the returned data and extract 1565 * the 'routing' information table. Process the 'routing' table 1566 * to get the list of known onlink routers, and update our database. 1567 * These onlink routers will serve as our probe targets. 1568 * Returns false, if any system calls resulted in errors, true otherwise. 1569 */ 1570 static boolean_t 1571 update_router_list(int fd) 1572 { 1573 union { 1574 char ubuf[1024]; 1575 union T_primitives uprim; 1576 } buf; 1577 1578 int flags; 1579 struct strbuf ctlbuf; 1580 struct strbuf databuf; 1581 struct T_optmgmt_req *tor; 1582 struct T_optmgmt_ack *toa; 1583 struct T_error_ack *tea; 1584 struct opthdr *optp; 1585 struct opthdr *req; 1586 int status; 1587 t_scalar_t prim; 1588 1589 tor = (struct T_optmgmt_req *)&buf; 1590 1591 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1592 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1593 tor->OPT_length = sizeof (struct opthdr); 1594 tor->MGMT_flags = T_CURRENT; 1595 1596 req = (struct opthdr *)&tor[1]; 1597 req->level = MIB2_IP; /* any MIB2_xxx value ok here */ 1598 req->name = 0; 1599 req->len = 0; 1600 1601 ctlbuf.buf = (char *)&buf; 1602 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1603 ctlbuf.maxlen = sizeof (buf); 1604 flags = 0; 1605 if (putmsg(fd, &ctlbuf, NULL, flags) == -1) { 1606 logperror("update_router_list: putmsg(ctl)"); 1607 return (_B_FALSE); 1608 } 1609 1610 /* 1611 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1612 * each table defined in 
mib2.h. Each T_OPTMGMT_ACK msg contains 1613 * a control and data part. The control part contains a struct 1614 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1615 * the level, name and length of the data in the data part. The 1616 * data part contains the actual table data. The last message 1617 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1618 * single option with zero optlen. 1619 */ 1620 1621 for (;;) { 1622 /* 1623 * Go around this loop once for each table. Ignore 1624 * all tables except the routing information table. 1625 */ 1626 flags = 0; 1627 status = getmsg(fd, &ctlbuf, NULL, &flags); 1628 if (status < 0) { 1629 if (errno == EINTR) 1630 continue; 1631 logperror("update_router_list: getmsg(ctl)"); 1632 return (_B_FALSE); 1633 } 1634 if (ctlbuf.len < sizeof (t_scalar_t)) { 1635 logerr("update_router_list: ctlbuf.len %d\n", 1636 ctlbuf.len); 1637 return (_B_FALSE); 1638 } 1639 1640 prim = buf.uprim.type; 1641 1642 switch (prim) { 1643 1644 case T_ERROR_ACK: 1645 tea = &buf.uprim.error_ack; 1646 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1647 logerr("update_router_list: T_ERROR_ACK" 1648 " ctlbuf.len %d\n", ctlbuf.len); 1649 return (_B_FALSE); 1650 } 1651 logerr("update_router_list: T_ERROR_ACK:" 1652 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1653 tea->TLI_error, tea->UNIX_error); 1654 return (_B_FALSE); 1655 1656 case T_OPTMGMT_ACK: 1657 toa = &buf.uprim.optmgmt_ack; 1658 optp = (struct opthdr *)&toa[1]; 1659 if (ctlbuf.len < sizeof (struct T_optmgmt_ack)) { 1660 logerr("update_router_list: ctlbuf.len %d\n", 1661 ctlbuf.len); 1662 return (_B_FALSE); 1663 } 1664 if (toa->MGMT_flags != T_SUCCESS) { 1665 logerr("update_router_list: MGMT_flags 0x%lx\n", 1666 toa->MGMT_flags); 1667 return (_B_FALSE); 1668 } 1669 break; 1670 1671 default: 1672 logerr("update_router_list: unknown primitive %ld\n", 1673 prim); 1674 return (_B_FALSE); 1675 } 1676 1677 /* Process the T_OPGMGMT_ACK below */ 1678 assert(prim == 
T_OPTMGMT_ACK); 1679 1680 switch (status) { 1681 case 0: 1682 /* 1683 * We have reached the end of this T_OPTMGMT_ACK 1684 * message. If this is the last message i.e EOD, 1685 * return, else process the next T_OPTMGMT_ACK msg. 1686 */ 1687 if ((ctlbuf.len == sizeof (struct T_optmgmt_ack) + 1688 sizeof (struct opthdr)) && optp->len == 0 && 1689 optp->name == 0 && optp->level == 0) { 1690 /* 1691 * This is the EOD message. Return 1692 */ 1693 return (_B_TRUE); 1694 } 1695 continue; 1696 1697 case MORECTL: 1698 case MORECTL | MOREDATA: 1699 /* 1700 * This should not happen. We should be able to read 1701 * the control portion in a single getmsg. 1702 */ 1703 logerr("update_router_list: MORECTL\n"); 1704 return (_B_FALSE); 1705 1706 case MOREDATA: 1707 databuf.maxlen = optp->len; 1708 /* malloc of 0 bytes is ok */ 1709 databuf.buf = malloc((size_t)optp->len); 1710 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1711 logperror("update_router_list: malloc"); 1712 return (_B_FALSE); 1713 } 1714 databuf.len = 0; 1715 flags = 0; 1716 for (;;) { 1717 status = getmsg(fd, NULL, &databuf, &flags); 1718 if (status >= 0) { 1719 break; 1720 } else if (errno == EINTR) { 1721 continue; 1722 } else { 1723 logperror("update_router_list:" 1724 " getmsg(data)"); 1725 free(databuf.buf); 1726 return (_B_FALSE); 1727 } 1728 } 1729 1730 if (optp->level == MIB2_IP && 1731 optp->name == MIB2_IP_ROUTE) { 1732 /* LINTED */ 1733 ire_process_v4((mib2_ipRouteEntry_t *) 1734 databuf.buf, databuf.len); 1735 } else if (optp->level == MIB2_IP6 && 1736 optp->name == MIB2_IP6_ROUTE) { 1737 /* LINTED */ 1738 ire_process_v6((mib2_ipv6RouteEntry_t *) 1739 databuf.buf, databuf.len); 1740 } 1741 free(databuf.buf); 1742 } 1743 } 1744 /* NOTREACHED */ 1745 } 1746 1747 /* 1748 * Examine the IPv4 routing table, for default routers. 
For each default 1749 * router, populate the list of targets of each phyint that is on the same 1750 * link as the default router 1751 */ 1752 static void 1753 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1754 { 1755 mib2_ipRouteEntry_t *rp; 1756 mib2_ipRouteEntry_t *rp1; 1757 struct in_addr nexthop_v4; 1758 mib2_ipRouteEntry_t *endp; 1759 1760 if (len == 0) 1761 return; 1762 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1763 1764 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1765 1766 /* 1767 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1768 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1769 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1770 * This is a potential target for probing, which we try to add 1771 * to the list of probe targets. 1772 */ 1773 for (rp = buf; rp < endp; rp++) { 1774 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1775 continue; 1776 1777 /* Get the nexthop address. */ 1778 nexthop_v4.s_addr = rp->ipRouteNextHop; 1779 1780 /* 1781 * Get the nexthop address. Then determine the outgoing 1782 * interface, by examining all interface IREs, and picking the 1783 * match. We don't look at the interface specified in the route 1784 * because we need to add the router target on all matching 1785 * interfaces anyway; the goal is to avoid falling back to 1786 * multicast when some interfaces are in the same subnet but 1787 * not in the same group. 1788 */ 1789 for (rp1 = buf; rp1 < endp; rp1++) { 1790 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE)) { 1791 continue; 1792 } 1793 1794 /* 1795 * Determine the interface IRE that matches the nexthop. 1796 * i.e. 
(IRE addr & IRE mask) == (nexthop & IRE mask) 1797 */ 1798 if ((rp1->ipRouteDest & rp1->ipRouteMask) == 1799 (nexthop_v4.s_addr & rp1->ipRouteMask)) { 1800 /* 1801 * We found the interface ire 1802 */ 1803 router_add_v4(rp1, nexthop_v4); 1804 } 1805 } 1806 } 1807 } 1808 1809 void 1810 router_add_v4(mib2_ipRouteEntry_t *rp1, struct in_addr nexthop_v4) 1811 { 1812 char *cp; 1813 char ifname[LIFNAMSIZ + 1]; 1814 struct in6_addr nexthop; 1815 int len; 1816 1817 if (debug & D_TARGET) 1818 logdebug("router_add_v4()\n"); 1819 1820 len = MIN(rp1->ipRouteIfIndex.o_length, sizeof (ifname) - 1); 1821 (void) memcpy(ifname, rp1->ipRouteIfIndex.o_bytes, len); 1822 ifname[len] = '\0'; 1823 1824 if (ifname[0] == '\0') 1825 return; 1826 1827 cp = strchr(ifname, IF_SEPARATOR); 1828 if (cp != NULL) 1829 *cp = '\0'; 1830 1831 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1832 router_add_common(AF_INET, ifname, nexthop); 1833 } 1834 1835 void 1836 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1837 { 1838 struct phyint_instance *pii; 1839 struct phyint *pi; 1840 1841 if (debug & D_TARGET) 1842 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1843 1844 /* 1845 * Retrieve the phyint instance; bail if it's not known to us yet. 1846 */ 1847 pii = phyint_inst_lookup(af, ifname); 1848 if (pii == NULL) 1849 return; 1850 1851 /* 1852 * Don't use our own addresses as targets. 1853 */ 1854 if (own_address(pii->pii_af, nexthop)) 1855 return; 1856 1857 /* 1858 * If the phyint is part a named group, then add the address to all 1859 * members of the group; note that this is suboptimal in the IPv4 case 1860 * as it has already been added to all matching interfaces in 1861 * ire_process_v4(). Otherwise, add the address only to the phyint 1862 * itself, since other phyints in the anongroup may not be on the same 1863 * subnet. 
1864 */ 1865 pi = pii->pii_phyint; 1866 if (pi->pi_group == phyint_anongroup) { 1867 target_add(pii, nexthop, _B_TRUE); 1868 } else { 1869 pi = pi->pi_group->pg_phyint; 1870 for (; pi != NULL; pi = pi->pi_pgnext) 1871 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1872 } 1873 } 1874 1875 /* 1876 * Examine the IPv6 routing table, for default routers. For each default 1877 * router, populate the list of targets of each phyint that is on the same 1878 * link as the default router 1879 */ 1880 static void 1881 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1882 { 1883 mib2_ipv6RouteEntry_t *rp; 1884 mib2_ipv6RouteEntry_t *endp; 1885 struct in6_addr nexthop_v6; 1886 1887 if (debug & D_TARGET) 1888 logdebug("ire_process_v6(len %d)\n", len); 1889 1890 if (len == 0) 1891 return; 1892 1893 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1894 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1895 1896 /* 1897 * Loop thru the routing table entries. Process any IRE_DEFAULT, 1898 * IRE_PREFIX, IRE_HOST, IRE_HOST_REDIRECT ire. Ignore the others. 1899 * For each such IRE_OFFSUBNET ire, get the nexthop gateway address. 1900 * This is a potential target for probing, which we try to add 1901 * to the list of probe targets. 1902 */ 1903 for (rp = buf; rp < endp; rp++) { 1904 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET)) 1905 continue; 1906 1907 /* 1908 * We have the outgoing interface in ipv6RouteIfIndex 1909 * if ipv6RouteIfindex.o_length is non-zero. The outgoing 1910 * interface must be present for link-local addresses. Since 1911 * we use only link-local addreses for probing, we don't 1912 * consider the case when the outgoing interface is not 1913 * known and we need to scan interface ires 1914 */ 1915 nexthop_v6 = rp->ipv6RouteNextHop; 1916 if (rp->ipv6RouteIfIndex.o_length != 0) { 1917 /* 1918 * We already have the outgoing interface 1919 * in ipv6RouteIfIndex. 
1920 */ 1921 router_add_v6(rp, nexthop_v6); 1922 } 1923 } 1924 } 1925 1926 1927 void 1928 router_add_v6(mib2_ipv6RouteEntry_t *rp1, struct in6_addr nexthop_v6) 1929 { 1930 char ifname[LIFNAMSIZ + 1]; 1931 char *cp; 1932 int len; 1933 1934 if (debug & D_TARGET) 1935 logdebug("router_add_v6()\n"); 1936 1937 len = MIN(rp1->ipv6RouteIfIndex.o_length, sizeof (ifname) - 1); 1938 (void) memcpy(ifname, rp1->ipv6RouteIfIndex.o_bytes, len); 1939 ifname[len] = '\0'; 1940 1941 if (ifname[0] == '\0') 1942 return; 1943 1944 cp = strchr(ifname, IF_SEPARATOR); 1945 if (cp != NULL) 1946 *cp = '\0'; 1947 1948 router_add_common(AF_INET6, ifname, nexthop_v6); 1949 } 1950 1951 1952 1953 /* 1954 * Build a list of target routers, by scanning the routing tables. 1955 * It is assumed that interface routes exist, to reach the routers. 1956 */ 1957 static void 1958 init_router_targets(void) 1959 { 1960 struct target *tg; 1961 struct target *next_tg; 1962 struct phyint_instance *pii; 1963 struct phyint *pi; 1964 1965 if (force_mcast) 1966 return; 1967 1968 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1969 pi = pii->pii_phyint; 1970 /* 1971 * Exclude ptp and host targets. Set tg_in_use to false, 1972 * only for router targets. 
1973 */ 1974 if (!pii->pii_targets_are_routers || 1975 (pi->pi_flags & IFF_POINTOPOINT)) 1976 continue; 1977 1978 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1979 tg->tg_in_use = 0; 1980 } 1981 1982 if (mibfd < 0) { 1983 mibfd = open("/dev/ip", O_RDWR); 1984 if (mibfd < 0) { 1985 logperror("mibopen: ip open"); 1986 exit(1); 1987 } 1988 } 1989 1990 if (!update_router_list(mibfd)) { 1991 (void) close(mibfd); 1992 mibfd = -1; 1993 } 1994 1995 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1996 if (!pii->pii_targets_are_routers || 1997 (pi->pi_flags & IFF_POINTOPOINT)) 1998 continue; 1999 2000 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 2001 next_tg = tg->tg_next; 2002 if (!tg->tg_in_use) { 2003 target_delete(tg); 2004 } 2005 } 2006 } 2007 } 2008 2009 /* 2010 * Attempt to assign host targets to any interfaces that do not currently 2011 * have probe targets by sharing targets with other interfaces in the group. 2012 */ 2013 static void 2014 init_host_targets(void) 2015 { 2016 struct phyint_instance *pii; 2017 struct phyint_group *pg; 2018 2019 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 2020 pg = pii->pii_phyint->pi_group; 2021 if (pg != phyint_anongroup && pii->pii_targets == NULL) 2022 dup_host_targets(pii); 2023 } 2024 } 2025 2026 /* 2027 * Duplicate host targets from other phyints of the group to 2028 * the phyint instance 'desired_pii'. 2029 */ 2030 static void 2031 dup_host_targets(struct phyint_instance *desired_pii) 2032 { 2033 int af; 2034 struct phyint *pi; 2035 struct phyint_instance *pii; 2036 struct target *tg; 2037 2038 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 2039 2040 af = desired_pii->pii_af; 2041 2042 /* 2043 * For every phyint in the same group as desired_pii, check if 2044 * it has any host targets. If so add them to desired_pii. 
2045 */ 2046 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 2047 pii = PHYINT_INSTANCE(pi, af); 2048 /* 2049 * We know that we don't have targets on this phyint instance 2050 * since we have been called. But we still check for 2051 * pii_targets_are_routers because another phyint instance 2052 * could have router targets, since IFF_NOFAILOVER addresses 2053 * on different phyint instances may belong to different 2054 * subnets. 2055 */ 2056 if ((pii == NULL) || (pii == desired_pii) || 2057 pii->pii_targets_are_routers) 2058 continue; 2059 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 2060 target_create(desired_pii, tg->tg_address, _B_FALSE); 2061 } 2062 } 2063 } 2064 2065 static void 2066 usage(char *cmd) 2067 { 2068 (void) fprintf(stderr, "usage: %s\n", cmd); 2069 } 2070 2071 2072 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2073 2074 /* Get an option from the /etc/default/mpathd file */ 2075 static char * 2076 getdefault(char *name) 2077 { 2078 char namebuf[BUFSIZ]; 2079 char *value = NULL; 2080 2081 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2082 char *cp; 2083 int flags; 2084 2085 /* 2086 * ignore case 2087 */ 2088 flags = defcntl(DC_GETFLAGS, 0); 2089 TURNOFF(flags, DC_CASE); 2090 (void) defcntl(DC_SETFLAGS, flags); 2091 2092 /* Add "=" to the name */ 2093 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2094 (void) strncat(namebuf, "=", 2); 2095 2096 if ((cp = defread(namebuf)) != NULL) 2097 value = strdup(cp); 2098 2099 /* close */ 2100 (void) defopen((char *)NULL); 2101 } 2102 return (value); 2103 } 2104 2105 2106 /* 2107 * Command line options below 2108 */ 2109 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2110 boolean_t track_all_phyints = _B_FALSE; /* option to track all NICs */ 2111 static boolean_t adopt = _B_FALSE; 2112 static boolean_t foreground = _B_FALSE; 2113 2114 int 2115 main(int argc, char *argv[]) 2116 { 2117 int i; 2118 int c; 2119 struct phyint_instance *pii; 2120 char 
*value; 2121 2122 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2123 srandom(gethostid()); /* Initialize the random number generator */ 2124 2125 /* 2126 * NOTE: The messages output by in.mpathd are not suitable for 2127 * translation, so we do not call textdomain(). 2128 */ 2129 (void) setlocale(LC_ALL, ""); 2130 2131 /* 2132 * Get the user specified value of 'failure detection time' 2133 * from /etc/default/mpathd 2134 */ 2135 value = getdefault("FAILURE_DETECTION_TIME"); 2136 if (value != NULL) { 2137 user_failure_detection_time = 2138 (int)strtol((char *)value, NULL, 0); 2139 2140 if (user_failure_detection_time <= 0) { 2141 user_failure_detection_time = FAILURE_DETECTION_TIME; 2142 logerr("Invalid failure detection time %s, assuming " 2143 "default %d\n", value, user_failure_detection_time); 2144 2145 } else if (user_failure_detection_time < 2146 MIN_FAILURE_DETECTION_TIME) { 2147 user_failure_detection_time = 2148 MIN_FAILURE_DETECTION_TIME; 2149 logerr("Too small failure detection time of %s, " 2150 "assuming minimum %d\n", value, 2151 user_failure_detection_time); 2152 } 2153 free(value); 2154 } else { 2155 /* User has not specified the parameter, Use default value */ 2156 user_failure_detection_time = FAILURE_DETECTION_TIME; 2157 } 2158 2159 /* 2160 * This gives the frequency at which probes will be sent. 2161 * When fdt ms elapses, we should be able to determine 2162 * whether 5 consecutive probes have failed or not. 2163 * 1 probe will be sent in every user_probe_interval ms, 2164 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2165 * user_probe_interval. Thus when we send out probe 'n' we 2166 * can be sure that probe 'n - 2' is lost, if we have not 2167 * got the ack. (since the probe interval is > crtt). But 2168 * probe 'n - 1' may be a valid unacked probe, since the 2169 * time between 2 successive probes could be as small as 2170 * 0.5 * user_probe_interval. 
Hence the NUM_PROBE_FAILS + 2 2171 */ 2172 user_probe_interval = user_failure_detection_time / 2173 (NUM_PROBE_FAILS + 2); 2174 2175 /* 2176 * Get the user specified value of failback_enabled from 2177 * /etc/default/mpathd 2178 */ 2179 value = getdefault("FAILBACK"); 2180 if (value != NULL) { 2181 if (strncasecmp(value, "yes", 3) == 0) 2182 failback_enabled = _B_TRUE; 2183 else if (strncasecmp(value, "no", 2) == 0) 2184 failback_enabled = _B_FALSE; 2185 else 2186 logerr("Invalid value for FAILBACK %s\n", value); 2187 free(value); 2188 } else { 2189 failback_enabled = _B_TRUE; 2190 } 2191 2192 /* 2193 * Get the user specified value of track_all_phyints from 2194 * /etc/default/mpathd. The sense is reversed in 2195 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2196 */ 2197 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2198 if (value != NULL) { 2199 if (strncasecmp(value, "yes", 3) == 0) 2200 track_all_phyints = _B_FALSE; 2201 else if (strncasecmp(value, "no", 2) == 0) 2202 track_all_phyints = _B_TRUE; 2203 else 2204 logerr("Invalid value for " 2205 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2206 free(value); 2207 } else { 2208 track_all_phyints = _B_FALSE; 2209 } 2210 2211 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2212 switch (c) { 2213 case 'a': 2214 adopt = _B_TRUE; 2215 break; 2216 case 'm': 2217 force_mcast = _B_TRUE; 2218 break; 2219 case 'd': 2220 debug = D_ALL; 2221 foreground = _B_TRUE; 2222 break; 2223 case 'D': 2224 i = (int)strtol(optarg, NULL, 0); 2225 if (i == 0) { 2226 (void) fprintf(stderr, "Bad debug flags: %s\n", 2227 optarg); 2228 exit(1); 2229 } 2230 debug |= i; 2231 foreground = _B_TRUE; 2232 break; 2233 case 'l': 2234 /* 2235 * Turn off link state notification handling. 2236 * Undocumented command line flag, for debugging 2237 * purposes. 
2238 */ 2239 handle_link_notifications = _B_FALSE; 2240 break; 2241 default: 2242 usage(argv[0]); 2243 exit(1); 2244 } 2245 } 2246 2247 /* 2248 * The sockets for the loopback command interface should be listening 2249 * before we fork and exit in daemonize(). This way, whoever started us 2250 * can use the loopback interface as soon as they get a zero exit 2251 * status. 2252 */ 2253 lsock_v4 = setup_listener(AF_INET); 2254 lsock_v6 = setup_listener(AF_INET6); 2255 2256 if (lsock_v4 < 0 && lsock_v6 < 0) { 2257 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2258 exit(1); 2259 } 2260 2261 if (!foreground) { 2262 if (!daemonize()) { 2263 logerr("cannot daemonize\n"); 2264 exit(EXIT_FAILURE); 2265 } 2266 initlog(); 2267 } 2268 2269 /* 2270 * Initializations: 2271 * 1. Create ifsock* sockets. These are used for performing SIOC* 2272 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2273 * 2. Initialize a pipe for handling/recording signal events. 2274 * 3. Create the routing sockets, used for listening 2275 * to routing / interface changes. 2276 * 4. phyint_init() - Initialize physical interface state 2277 * (in mpd_tables.c). Must be done before creating interfaces, 2278 * which timer_init() does indirectly. 2279 * 5. timer_init() - Initialize timer related stuff 2280 * 6. initifs() - Initialize our database of all known interfaces 2281 * 7. init_router_targets() - Initialize our database of all known 2282 * router targets. 
2283 */ 2284 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2285 if (ifsock_v4 < 0) { 2286 logperror("main: IPv4 socket open"); 2287 exit(1); 2288 } 2289 2290 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2291 if (ifsock_v6 < 0) { 2292 logperror("main: IPv6 socket open"); 2293 exit(1); 2294 } 2295 2296 setup_eventpipe(); 2297 2298 rtsock_v4 = setup_rtsock(AF_INET); 2299 rtsock_v6 = setup_rtsock(AF_INET6); 2300 2301 if (phyint_init() == -1) { 2302 logerr("cannot initialize physical interface structures"); 2303 exit(1); 2304 } 2305 2306 timer_init(); 2307 2308 initifs(); 2309 2310 /* 2311 * If we're operating in "adopt" mode and no interfaces need to be 2312 * tracked, shut down (ifconfig(1M) will restart us on demand if 2313 * interfaces are subsequently put into multipathing groups). 2314 */ 2315 if (adopt && phyint_instances == NULL) 2316 exit(0); 2317 2318 /* 2319 * Main body. Keep listening for activity on any of the sockets 2320 * that we are monitoring and take appropriate action as necessary. 2321 * signals are also handled synchronously. 
2322 */ 2323 for (;;) { 2324 if (poll(pollfds, pollfd_num, -1) < 0) { 2325 if (errno == EINTR) 2326 continue; 2327 logperror("main: poll"); 2328 exit(1); 2329 } 2330 for (i = 0; i < pollfd_num; i++) { 2331 if ((pollfds[i].fd == -1) || 2332 !(pollfds[i].revents & POLLIN)) 2333 continue; 2334 if (pollfds[i].fd == eventpipe_read) { 2335 in_signal(eventpipe_read); 2336 break; 2337 } 2338 if (pollfds[i].fd == rtsock_v4 || 2339 pollfds[i].fd == rtsock_v6) { 2340 process_rtsock(rtsock_v4, rtsock_v6); 2341 break; 2342 } 2343 for (pii = phyint_instances; pii != NULL; 2344 pii = pii->pii_next) { 2345 if (pollfds[i].fd == pii->pii_probe_sock) { 2346 if (pii->pii_af == AF_INET) 2347 in_data(pii); 2348 else 2349 in6_data(pii); 2350 break; 2351 } 2352 } 2353 if (pollfds[i].fd == lsock_v4) 2354 loopback_cmd(lsock_v4, AF_INET); 2355 else if (pollfds[i].fd == lsock_v6) 2356 loopback_cmd(lsock_v6, AF_INET6); 2357 } 2358 if (full_scan_required) { 2359 initifs(); 2360 full_scan_required = _B_FALSE; 2361 } 2362 } 2363 /* NOTREACHED */ 2364 return (EXIT_SUCCESS); 2365 } 2366 2367 static int 2368 setup_listener(int af) 2369 { 2370 int sock; 2371 int on; 2372 int len; 2373 int ret; 2374 struct sockaddr_storage laddr; 2375 struct sockaddr_in *sin; 2376 struct sockaddr_in6 *sin6; 2377 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2378 2379 assert(af == AF_INET || af == AF_INET6); 2380 2381 sock = socket(af, SOCK_STREAM, 0); 2382 if (sock < 0) { 2383 logperror("setup_listener: socket"); 2384 exit(1); 2385 } 2386 2387 on = 1; 2388 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2389 sizeof (on)) < 0) { 2390 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2391 exit(1); 2392 } 2393 2394 bzero(&laddr, sizeof (laddr)); 2395 laddr.ss_family = af; 2396 2397 if (af == AF_INET) { 2398 sin = (struct sockaddr_in *)&laddr; 2399 sin->sin_port = htons(MPATHD_PORT); 2400 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2401 len = sizeof (struct sockaddr_in); 2402 } else { 2403 
sin6 = (struct sockaddr_in6 *)&laddr; 2404 sin6->sin6_port = htons(MPATHD_PORT); 2405 sin6->sin6_addr = loopback_addr; 2406 len = sizeof (struct sockaddr_in6); 2407 } 2408 2409 ret = bind(sock, (struct sockaddr *)&laddr, len); 2410 if (ret < 0) { 2411 if (errno == EADDRINUSE) { 2412 /* 2413 * Another instance of mpathd may be already active. 2414 */ 2415 logerr("main: is another instance of in.mpathd " 2416 "already active?\n"); 2417 exit(1); 2418 } else { 2419 (void) close(sock); 2420 return (-1); 2421 } 2422 } 2423 if (listen(sock, 30) < 0) { 2424 logperror("main: listen"); 2425 exit(1); 2426 } 2427 if (poll_add(sock) == -1) { 2428 (void) close(sock); 2429 exit(1); 2430 } 2431 2432 return (sock); 2433 } 2434 2435 /* 2436 * Table of commands and their expected size; used by loopback_cmd(). 2437 */ 2438 static struct { 2439 const char *name; 2440 unsigned int size; 2441 } commands[] = { 2442 { "MI_PING", sizeof (uint32_t) }, 2443 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2444 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2445 { "MI_SETOINDEX", sizeof (mi_setoindex_t) }, 2446 { "MI_QUERY", sizeof (mi_query_t) } 2447 }; 2448 2449 /* 2450 * Commands received over the loopback interface come here. Currently 2451 * the agents that send commands are ifconfig, if_mpadm and the RCM IPMP 2452 * module. ifconfig only makes a connection, and closes it to check if 2453 * in.mpathd is running. 2454 * if_mpadm sends commands in the format specified by the mpathd_interface 2455 * structure. 
2456 */ 2457 static void 2458 loopback_cmd(int sock, int family) 2459 { 2460 int newfd; 2461 ssize_t len; 2462 struct sockaddr_storage peer; 2463 struct sockaddr_in *peer_sin; 2464 struct sockaddr_in6 *peer_sin6; 2465 socklen_t peerlen; 2466 union mi_commands mpi; 2467 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2468 char abuf[INET6_ADDRSTRLEN]; 2469 uint_t cmd; 2470 int retval; 2471 2472 peerlen = sizeof (peer); 2473 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2474 if (newfd < 0) { 2475 logperror("loopback_cmd: accept"); 2476 return; 2477 } 2478 2479 switch (family) { 2480 case AF_INET: 2481 /* 2482 * Validate the address and port to make sure that 2483 * non privileged processes don't connect and start 2484 * talking to us. 2485 */ 2486 if (peerlen != sizeof (struct sockaddr_in)) { 2487 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2488 (void) close(newfd); 2489 return; 2490 } 2491 peer_sin = (struct sockaddr_in *)&peer; 2492 if ((ntohs(peer_sin->sin_port) >= IPPORT_RESERVED) || 2493 (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK)) { 2494 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2495 abuf, sizeof (abuf)); 2496 logerr("Attempt to connect from addr %s port %d\n", 2497 abuf, ntohs(peer_sin->sin_port)); 2498 (void) close(newfd); 2499 return; 2500 } 2501 break; 2502 2503 case AF_INET6: 2504 if (peerlen != sizeof (struct sockaddr_in6)) { 2505 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2506 (void) close(newfd); 2507 return; 2508 } 2509 /* 2510 * Validate the address and port to make sure that 2511 * non privileged processes don't connect and start 2512 * talking to us. 
2513 */ 2514 peer_sin6 = (struct sockaddr_in6 *)&peer; 2515 if ((ntohs(peer_sin6->sin6_port) >= IPPORT_RESERVED) || 2516 (!IN6_ARE_ADDR_EQUAL(&peer_sin6->sin6_addr, 2517 &loopback_addr))) { 2518 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2519 sizeof (abuf)); 2520 logerr("Attempt to connect from addr %s port %d\n", 2521 abuf, ntohs(peer_sin6->sin6_port)); 2522 (void) close(newfd); 2523 return; 2524 } 2525 2526 default: 2527 logdebug("loopback_cmd: family %d\n", family); 2528 (void) close(newfd); 2529 return; 2530 } 2531 2532 /* 2533 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2534 * all supported commands 2535 */ 2536 len = read(newfd, &mpi, sizeof (mpi)); 2537 2538 /* 2539 * ifconfig does not send any data. Just tests to see if mpathd 2540 * is already running. 2541 */ 2542 if (len <= 0) { 2543 (void) close(newfd); 2544 return; 2545 } 2546 2547 /* 2548 * In theory, we can receive any sized message for a stream socket, 2549 * but we don't expect that to happen for a small message over a 2550 * loopback connection. 
2551 */ 2552 if (len < sizeof (uint32_t)) { 2553 logerr("loopback_cmd: bad command format or read returns " 2554 "partial data %d\n", len); 2555 } 2556 2557 cmd = mpi.mi_command; 2558 if (cmd >= MI_NCMD) { 2559 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2560 (void) close(newfd); 2561 return; 2562 } 2563 2564 if (len < commands[cmd].size) { 2565 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2566 commands[cmd].name, commands[cmd].size, len); 2567 (void) close(newfd); 2568 return; 2569 } 2570 2571 retval = process_cmd(newfd, &mpi); 2572 if (retval != IPMP_SUCCESS) { 2573 logerr("failed processing %s: %s\n", commands[cmd].name, 2574 ipmp_errmsg(retval)); 2575 } 2576 (void) close(newfd); 2577 } 2578 2579 extern int global_errno; /* set by failover() or failback() */ 2580 2581 /* 2582 * Process the offline, undo offline and set original index commands, 2583 * received from if_mpadm(1M) 2584 */ 2585 static unsigned int 2586 process_cmd(int newfd, union mi_commands *mpi) 2587 { 2588 uint_t nif = 0; 2589 uint32_t cmd; 2590 struct phyint *pi; 2591 struct phyint *pi2; 2592 struct phyint_group *pg; 2593 boolean_t success; 2594 int error; 2595 struct mi_offline *mio; 2596 struct mi_undo_offline *miu; 2597 struct lifreq lifr; 2598 int ifsock; 2599 struct mi_setoindex *mis; 2600 2601 cmd = mpi->mi_command; 2602 2603 switch (cmd) { 2604 case MI_OFFLINE: 2605 mio = &mpi->mi_ocmd; 2606 /* 2607 * Lookup the interface that needs to be offlined. 2608 * If it does not exist, return a suitable error. 2609 */ 2610 pi = phyint_lookup(mio->mio_ifname); 2611 if (pi == NULL) 2612 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2613 2614 /* 2615 * Verify that the minimum redundancy requirements are met. 2616 * The multipathing group must have at least the specified 2617 * number of functional interfaces after offlining the 2618 * requested interface. Otherwise return a suitable error. 
2619 */ 2620 pg = pi->pi_group; 2621 nif = 0; 2622 if (pg != phyint_anongroup) { 2623 for (nif = 0, pi2 = pg->pg_phyint; pi2 != NULL; 2624 pi2 = pi2->pi_pgnext) { 2625 if ((pi2->pi_state == PI_RUNNING) || 2626 (pg->pg_groupfailed && 2627 !(pi2->pi_flags & IFF_OFFLINE))) 2628 nif++; 2629 } 2630 } 2631 if (nif < mio->mio_min_redundancy) 2632 return (send_result(newfd, IPMP_EMINRED, 0)); 2633 2634 /* 2635 * The order of operation is to set IFF_OFFLINE, followed by 2636 * failover. Setting IFF_OFFLINE ensures that no new ipif's 2637 * can be created. Subsequent failover moves everything on 2638 * the OFFLINE interface to some other functional interface. 2639 */ 2640 success = change_lif_flags(pi, IFF_OFFLINE, _B_TRUE); 2641 if (success) { 2642 if (!pi->pi_empty) { 2643 error = try_failover(pi, FAILOVER_NORMAL); 2644 if (error != 0) { 2645 if (!change_lif_flags(pi, IFF_OFFLINE, 2646 _B_FALSE)) { 2647 logerr("process_cmd: couldn't" 2648 " clear OFFLINE flag on" 2649 " %s\n", pi->pi_name); 2650 /* 2651 * Offline interfaces should 2652 * not be probed. 2653 */ 2654 stop_probing(pi); 2655 } 2656 return (send_result(newfd, error, 2657 global_errno)); 2658 } 2659 } 2660 } else { 2661 return (send_result(newfd, IPMP_FAILURE, errno)); 2662 } 2663 2664 /* 2665 * The interface is now Offline, so stop probing it. 2666 * Note that if_mpadm(1M) will down the test addresses, 2667 * after receiving a success reply from us. The routing 2668 * socket message will then make us close the socket used 2669 * for sending probes. But it is more logical that an 2670 * offlined interface must not be probed, even if it has 2671 * test addresses. 2672 */ 2673 stop_probing(pi); 2674 return (send_result(newfd, IPMP_SUCCESS, 0)); 2675 2676 case MI_UNDO_OFFLINE: 2677 miu = &mpi->mi_ucmd; 2678 /* 2679 * Undo the offline command. As usual lookup the interface. 2680 * Send an error if it does not exist. 
2681 */ 2682 pi = phyint_lookup(miu->miu_ifname); 2683 if (pi == NULL) 2684 return (send_result(newfd, IPMP_FAILURE, EINVAL)); 2685 2686 /* 2687 * Inverse of the offline operation. Do a failback, and then 2688 * clear the IFF_OFFLINE flag. 2689 */ 2690 error = do_failback(pi, _B_TRUE); 2691 if (error == IPMP_EFBPARTIAL) 2692 return (send_result(newfd, IPMP_EFBPARTIAL, 0)); 2693 error = do_failback(pi, _B_FALSE); 2694 2695 switch (error) { 2696 case IPMP_SUCCESS: 2697 if (!change_lif_flags(pi, IFF_OFFLINE, _B_FALSE)) { 2698 logdebug("undo error %X\n", global_errno); 2699 error = IPMP_FAILURE; 2700 break; 2701 } 2702 /* FALLTHROUGH */ 2703 2704 case IPMP_EFBPARTIAL: 2705 /* 2706 * Reset the state of the interface based on the 2707 * current link state; if this phyint subsequently 2708 * acquires a test address, the state will be changed 2709 * again later as a result of the probes. 2710 */ 2711 if (LINK_UP(pi)) 2712 phyint_chstate(pi, PI_RUNNING); 2713 else 2714 phyint_chstate(pi, PI_FAILED); 2715 break; 2716 2717 case IPMP_FAILURE: 2718 break; 2719 2720 default: 2721 logdebug("do_failback: unexpected return value\n"); 2722 break; 2723 } 2724 return (send_result(newfd, error, global_errno)); 2725 2726 case MI_SETOINDEX: 2727 mis = &mpi->mi_scmd; 2728 2729 /* Get the socket for doing ioctls */ 2730 ifsock = (mis->mis_iftype == AF_INET) ? ifsock_v4 : ifsock_v6; 2731 2732 /* 2733 * Get index of new original interface. 2734 * The index is returned in lifr.lifr_index. 2735 */ 2736 (void) strlcpy(lifr.lifr_name, mis->mis_new_pifname, 2737 sizeof (lifr.lifr_name)); 2738 2739 if (ioctl(ifsock, SIOCGLIFINDEX, (char *)&lifr) < 0) 2740 return (send_result(newfd, IPMP_FAILURE, errno)); 2741 2742 /* 2743 * Set new original interface index. 2744 * The new index was put into lifr.lifr_index by the 2745 * SIOCGLIFINDEX ioctl. 
2746 */ 2747 (void) strlcpy(lifr.lifr_name, mis->mis_lifname, 2748 sizeof (lifr.lifr_name)); 2749 2750 if (ioctl(ifsock, SIOCSLIFOINDEX, (char *)&lifr) < 0) 2751 return (send_result(newfd, IPMP_FAILURE, errno)); 2752 2753 return (send_result(newfd, IPMP_SUCCESS, 0)); 2754 2755 case MI_QUERY: 2756 return (process_query(newfd, &mpi->mi_qcmd)); 2757 2758 default: 2759 break; 2760 } 2761 2762 return (send_result(newfd, IPMP_EPROTO, 0)); 2763 } 2764 2765 /* 2766 * Process the query request pointed to by `miq' and send a reply on file 2767 * descriptor `fd'. Returns an IPMP error code. 2768 */ 2769 static unsigned int 2770 process_query(int fd, mi_query_t *miq) 2771 { 2772 ipmp_groupinfo_t *grinfop; 2773 ipmp_groupinfolist_t *grlp; 2774 ipmp_grouplist_t *grlistp; 2775 ipmp_ifinfo_t *ifinfop; 2776 ipmp_ifinfolist_t *iflp; 2777 ipmp_snap_t *snap; 2778 unsigned int retval; 2779 2780 switch (miq->miq_inforeq) { 2781 case IPMP_GROUPLIST: 2782 retval = getgrouplist(&grlistp); 2783 if (retval != IPMP_SUCCESS) 2784 return (send_result(fd, retval, errno)); 2785 2786 retval = send_result(fd, IPMP_SUCCESS, 0); 2787 if (retval == IPMP_SUCCESS) 2788 retval = send_grouplist(fd, grlistp); 2789 2790 ipmp_freegrouplist(grlistp); 2791 return (retval); 2792 2793 case IPMP_GROUPINFO: 2794 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2795 retval = getgroupinfo(miq->miq_ifname, &grinfop); 2796 if (retval != IPMP_SUCCESS) 2797 return (send_result(fd, retval, errno)); 2798 2799 retval = send_result(fd, IPMP_SUCCESS, 0); 2800 if (retval == IPMP_SUCCESS) 2801 retval = send_groupinfo(fd, grinfop); 2802 2803 ipmp_freegroupinfo(grinfop); 2804 return (retval); 2805 2806 case IPMP_IFINFO: 2807 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2808 retval = getifinfo(miq->miq_ifname, &ifinfop); 2809 if (retval != IPMP_SUCCESS) 2810 return (send_result(fd, retval, errno)); 2811 2812 retval = send_result(fd, IPMP_SUCCESS, 0); 2813 if (retval == IPMP_SUCCESS) 2814 retval = send_ifinfo(fd, ifinfop); 2815 2816 
ipmp_freeifinfo(ifinfop); 2817 return (retval); 2818 2819 case IPMP_SNAP: 2820 retval = getsnap(&snap); 2821 if (retval != IPMP_SUCCESS) 2822 return (send_result(fd, retval, errno)); 2823 2824 retval = send_result(fd, IPMP_SUCCESS, 0); 2825 if (retval != IPMP_SUCCESS) 2826 goto out; 2827 2828 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2829 if (retval != IPMP_SUCCESS) 2830 goto out; 2831 2832 retval = send_grouplist(fd, snap->sn_grlistp); 2833 if (retval != IPMP_SUCCESS) 2834 goto out; 2835 2836 iflp = snap->sn_ifinfolistp; 2837 for (; iflp != NULL; iflp = iflp->ifl_next) { 2838 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2839 if (retval != IPMP_SUCCESS) 2840 goto out; 2841 } 2842 2843 grlp = snap->sn_grinfolistp; 2844 for (; grlp != NULL; grlp = grlp->grl_next) { 2845 retval = send_groupinfo(fd, grlp->grl_grinfop); 2846 if (retval != IPMP_SUCCESS) 2847 goto out; 2848 } 2849 out: 2850 ipmp_snap_free(snap); 2851 return (retval); 2852 2853 default: 2854 break; 2855 2856 } 2857 return (send_result(fd, IPMP_EPROTO, 0)); 2858 } 2859 2860 /* 2861 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2862 * Returns an IPMP error code. 2863 */ 2864 static unsigned int 2865 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2866 { 2867 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2868 unsigned int retval; 2869 2870 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2871 if (retval != IPMP_SUCCESS) 2872 return (retval); 2873 2874 return (ipmp_writetlv(fd, IPMP_IFLIST, 2875 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp)); 2876 } 2877 2878 /* 2879 * Send the interface information pointed to by `ifinfop' on file descriptor 2880 * `fd'. Returns an IPMP error code. 2881 */ 2882 static unsigned int 2883 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2884 { 2885 return (ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop)); 2886 } 2887 2888 /* 2889 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 
2890 * Returns an IPMP error code. 2891 */ 2892 static unsigned int 2893 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2894 { 2895 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2896 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2897 } 2898 2899 /* 2900 * Initialize an mi_result_t structure using `error' and `syserror' and 2901 * send it on file descriptor `fd'. Returns an IPMP error code. 2902 */ 2903 static unsigned int 2904 send_result(int fd, unsigned int error, int syserror) 2905 { 2906 mi_result_t me; 2907 2908 me.me_mpathd_error = error; 2909 if (error == IPMP_FAILURE) 2910 me.me_sys_error = syserror; 2911 else 2912 me.me_sys_error = 0; 2913 2914 return (ipmp_write(fd, &me, sizeof (me))); 2915 } 2916 2917 /* 2918 * Daemonize the process. 2919 */ 2920 static boolean_t 2921 daemonize(void) 2922 { 2923 switch (fork()) { 2924 case -1: 2925 return (_B_FALSE); 2926 2927 case 0: 2928 /* 2929 * Lose our controlling terminal, and become both a session 2930 * leader and a process group leader. 2931 */ 2932 if (setsid() == -1) 2933 return (_B_FALSE); 2934 2935 /* 2936 * Under POSIX, a session leader can accidentally (through 2937 * open(2)) acquire a controlling terminal if it does not 2938 * have one. Just to be safe, fork() again so we are not a 2939 * session leader. 2940 */ 2941 switch (fork()) { 2942 case -1: 2943 return (_B_FALSE); 2944 2945 case 0: 2946 (void) chdir("/"); 2947 (void) umask(022); 2948 (void) fdwalk(closefunc, NULL); 2949 break; 2950 2951 default: 2952 _exit(EXIT_SUCCESS); 2953 } 2954 break; 2955 2956 default: 2957 _exit(EXIT_SUCCESS); 2958 } 2959 2960 return (_B_TRUE); 2961 } 2962 2963 /* 2964 * The parent has created some fds before forking on purpose, keep them open. 2965 */ 2966 static int 2967 closefunc(void *not_used, int fd) 2968 /* ARGSUSED */ 2969 { 2970 if (fd != lsock_v4 && fd != lsock_v6) 2971 (void) close(fd); 2972 return (0); 2973 } 2974 2975 /* LOGGER */ 2976 2977 #include <syslog.h> 2978 2979 /* 2980 * Logging routines. 
All routines log to syslog, unless the daemon is 2981 * running in the foreground, in which case the logging goes to stderr. 2982 * 2983 * The following routines are available: 2984 * 2985 * logdebug(): A printf-like function for outputting debug messages 2986 * (messages at LOG_DEBUG) that are only of use to developers. 2987 * 2988 * logtrace(): A printf-like function for outputting tracing messages 2989 * (messages at LOG_INFO) from the daemon. This is typically used 2990 * to log the receipt of interesting network-related conditions. 2991 * 2992 * logerr(): A printf-like function for outputting error messages 2993 * (messages at LOG_ERR) from the daemon. 2994 * 2995 * logperror*(): A set of functions used to output error messages 2996 * (messages at LOG_ERR); these automatically append strerror(errno) 2997 * and a newline to the message passed to them. 2998 * 2999 * NOTE: since the logging functions write to syslog, the messages passed 3000 * to them are not eligible for localization. Thus, gettext() must 3001 * *not* be used. 3002 */ 3003 3004 static int logging = 0; 3005 3006 static void 3007 initlog(void) 3008 { 3009 logging++; 3010 openlog("in.mpathd", LOG_PID | LOG_CONS, LOG_DAEMON); 3011 } 3012 3013 /* PRINTFLIKE1 */ 3014 void 3015 logerr(char *fmt, ...) 3016 { 3017 va_list ap; 3018 3019 va_start(ap, fmt); 3020 3021 if (logging) 3022 vsyslog(LOG_ERR, fmt, ap); 3023 else 3024 (void) vfprintf(stderr, fmt, ap); 3025 va_end(ap); 3026 } 3027 3028 /* PRINTFLIKE1 */ 3029 void 3030 logtrace(char *fmt, ...) 3031 { 3032 va_list ap; 3033 3034 va_start(ap, fmt); 3035 3036 if (logging) 3037 vsyslog(LOG_INFO, fmt, ap); 3038 else 3039 (void) vfprintf(stderr, fmt, ap); 3040 va_end(ap); 3041 } 3042 3043 /* PRINTFLIKE1 */ 3044 void 3045 logdebug(char *fmt, ...) 
3046 { 3047 va_list ap; 3048 3049 va_start(ap, fmt); 3050 3051 if (logging) 3052 vsyslog(LOG_DEBUG, fmt, ap); 3053 else 3054 (void) vfprintf(stderr, fmt, ap); 3055 va_end(ap); 3056 } 3057 3058 /* PRINTFLIKE1 */ 3059 void 3060 logperror(char *str) 3061 { 3062 if (logging) 3063 syslog(LOG_ERR, "%s: %m\n", str); 3064 else 3065 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 3066 } 3067 3068 void 3069 logperror_pii(struct phyint_instance *pii, char *str) 3070 { 3071 if (logging) { 3072 syslog(LOG_ERR, "%s (%s %s): %m\n", 3073 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 3074 } else { 3075 (void) fprintf(stderr, "%s (%s %s): %s\n", 3076 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 3077 strerror(errno)); 3078 } 3079 } 3080 3081 void 3082 logperror_li(struct logint *li, char *str) 3083 { 3084 struct phyint_instance *pii = li->li_phyint_inst; 3085 3086 if (logging) { 3087 syslog(LOG_ERR, "%s (%s %s): %m\n", 3088 str, AF_STR(pii->pii_af), li->li_name); 3089 } else { 3090 (void) fprintf(stderr, "%s (%s %s): %s\n", 3091 str, AF_STR(pii->pii_af), li->li_name, 3092 strerror(errno)); 3093 } 3094 } 3095 3096 void 3097 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 3098 { 3099 if (polled) 3100 (void) poll_remove(pii->pii_probe_sock); 3101 (void) close(pii->pii_probe_sock); 3102 pii->pii_probe_sock = -1; 3103 pii->pii_basetime_inited = 0; 3104 } 3105