1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include "mpd_defs.h" 27 #include "mpd_tables.h" 28 29 int debug = 0; /* Debug flag */ 30 static int pollfd_num = 0; /* Num. 
of poll descriptors */ 31 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 32 33 /* All times below in ms */ 34 int user_failure_detection_time; /* user specified failure detection */ 35 /* time (fdt) */ 36 int user_probe_interval; /* derived from user specified fdt */ 37 38 static int rtsock_v4; /* AF_INET routing socket */ 39 static int rtsock_v6; /* AF_INET6 routing socket */ 40 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 41 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 42 static int lsock_v4; /* Listen socket to detect mpathd */ 43 static int lsock_v6; /* Listen socket to detect mpathd */ 44 static int mibfd = -1; /* fd to get mib info */ 45 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 46 47 static uint_t last_initifs_time; /* Time when initifs was last run */ 48 static char **argv0; /* Saved for re-exec on SIGHUP */ 49 boolean_t handle_link_notifications = _B_TRUE; 50 51 static void initlog(void); 52 static void run_timeouts(void); 53 static void initifs(void); 54 static void check_if_removed(struct phyint_instance *pii); 55 static void select_test_ifs(void); 56 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 57 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 58 static void router_add_common(int af, char *ifname, 59 struct in6_addr nexthop); 60 static void init_router_targets(); 61 static void cleanup(void); 62 static int setup_listener(int af); 63 static void check_config(void); 64 static void check_testconfig(void); 65 static void check_addr_unique(struct phyint_instance *, 66 struct sockaddr_storage *); 67 static void init_host_targets(void); 68 static void dup_host_targets(struct phyint_instance *desired_pii); 69 static void loopback_cmd(int sock, int family); 70 static boolean_t daemonize(void); 71 static int closefunc(void *, int); 72 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 73 static unsigned int process_query(int fd, mi_query_t *miq); 
74 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); 75 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 76 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 77 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 78 static unsigned int send_result(int fd, unsigned int error, int syserror); 79 80 addrlist_t *localaddrs; 81 82 /* 83 * Return the current time in milliseconds (from an arbitrary reference) 84 * truncated to fit into an int. Truncation is ok since we are interested 85 * only in differences and not the absolute values. 86 */ 87 uint_t 88 getcurrenttime(void) 89 { 90 uint_t cur_time; /* In ms */ 91 92 /* 93 * Use of a non-user-adjustable source of time is 94 * required. However millisecond precision is sufficient. 95 * divide by 10^6 96 */ 97 cur_time = (uint_t)(gethrtime() / 1000000LL); 98 return (cur_time); 99 } 100 101 uint64_t 102 getcurrentsec(void) 103 { 104 return (gethrtime() / NANOSEC); 105 } 106 107 /* 108 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 109 */ 110 int 111 poll_add(int fd) 112 { 113 int i; 114 int new_num; 115 struct pollfd *newfds; 116 retry: 117 /* Check if already present */ 118 for (i = 0; i < pollfd_num; i++) { 119 if (pollfds[i].fd == fd) 120 return (0); 121 } 122 /* Check for empty spot already present */ 123 for (i = 0; i < pollfd_num; i++) { 124 if (pollfds[i].fd == -1) { 125 pollfds[i].fd = fd; 126 return (0); 127 } 128 } 129 130 /* Allocate space for 32 more fds and initialize to -1 */ 131 new_num = pollfd_num + 32; 132 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 133 if (newfds == NULL) { 134 logperror("poll_add: realloc"); 135 return (-1); 136 } 137 for (i = pollfd_num; i < new_num; i++) { 138 newfds[i].fd = -1; 139 newfds[i].events = POLLIN; 140 } 141 pollfd_num = new_num; 142 pollfds = newfds; 143 goto retry; 144 } 145 146 /* 147 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 
148 */ 149 int 150 poll_remove(int fd) 151 { 152 int i; 153 154 /* Check if already present */ 155 for (i = 0; i < pollfd_num; i++) { 156 if (pollfds[i].fd == fd) { 157 pollfds[i].fd = -1; 158 return (0); 159 } 160 } 161 return (-1); 162 } 163 164 /* 165 * Extract information about the phyint instance. If the phyint instance still 166 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 167 * will use it to detect phyint instances that don't exist any longer and 168 * remove them, from our database of phyint instances. 169 * Return value: 170 * returns true if the phyint instance exists in the kernel, 171 * returns false otherwise 172 */ 173 static boolean_t 174 pii_process(int af, char *name, struct phyint_instance **pii_p) 175 { 176 int err; 177 struct phyint_instance *pii; 178 struct phyint_instance *pii_other; 179 180 if (debug & D_PHYINT) 181 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 182 183 pii = phyint_inst_lookup(af, name); 184 if (pii == NULL) { 185 /* 186 * Phyint instance does not exist in our tables, 187 * create new phyint instance 188 */ 189 pii = phyint_inst_init_from_k(af, name); 190 } else { 191 /* Phyint exists in our tables */ 192 err = phyint_inst_update_from_k(pii); 193 194 switch (err) { 195 case PI_IOCTL_ERROR: 196 /* Some ioctl error. don't change anything */ 197 pii->pii_in_use = 1; 198 break; 199 200 case PI_GROUP_CHANGED: 201 case PI_IFINDEX_CHANGED: 202 /* 203 * Interface index or group membership has changed. 204 * Delete the old state and recreate based on the new 205 * state (it may no longer be in a group). 
206 */ 207 pii_other = phyint_inst_other(pii); 208 if (pii_other != NULL) 209 phyint_inst_delete(pii_other); 210 phyint_inst_delete(pii); 211 pii = phyint_inst_init_from_k(af, name); 212 break; 213 214 case PI_DELETED: 215 /* Phyint instance has disappeared from kernel */ 216 pii->pii_in_use = 0; 217 break; 218 219 case PI_OK: 220 /* Phyint instance exists and is fine */ 221 pii->pii_in_use = 1; 222 break; 223 224 default: 225 /* Unknown status */ 226 logerr("pii_process: Unknown status %d\n", err); 227 break; 228 } 229 } 230 231 *pii_p = pii; 232 if (pii != NULL) 233 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 234 else 235 return (_B_FALSE); 236 } 237 238 /* 239 * Scan all interfaces to detect changes as well as new and deleted interfaces 240 */ 241 static void 242 initifs() 243 { 244 int i, nlifr; 245 int af; 246 char *cp; 247 char *buf; 248 int sockfd; 249 uint64_t flags; 250 struct lifnum lifn; 251 struct lifconf lifc; 252 struct lifreq lifreq; 253 struct lifreq *lifr; 254 struct logint *li; 255 struct phyint_instance *pii; 256 struct phyint_instance *next_pii; 257 struct phyint_group *pg, *next_pg; 258 char pi_name[LIFNAMSIZ + 1]; 259 260 if (debug & D_PHYINT) 261 logdebug("initifs: Scanning interfaces\n"); 262 263 last_initifs_time = getcurrenttime(); 264 265 /* 266 * Free the existing local address list; we'll build a new list below. 267 */ 268 addrlist_free(&localaddrs); 269 270 /* 271 * Mark the interfaces so that we can find phyints and logints 272 * which have disappeared from the kernel. pii_process() and 273 * logint_init_from_k() will set {pii,li}_in_use when they find 274 * the interface in the kernel. Also, clear dupaddr bit on probe 275 * logint. check_addr_unique() will set the dupaddr bit on the 276 * probe logint, if the testaddress is not unique. 
277 */ 278 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 279 pii->pii_in_use = 0; 280 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 281 li->li_in_use = 0; 282 if (pii->pii_probe_logint == li) 283 li->li_dupaddr = 0; 284 } 285 } 286 287 /* 288 * As above, mark groups so that we can detect IPMP interfaces which 289 * have been removed from the kernel. Also, delete the group address 290 * list since we'll iteratively recreate it below. 291 */ 292 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 293 pg->pg_in_use = _B_FALSE; 294 addrlist_free(&pg->pg_addrs); 295 } 296 297 lifn.lifn_family = AF_UNSPEC; 298 lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 299 again: 300 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 301 logperror("initifs: ioctl (get interface count)"); 302 return; 303 } 304 /* 305 * Pad the interface count to detect when additional interfaces have 306 * been configured between SIOCGLIFNUM and SIOCGLIFCONF. 307 */ 308 lifn.lifn_count += 4; 309 310 if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { 311 logperror("initifs: calloc"); 312 return; 313 } 314 315 lifc.lifc_family = AF_UNSPEC; 316 lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 317 lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); 318 lifc.lifc_buf = buf; 319 320 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 321 logperror("initifs: ioctl (get interface configuration)"); 322 free(buf); 323 return; 324 } 325 326 /* 327 * If every lifr_req slot is taken, then additional interfaces must 328 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. 329 * Recalculate to make sure we didn't miss any interfaces. 330 */ 331 nlifr = lifc.lifc_len / sizeof (struct lifreq); 332 if (nlifr >= lifn.lifn_count) { 333 free(buf); 334 goto again; 335 } 336 337 /* 338 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the 339 * global list of addresses, phyint groups, phyints, and logints. 
340 */ 341 for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { 342 af = lifr->lifr_addr.ss_family; 343 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 344 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); 345 346 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 347 if (errno != ENXIO) 348 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 349 continue; 350 } 351 flags = lifreq.lifr_flags; 352 353 /* 354 * If the address is IFF_UP, add it to the local address list. 355 * (We ignore addresses that aren't IFF_UP since another node 356 * might legitimately have that address IFF_UP.) 357 */ 358 if (flags & IFF_UP) { 359 (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, 360 &lifr->lifr_addr); 361 } 362 363 /* 364 * If this address is on an IPMP meta-interface, update our 365 * phyint_group information (either by recording that group 366 * still exists or creating a new group), and track what 367 * group the address is part of. 368 */ 369 if (flags & IFF_IPMP) { 370 if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { 371 if (errno != ENXIO) 372 logperror("initifs: ioctl " 373 "(SIOCGLIFGROUPNAME)"); 374 continue; 375 } 376 377 pg = phyint_group_lookup(lifreq.lifr_groupname); 378 if (pg == NULL) { 379 pg = phyint_group_create(lifreq.lifr_groupname); 380 if (pg == NULL) { 381 logerr("initifs: cannot create group " 382 "%s\n", lifreq.lifr_groupname); 383 continue; 384 } 385 phyint_group_insert(pg); 386 } 387 pg->pg_in_use = _B_TRUE; 388 389 /* 390 * Add this to the group's list of data addresses. 391 */ 392 if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, 393 &lifr->lifr_addr)) { 394 logerr("initifs: insufficient memory to track " 395 "data address information for %s\n", 396 lifr->lifr_name); 397 } 398 continue; 399 } 400 401 /* 402 * This isn't an address on an IPMP meta-interface, so it's 403 * either on an underlying interface or not related to any 404 * group. 
Update our phyint and logint information (via 405 * pii_process() and logint_init_from_k()) -- but first, 406 * convert the logint name to a phyint name so we can call 407 * pii_process(). 408 */ 409 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 410 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 411 *cp = '\0'; 412 413 if (pii_process(af, pi_name, &pii)) { 414 /* The phyint is fine. So process the logint */ 415 logint_init_from_k(pii, lifr->lifr_name); 416 check_addr_unique(pii, &lifr->lifr_addr); 417 } 418 } 419 free(buf); 420 421 /* 422 * Scan for groups, phyints and logints that have disappeared from the 423 * kernel, and delete them. 424 */ 425 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 426 next_pii = pii->pii_next; 427 check_if_removed(pii); 428 } 429 430 for (pg = phyint_groups; pg != NULL; pg = next_pg) { 431 next_pg = pg->pg_next; 432 if (!pg->pg_in_use) { 433 phyint_group_delete(pg); 434 continue; 435 } 436 /* 437 * Refresh the group's state. This is necessary since the 438 * group's state is defined by the set of usable interfaces in 439 * the group, and an interface is considered unusable if all 440 * of its addresses are down. When an address goes down/up, 441 * the RTM_DELADDR/RTM_NEWADDR brings us through here. 442 */ 443 phyint_group_refresh_state(pg); 444 } 445 446 /* 447 * Select a test address for sending probes on each phyint instance 448 */ 449 select_test_ifs(); 450 451 /* 452 * Handle link up/down notifications. 453 */ 454 process_link_state_changes(); 455 } 456 457 /* 458 * Check that a given test address is unique across all of the interfaces in a 459 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 460 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 461 * Any issues will be reported by check_testconfig(). 
462 */ 463 static void 464 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 465 { 466 struct phyint *pi; 467 struct phyint_group *pg; 468 struct in6_addr addr; 469 struct phyint_instance *pii; 470 struct sockaddr_in *sin; 471 472 if (ss->ss_family == AF_INET) { 473 sin = (struct sockaddr_in *)ss; 474 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 475 } else { 476 assert(ss->ss_family == AF_INET6); 477 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 478 } 479 480 /* 481 * For anonymous groups, every interface is assumed to be on its own 482 * link, so there is no chance of overlapping addresses. 483 */ 484 pg = ourpii->pii_phyint->pi_group; 485 if (pg == phyint_anongroup) 486 return; 487 488 /* 489 * Walk the list of phyint instances in the group and check for test 490 * addresses matching ours. Of course, we skip ourself. 491 */ 492 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 493 pii = PHYINT_INSTANCE(pi, ss->ss_family); 494 if (pii == NULL || pii == ourpii || 495 pii->pii_probe_logint == NULL) 496 continue; 497 498 /* 499 * If this test address is not unique, set the dupaddr bit. 500 */ 501 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 502 pii->pii_probe_logint->li_dupaddr = 1; 503 } 504 } 505 506 /* 507 * Stop probing an interface. Called when an interface is offlined. 508 * The probe socket is closed on each interface instance, and the 509 * interface state set to PI_OFFLINE. 
510 */ 511 void 512 stop_probing(struct phyint *pi) 513 { 514 struct phyint_instance *pii; 515 516 pii = pi->pi_v4; 517 if (pii != NULL) { 518 if (pii->pii_probe_sock != -1) 519 close_probe_socket(pii, _B_TRUE); 520 pii->pii_probe_logint = NULL; 521 } 522 523 pii = pi->pi_v6; 524 if (pii != NULL) { 525 if (pii->pii_probe_sock != -1) 526 close_probe_socket(pii, _B_TRUE); 527 pii->pii_probe_logint = NULL; 528 } 529 530 phyint_chstate(pi, PI_OFFLINE); 531 } 532 533 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 534 535 /* 536 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 537 * IFF_UP must also be set so that the associated address can be used as a 538 * source address. Further, we must be able to exchange packets with local 539 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 540 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 541 */ 542 static int 543 rate_testflags(uint64_t flags) 544 { 545 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 546 return (BAD_TESTFLAGS); 547 548 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 549 return (BAD_TESTFLAGS); 550 551 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 552 return (BEST_TESTFLAGS); 553 554 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 555 return (BEST_TESTFLAGS); 556 557 return (OK_TESTFLAGS); 558 } 559 560 /* 561 * Attempt to select a test address for each phyint instance. 562 * Call phyint_inst_sockinit() to complete the initializations. 
563 */ 564 static void 565 select_test_ifs(void) 566 { 567 struct phyint *pi; 568 struct phyint_instance *pii; 569 struct phyint_instance *next_pii; 570 struct logint *li; 571 struct logint *probe_logint; 572 boolean_t target_scan_reqd = _B_FALSE; 573 int rating; 574 575 if (debug & D_PHYINT) 576 logdebug("select_test_ifs\n"); 577 578 /* 579 * For each phyint instance, do the test address selection 580 */ 581 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 582 next_pii = pii->pii_next; 583 probe_logint = NULL; 584 585 /* 586 * An interface that is offline should not be probed. 587 * IFF_OFFLINE interfaces should always be PI_OFFLINE 588 * unless some other entity has set the offline flag. 589 */ 590 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 591 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 592 logerr("shouldn't be probing offline" 593 " interface %s (state is: %u)." 594 " Stopping probes.\n", 595 pii->pii_phyint->pi_name, 596 pii->pii_phyint->pi_state); 597 stop_probing(pii->pii_phyint); 598 } 599 continue; 600 } else { 601 /* 602 * If something cleared IFF_OFFLINE (e.g., by accident 603 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is 604 * inherently racy), the phyint may still be offline. 605 * Just ignore it. 606 */ 607 if (pii->pii_phyint->pi_state == PI_OFFLINE) 608 continue; 609 } 610 611 li = pii->pii_probe_logint; 612 if (li != NULL) { 613 /* 614 * We've already got a test address; only proceed 615 * if it's suboptimal. 616 */ 617 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 618 continue; 619 } 620 621 /* 622 * Walk the logints of this phyint instance, and select 623 * the best available test address 624 */ 625 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 626 /* 627 * Skip 0.0.0.0 addresses, as those are never 628 * actually usable. 
629 */ 630 if (pii->pii_af == AF_INET && 631 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 632 continue; 633 634 /* 635 * Skip any IPv6 logints that are not link-local, 636 * since we should always have a link-local address 637 * anyway and in6_data() expects link-local replies. 638 */ 639 if (pii->pii_af == AF_INET6 && 640 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 641 continue; 642 643 /* 644 * Rate the testflags. If we've found an optimal 645 * match, then break out; otherwise, record the most 646 * recent OK one. 647 */ 648 rating = rate_testflags(li->li_flags); 649 if (rating == BAD_TESTFLAGS) 650 continue; 651 652 probe_logint = li; 653 if (rating == BEST_TESTFLAGS) 654 break; 655 } 656 657 /* 658 * If the probe logint has changed, ditch the old one. 659 */ 660 if (pii->pii_probe_logint != NULL && 661 pii->pii_probe_logint != probe_logint) { 662 if (pii->pii_probe_sock != -1) 663 close_probe_socket(pii, _B_TRUE); 664 pii->pii_probe_logint = NULL; 665 } 666 667 if (probe_logint == NULL) { 668 /* 669 * We don't have a test address; zero out the probe 670 * stats array since it is no longer relevant. 671 * Optimize by checking if it is already zeroed out. 672 */ 673 int pr_ndx; 674 675 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 676 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 677 clear_pii_probe_stats(pii); 678 reset_crtt_all(pii->pii_phyint); 679 } 680 continue; 681 } else if (probe_logint == pii->pii_probe_logint) { 682 /* 683 * If we didn't find any new test addr, go to the 684 * next phyint. 685 */ 686 continue; 687 } 688 689 /* 690 * The phyint is either being assigned a new testaddr 691 * or is being assigned a testaddr for the 1st time. 
692 * Need to initialize the phyint socket 693 */ 694 pii->pii_probe_logint = probe_logint; 695 if (!phyint_inst_sockinit(pii)) { 696 if (debug & D_PHYINT) { 697 logdebug("select_test_ifs: " 698 "phyint_sockinit failed\n"); 699 } 700 phyint_inst_delete(pii); 701 continue; 702 } 703 704 /* 705 * This phyint instance is now enabled for probes; this 706 * impacts our state machine in two ways: 707 * 708 * 1. If we're probe *capable* as well (i.e., we have 709 * probe targets) and the interface is in PI_NOTARGETS, 710 * then transition to PI_RUNNING. 711 * 712 * 2. If we're not probe capable, and the other phyint 713 * instance is also not probe capable, and we were in 714 * PI_RUNNING, then transition to PI_NOTARGETS. 715 * 716 * Also see the state diagram in mpd_probe.c. 717 */ 718 if (PROBE_CAPABLE(pii)) { 719 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 720 phyint_chstate(pii->pii_phyint, PI_RUNNING); 721 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 722 if (pii->pii_phyint->pi_state == PI_RUNNING) 723 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 724 } 725 726 /* 727 * If no targets are currently known for this phyint 728 * we need to call init_router_targets. Since 729 * init_router_targets() initializes the list of targets 730 * for all phyints it is done below the loop. 731 */ 732 if (pii->pii_targets == NULL) 733 target_scan_reqd = _B_TRUE; 734 735 /* 736 * Start the probe timer for this instance. 737 */ 738 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 739 start_timer(pii); 740 pii->pii_basetime_inited = 1; 741 } 742 } 743 744 /* 745 * Scan the interface list for any interfaces that are PI_FAILED or 746 * PI_NOTARGETS but no longer enabled to send probes, and call 747 * phyint_check_for_repair() to see if the link state indicates that 748 * the interface should be repaired. Also see the state diagram in 749 * mpd_probe.c. 
750 */ 751 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 752 if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && 753 (pi->pi_state == PI_FAILED || 754 pi->pi_state == PI_NOTARGETS)) { 755 phyint_check_for_repair(pi); 756 } 757 } 758 759 check_testconfig(); 760 761 /* 762 * Try to populate the target list. init_router_targets populates 763 * the target list from the routing table. If our target list is 764 * still empty, init_host_targets adds host targets based on the 765 * host target list of other phyints in the group. 766 */ 767 if (target_scan_reqd) { 768 init_router_targets(); 769 init_host_targets(); 770 } 771 } 772 773 /* 774 * Check test address configuration, and log notices/errors if appropriate. 775 * Note that this function only logs pre-existing conditions (e.g., that 776 * probe-based failure detection is disabled). 777 */ 778 static void 779 check_testconfig(void) 780 { 781 struct phyint *pi; 782 struct logint *li; 783 char abuf[INET6_ADDRSTRLEN]; 784 int pri; 785 786 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 787 if (pi->pi_flags & IFF_OFFLINE) 788 continue; 789 790 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) { 791 if (pi->pi_taddrmsg_printed || 792 pi->pi_duptaddrmsg_printed) { 793 if (pi->pi_duptaddrmsg_printed) 794 pri = LOG_ERR; 795 else 796 pri = LOG_INFO; 797 logmsg(pri, "Test address now configured on " 798 "interface %s; enabling probe-based " 799 "failure detection on it\n", pi->pi_name); 800 pi->pi_taddrmsg_printed = 0; 801 pi->pi_duptaddrmsg_printed = 0; 802 } 803 continue; 804 } 805 806 li = NULL; 807 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL && 808 pi->pi_v4->pii_probe_logint->li_dupaddr) 809 li = pi->pi_v4->pii_probe_logint; 810 811 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL && 812 pi->pi_v6->pii_probe_logint->li_dupaddr) 813 li = pi->pi_v6->pii_probe_logint; 814 815 if (li != NULL && li->li_dupaddr) { 816 if (pi->pi_duptaddrmsg_printed) 817 continue; 818 
logerr("Test address %s is not unique in group; " 819 "disabling probe-based failure detection on %s\n", 820 pr_addr(li->li_phyint_inst->pii_af, 821 li->li_addr, abuf, sizeof (abuf)), pi->pi_name); 822 pi->pi_duptaddrmsg_printed = 1; 823 continue; 824 } 825 826 if (getcurrentsec() < pi->pi_taddrthresh) 827 continue; 828 829 if (!pi->pi_taddrmsg_printed) { 830 logtrace("No test address configured on interface %s; " 831 "disabling probe-based failure detection on it\n", 832 pi->pi_name); 833 pi->pi_taddrmsg_printed = 1; 834 } 835 } 836 } 837 838 /* 839 * Check phyint group configuration, to detect any inconsistencies, 840 * and log an error message. This is called from runtimeouts every 841 * 20 secs. But the error message is displayed once. If the 842 * consistency is resolved by the admin, a recovery message is displayed 843 * once. 844 */ 845 static void 846 check_config(void) 847 { 848 struct phyint_group *pg; 849 struct phyint *pi; 850 boolean_t v4_in_group; 851 boolean_t v6_in_group; 852 853 /* 854 * All phyints of a group must be homogeneous to ensure that they can 855 * take over for one another. If any phyint in a group has IPv4 856 * plumbed, check that all phyints have IPv4 plumbed. Do a similar 857 * check for IPv6. 858 */ 859 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 860 if (pg == phyint_anongroup) 861 continue; 862 863 v4_in_group = _B_FALSE; 864 v6_in_group = _B_FALSE; 865 /* 866 * 1st pass. Determine if at least 1 phyint in the group 867 * has IPv4 plumbed and if so set v4_in_group to true. 868 * Repeat similarly for IPv6. 869 */ 870 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 871 if (pi->pi_v4 != NULL) 872 v4_in_group = _B_TRUE; 873 if (pi->pi_v6 != NULL) 874 v6_in_group = _B_TRUE; 875 } 876 877 /* 878 * 2nd pass. If v4_in_group is true, check that phyint 879 * has IPv4 plumbed. Repeat similarly for IPv6. Print 880 * out a message the 1st time only. 
881 */ 882 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 883 if (pi->pi_flags & IFF_OFFLINE) 884 continue; 885 886 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 887 if (!pi->pi_cfgmsg_printed) { 888 logerr("IP interface %s in group %s is" 889 " not plumbed for IPv4, affecting" 890 " IPv4 connectivity\n", 891 pi->pi_name, 892 pi->pi_group->pg_name); 893 pi->pi_cfgmsg_printed = 1; 894 } 895 } else if (v6_in_group == _B_TRUE && 896 pi->pi_v6 == NULL) { 897 if (!pi->pi_cfgmsg_printed) { 898 logerr("IP interface %s in group %s is" 899 " not plumbed for IPv6, affecting" 900 " IPv6 connectivity\n", 901 pi->pi_name, 902 pi->pi_group->pg_name); 903 pi->pi_cfgmsg_printed = 1; 904 } 905 } else { 906 /* 907 * The phyint matches the group configuration, 908 * if we have reached this point. If it was 909 * improperly configured earlier, log an 910 * error recovery message 911 */ 912 if (pi->pi_cfgmsg_printed) { 913 logerr("IP interface %s is now" 914 " consistent with group %s " 915 " and connectivity is restored\n", 916 pi->pi_name, pi->pi_group->pg_name); 917 pi->pi_cfgmsg_printed = 0; 918 } 919 } 920 921 } 922 } 923 } 924 925 /* 926 * Timer mechanism using relative time (in milliseconds) from the 927 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 928 * will fire after TIMER_INFINITY milliseconds. 929 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 930 * time values. Hence 2 consecutive timer events cannot be spaced farther 931 * than 0x7fffffff. 
We call this TIMER_INFINITY, and it is the maximum value 932 * that can be passed for the delay parameter of timer_schedule() 933 */ 934 static uint_t timer_next; /* Currently scheduled timeout */ 935 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 936 937 static void 938 timer_init(void) 939 { 940 timer_next = getcurrenttime() + TIMER_INFINITY; 941 /* 942 * The call to run_timeouts() will get the timer started 943 * Since there are no phyints at this point, the timer will 944 * be set for IF_SCAN_INTERVAL ms. 945 */ 946 run_timeouts(); 947 } 948 949 /* 950 * Make sure the next SIGALRM occurs delay milliseconds from the current 951 * time if not earlier. We are interested only in time differences. 952 */ 953 void 954 timer_schedule(uint_t delay) 955 { 956 uint_t now; 957 struct itimerval itimerval; 958 959 if (debug & D_TIMER) 960 logdebug("timer_schedule(%u)\n", delay); 961 962 assert(delay <= TIMER_INFINITY); 963 964 now = getcurrenttime(); 965 if (delay == 0) { 966 /* Minimum allowed delay */ 967 delay = 1; 968 } 969 /* Will this timer occur before the currently scheduled SIGALRM? */ 970 if (timer_active && TIME_GE(now + delay, timer_next)) { 971 if (debug & D_TIMER) { 972 logdebug("timer_schedule(%u) - no action: " 973 "now %u next %u\n", delay, now, timer_next); 974 } 975 return; 976 } 977 timer_next = now + delay; 978 979 itimerval.it_value.tv_sec = delay / 1000; 980 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 981 itimerval.it_interval.tv_sec = 0; 982 itimerval.it_interval.tv_usec = 0; 983 if (debug & D_TIMER) { 984 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 985 delay, itimerval.it_value.tv_sec, 986 itimerval.it_value.tv_usec); 987 } 988 timer_active = _B_TRUE; 989 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 990 logperror("timer_schedule: setitimer"); 991 exit(2); 992 } 993 } 994 995 /* 996 * Timer has fired. Determine when the next timer event will occur by asking 997 * all the timer routines. 
Should not be called from a timer routine. 998 */ 999 static void 1000 run_timeouts(void) 1001 { 1002 uint_t next; 1003 uint_t next_event_time; 1004 struct phyint_instance *pii; 1005 struct phyint_instance *next_pii; 1006 static boolean_t timeout_running; 1007 1008 /* assert that recursive timeouts don't happen. */ 1009 assert(!timeout_running); 1010 1011 timeout_running = _B_TRUE; 1012 1013 if (debug & D_TIMER) 1014 logdebug("run_timeouts()\n"); 1015 1016 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1017 initifs(); 1018 check_config(); 1019 } 1020 1021 next = TIMER_INFINITY; 1022 1023 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1024 next_pii = pii->pii_next; 1025 next_event_time = phyint_inst_timer(pii); 1026 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1027 next = next_event_time; 1028 1029 if (debug & D_TIMER) { 1030 logdebug("run_timeouts(%s %s): next scheduled for" 1031 " this phyint inst %u, next scheduled global" 1032 " %u ms\n", 1033 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1034 next_event_time, next); 1035 } 1036 } 1037 1038 /* 1039 * Make sure initifs() is called at least once every 1040 * IF_SCAN_INTERVAL, to make sure that we are in sync 1041 * with the kernel, in case we have missed any routing 1042 * socket messages. 1043 */ 1044 if (next > IF_SCAN_INTERVAL) 1045 next = IF_SCAN_INTERVAL; 1046 1047 if (debug & D_TIMER) 1048 logdebug("run_timeouts: %u ms\n", next); 1049 1050 timer_schedule(next); 1051 timeout_running = _B_FALSE; 1052 } 1053 1054 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1055 static int eventpipe_write = -1; 1056 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ 1057 1058 /* 1059 * Ensure that signals are processed synchronously with the rest of 1060 * the code by just writing a one character signal number on the pipe. 1061 * The poll loop will pick this up and process the signal event. 
1062 */ 1063 static void 1064 sig_handler(int signo) 1065 { 1066 uchar_t buf = (uchar_t)signo; 1067 1068 /* 1069 * Don't write to pipe if cleanup has already begun. cleanup() 1070 * might have closed the pipe already 1071 */ 1072 if (cleanup_started) 1073 return; 1074 1075 if (eventpipe_write == -1) { 1076 logerr("sig_handler: no pipe found\n"); 1077 return; 1078 } 1079 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1080 logperror("sig_handler: write"); 1081 } 1082 1083 extern struct probes_missed probes_missed; 1084 1085 /* 1086 * Pick up a signal "byte" from the pipe and process it. 1087 */ 1088 static void 1089 in_signal(int fd) 1090 { 1091 uchar_t buf; 1092 uint64_t sent, acked, lost, unacked, unknown; 1093 struct phyint_instance *pii; 1094 int pr_ndx; 1095 1096 switch (read(fd, &buf, sizeof (buf))) { 1097 case -1: 1098 logperror("in_signal: read"); 1099 exit(1); 1100 /* NOTREACHED */ 1101 case 1: 1102 break; 1103 case 0: 1104 logerr("in_signal: read end of file\n"); 1105 exit(1); 1106 /* NOTREACHED */ 1107 default: 1108 logerr("in_signal: read > 1\n"); 1109 exit(1); 1110 } 1111 1112 if (debug & D_TIMER) 1113 logdebug("in_signal() got %d\n", buf); 1114 1115 switch (buf) { 1116 case SIGALRM: 1117 if (debug & D_TIMER) { 1118 uint_t now = getcurrenttime(); 1119 1120 logdebug("in_signal(SIGALRM) delta %u\n", 1121 now - timer_next); 1122 } 1123 timer_active = _B_FALSE; 1124 run_timeouts(); 1125 break; 1126 case SIGUSR1: 1127 logdebug("Printing configuration:\n"); 1128 /* Print out the internal tables */ 1129 phyint_inst_print_all(); 1130 1131 /* 1132 * Print out the accumulated statistics about missed 1133 * probes (happens due to scheduling delay). 1134 */ 1135 logerr("Missed sending total of %d probes spread over" 1136 " %d occurrences\n", probes_missed.pm_nprobes, 1137 probes_missed.pm_ntimes); 1138 1139 /* 1140 * Print out the accumulated statistics about probes 1141 * that were sent. 
1142 */ 1143 for (pii = phyint_instances; pii != NULL; 1144 pii = pii->pii_next) { 1145 unacked = 0; 1146 acked = pii->pii_cum_stats.acked; 1147 lost = pii->pii_cum_stats.lost; 1148 sent = pii->pii_cum_stats.sent; 1149 unknown = pii->pii_cum_stats.unknown; 1150 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1151 switch (pii->pii_probes[pr_ndx].pr_status) { 1152 case PR_ACKED: 1153 acked++; 1154 break; 1155 case PR_LOST: 1156 lost++; 1157 break; 1158 case PR_UNACKED: 1159 unacked++; 1160 break; 1161 } 1162 } 1163 logerr("\nProbe stats on (%s %s)\n" 1164 "Number of probes sent %lld\n" 1165 "Number of probe acks received %lld\n" 1166 "Number of probes/acks lost %lld\n" 1167 "Number of valid unacknowledged probes %lld\n" 1168 "Number of ambiguous probe acks received %lld\n", 1169 AF_STR(pii->pii_af), pii->pii_name, 1170 sent, acked, lost, unacked, unknown); 1171 } 1172 break; 1173 case SIGHUP: 1174 logerr("SIGHUP: restart and reread config file\n"); 1175 cleanup(); 1176 (void) execv(argv0[0], argv0); 1177 _exit(0177); 1178 /* NOTREACHED */ 1179 case SIGINT: 1180 case SIGTERM: 1181 case SIGQUIT: 1182 cleanup(); 1183 exit(0); 1184 /* NOTREACHED */ 1185 default: 1186 logerr("in_signal: unknown signal: %d\n", buf); 1187 } 1188 } 1189 1190 static void 1191 cleanup(void) 1192 { 1193 struct phyint_instance *pii; 1194 struct phyint_instance *next_pii; 1195 1196 /* 1197 * Make sure that we don't write to eventpipe in 1198 * sig_handler() if any signal notably SIGALRM, 1199 * occurs after we close the eventpipe descriptor below 1200 */ 1201 cleanup_started = _B_TRUE; 1202 1203 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1204 next_pii = pii->pii_next; 1205 phyint_inst_delete(pii); 1206 } 1207 1208 (void) close(ifsock_v4); 1209 (void) close(ifsock_v6); 1210 (void) close(rtsock_v4); 1211 (void) close(rtsock_v6); 1212 (void) close(lsock_v4); 1213 (void) close(lsock_v6); 1214 (void) close(0); 1215 (void) close(1); 1216 (void) close(2); 1217 (void) 
close(mibfd); 1218 (void) close(eventpipe_read); 1219 (void) close(eventpipe_write); 1220 } 1221 1222 /* 1223 * Create pipe for signal delivery and set up signal handlers. 1224 */ 1225 static void 1226 setup_eventpipe(void) 1227 { 1228 int fds[2]; 1229 struct sigaction act; 1230 1231 if ((pipe(fds)) < 0) { 1232 logperror("setup_eventpipe: pipe"); 1233 exit(1); 1234 } 1235 eventpipe_read = fds[0]; 1236 eventpipe_write = fds[1]; 1237 if (poll_add(eventpipe_read) == -1) { 1238 exit(1); 1239 } 1240 1241 act.sa_handler = sig_handler; 1242 act.sa_flags = SA_RESTART; 1243 (void) sigaction(SIGALRM, &act, NULL); 1244 1245 (void) sigset(SIGHUP, sig_handler); 1246 (void) sigset(SIGUSR1, sig_handler); 1247 (void) sigset(SIGTERM, sig_handler); 1248 (void) sigset(SIGINT, sig_handler); 1249 (void) sigset(SIGQUIT, sig_handler); 1250 } 1251 1252 /* 1253 * Create a routing socket for receiving RTM_IFINFO messages. 1254 */ 1255 static int 1256 setup_rtsock(int af) 1257 { 1258 int s; 1259 int flags; 1260 int aware = RTAW_UNDER_IPMP; 1261 1262 s = socket(PF_ROUTE, SOCK_RAW, af); 1263 if (s == -1) { 1264 logperror("setup_rtsock: socket PF_ROUTE"); 1265 exit(1); 1266 } 1267 1268 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { 1269 logperror("setup_rtsock: setsockopt RT_AWARE"); 1270 (void) close(s); 1271 exit(1); 1272 } 1273 1274 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1275 logperror("setup_rtsock: fcntl F_GETFL"); 1276 (void) close(s); 1277 exit(1); 1278 } 1279 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1280 logperror("setup_rtsock: fcntl F_SETFL"); 1281 (void) close(s); 1282 exit(1); 1283 } 1284 if (poll_add(s) == -1) { 1285 (void) close(s); 1286 exit(1); 1287 } 1288 return (s); 1289 } 1290 1291 /* 1292 * Process an RTM_IFINFO message received on a routing socket. 1293 * The return value indicates whether a full interface scan is required. 1294 * Link up/down notifications are reflected in the IFF_RUNNING flag. 
* If just the state of the IFF_RUNNING interface flag has changed, a
 * full interface scan isn't required.
 *
 * `ifm' is the received message and `type' is the address family (AF_INET
 * or AF_INET6) of the routing socket it arrived on. Note that the message
 * buffer is modified: the interface name in it is null terminated in place.
 */
static boolean_t
process_rtm_ifinfo(if_msghdr_t *ifm, int type)
{
	struct sockaddr_dl *sdl;
	struct phyint *pi;
	uint64_t old_flags;
	struct phyint_instance *pii;

	assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP);

	/*
	 * The sockaddr_dl structure directly follows the if_msghdr_t
	 * structure, but at the time of writing the size of the
	 * if_msghdr_t structure is different on 32 and 64 bit kernels, due
	 * to the presence of a timeval structure, which contains longs,
	 * in the if_data structure. Since we know where the message ends,
	 * we work backwards to get the start of the sockaddr_dl structure.
	 */
	/*LINTED*/
	sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen -
	    sizeof (struct sockaddr_dl));

	assert(sdl->sdl_family == AF_LINK);

	/*
	 * The interface name is in sdl_data.
	 * RTM_IFINFO messages are only generated for logical interface
	 * zero, so there is no colon and logical interface number to
	 * strip from the name. The name is not null terminated, but
	 * there should be enough space in sdl_data to add the null.
	 */
	if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint name too long\n");
		return (_B_TRUE);
	}
	sdl->sdl_data[sdl->sdl_nlen] = 0;

	pi = phyint_lookup(sdl->sdl_data);
	if (pi == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: phyint lookup failed"
			    " for %s\n", sdl->sdl_data);
		return (_B_TRUE);
	}

	/*
	 * We want to try and avoid doing a full interface scan for
	 * link state notifications from the datalink layer, as indicated
	 * by the state of the IFF_RUNNING flag. If just the
	 * IFF_RUNNING flag has changed state, the link state changes
	 * are processed without a full scan.
	 * If there is both an IPv4 and IPv6 instance associated with
	 * the physical interface, we will get an RTM_IFINFO message
	 * for each instance. If we just maintained a single copy of
	 * the physical interface flags, it would appear that no flags
	 * had changed when the second message is processed, leading us
	 * to believe that the message wasn't generated by a flags change,
	 * and that a full interface scan is required.
	 * To get around this problem, two additional copies of the flags
	 * are kept, one copy for each instance. These are only used in
	 * this routine. At any one time, all three copies of the flags
	 * should be identical except for the IFF_RUNNING flag. The
	 * copy of the flags in the "phyint" structure is always up to
	 * date.
	 */
	pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6;
	if (pii == NULL) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtm_ifinfo: no instance of address "
			    "family %s for %s\n", AF_STR(type), pi->pi_name);
		return (_B_TRUE);
	}

	/* Remember the per-instance flags so the change can be computed */
	old_flags = pii->pii_flags;
	pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags);
	pi->pi_flags = pii->pii_flags;

	if (debug & D_LINKNOTE) {
		logdebug("process_rtm_ifinfo: %s address family: %s, "
		    "old flags: %llx, new flags: %llx\n", pi->pi_name,
		    AF_STR(type), old_flags, pi->pi_flags);
	}

	/*
	 * If IFF_STANDBY has changed, indicate that the interface has changed
	 * types.
	 */
	if ((old_flags ^ pii->pii_flags) & IFF_STANDBY)
		phyint_changed(pi);

	/*
	 * Has just the IFF_RUNNING flag changed state ?
	 * Note that a message carrying no flag change at all also takes
	 * this branch (the difference is 0, not IFF_RUNNING) and so
	 * requests a full scan.
	 */
	if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) {
		struct phyint_instance *pii_other;
		/*
		 * It wasn't just a link state change. Update
		 * the other instance's copy of the flags.
		 */
		pii_other = phyint_inst_other(pii);
		if (pii_other != NULL)
			pii_other->pii_flags = pii->pii_flags;
		return (_B_TRUE);
	}

	return (_B_FALSE);
}

/*
 * Retrieve as many routing socket messages as possible, and try to
 * empty the routing sockets. Initiate full scan of targets or interfaces
 * as needed.
 * We listen on separate IPv4 and IPv6 sockets so that we can accurately
 * detect changes in certain flags (see "process_rtm_ifinfo()" above).
 */
static void
process_rtsock(int rtsock_v4, int rtsock_v6)
{
	int	nbytes;
	int64_t msg[2048 / 8];
	struct rt_msghdr *rtm;
	boolean_t need_if_scan = _B_FALSE;
	boolean_t need_rt_scan = _B_FALSE;
	boolean_t rtm_ifinfo_seen = _B_FALSE;
	int type;

	/*
	 * Read as many messages as possible and try to empty the sockets.
	 * The outer loop runs exactly twice: first for the IPv4 socket,
	 * then for the IPv6 socket.
	 */
	for (type = AF_INET; ; type = AF_INET6) {
		for (;;) {
			nbytes = read((type == AF_INET) ? rtsock_v4 :
			    rtsock_v6, msg, sizeof (msg));
			if (nbytes <= 0) {
				/* No more messages */
				break;
			}
			rtm = (struct rt_msghdr *)msg;
			/*
			 * An unexpected version stops the processing of
			 * this socket for this call.
			 */
			if (rtm->rtm_version != RTM_VERSION) {
				logerr("process_rtsock: version %d "
				    "not understood\n", rtm->rtm_version);
				break;
			}

			if (debug & D_PHYINT) {
				logdebug("process_rtsock: message %d\n",
				    rtm->rtm_type);
			}

			switch (rtm->rtm_type) {
			case RTM_NEWADDR:
			case RTM_DELADDR:
				/*
				 * Some logical interface has changed,
				 * have to scan everything to determine
				 * what actually changed.
				 */
				need_if_scan = _B_TRUE;
				break;

			case RTM_IFINFO:
				rtm_ifinfo_seen = _B_TRUE;
				need_if_scan |= process_rtm_ifinfo(
				    (if_msghdr_t *)rtm, type);
				break;

			case RTM_ADD:
			case RTM_DELETE:
			case RTM_CHANGE:
			case RTM_OLDADD:
			case RTM_OLDDEL:
				need_rt_scan = _B_TRUE;
				break;

			default:
				/* Not interesting */
				break;
			}
		}
		if (type == AF_INET6)
			break;
	}

	/*
	 * A full interface scan takes precedence; link state changes are
	 * processed on their own only when no full scan is needed.
	 */
	if (need_if_scan) {
		if (debug & D_LINKNOTE && rtm_ifinfo_seen)
			logdebug("process_rtsock: synchronizing with kernel\n");
		initifs();
	} else if (rtm_ifinfo_seen) {
		if (debug & D_LINKNOTE)
			logdebug("process_rtsock: "
			    "link up/down notification(s) seen\n");
		process_link_state_changes();
	}

	if (need_rt_scan)
		init_router_targets();
}

/*
 * Look if the phyint instance or one of its logints have been removed from
 * the kernel and take appropriate action.
 * Uses {pii,li}_in_use.
 */
static void
check_if_removed(struct phyint_instance *pii)
{
	struct logint *li;
	struct logint *next_li;

	/* Detect phyints that have been removed from the kernel. */
	if (!pii->pii_in_use) {
		logtrace("%s %s has been removed from kernel\n",
		    AF_STR(pii->pii_af), pii->pii_phyint->pi_name);
		phyint_inst_delete(pii);
	} else {
		/*
		 * Detect logints that have been removed. The successor is
		 * fetched before each logint_delete() call.
		 */
		for (li = pii->pii_logint; li != NULL; li = next_li) {
			next_li = li->li_next;
			if (!li->li_in_use) {
				logint_delete(li);
			}
		}
	}
}

/*
 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various
 * tables defined by mib2.h. Parse the returned data and extract
 * the 'routing' information table. Process the 'routing' table
 * to get the list of known onlink routers, and update our database.
 * These onlink routers will serve as our probe targets.
1526 * Returns false, if any system calls resulted in errors, true otherwise. 1527 */ 1528 static boolean_t 1529 update_router_list(int fd) 1530 { 1531 union { 1532 char ubuf[1024]; 1533 union T_primitives uprim; 1534 } buf; 1535 1536 int flags; 1537 struct strbuf ctlbuf; 1538 struct strbuf databuf; 1539 struct T_optmgmt_req *tor; 1540 struct T_optmgmt_ack *toa; 1541 struct T_error_ack *tea; 1542 struct opthdr *optp; 1543 struct opthdr *req; 1544 int status; 1545 t_scalar_t prim; 1546 1547 tor = (struct T_optmgmt_req *)&buf; 1548 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 1549 tor->OPT_offset = sizeof (struct T_optmgmt_req); 1550 tor->OPT_length = sizeof (struct opthdr); 1551 tor->MGMT_flags = T_CURRENT; 1552 1553 /* 1554 * Note: we use the special level value below so that IP will return 1555 * us information concerning IRE_MARK_TESTHIDDEN routes. 1556 */ 1557 req = (struct opthdr *)&tor[1]; 1558 req->level = EXPER_IP_AND_TESTHIDDEN; 1559 req->name = 0; 1560 req->len = 0; 1561 1562 ctlbuf.buf = (char *)&buf; 1563 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 1564 ctlbuf.maxlen = sizeof (buf); 1565 if (putmsg(fd, &ctlbuf, NULL, 0) == -1) { 1566 logperror("update_router_list: putmsg(ctl)"); 1567 return (_B_FALSE); 1568 } 1569 1570 /* 1571 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 1572 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 1573 * a control and data part. The control part contains a struct 1574 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 1575 * the level, name and length of the data in the data part. The 1576 * data part contains the actual table data. The last message 1577 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 1578 * single option with zero optlen. 1579 */ 1580 1581 for (;;) { 1582 /* 1583 * Go around this loop once for each table. Ignore 1584 * all tables except the routing information table. 
1585 */ 1586 flags = 0; 1587 status = getmsg(fd, &ctlbuf, NULL, &flags); 1588 if (status < 0) { 1589 if (errno == EINTR) 1590 continue; 1591 logperror("update_router_list: getmsg(ctl)"); 1592 return (_B_FALSE); 1593 } 1594 if (ctlbuf.len < sizeof (t_scalar_t)) { 1595 logerr("update_router_list: ctlbuf.len %d\n", 1596 ctlbuf.len); 1597 return (_B_FALSE); 1598 } 1599 1600 prim = buf.uprim.type; 1601 1602 switch (prim) { 1603 1604 case T_ERROR_ACK: 1605 tea = &buf.uprim.error_ack; 1606 if (ctlbuf.len < sizeof (struct T_error_ack)) { 1607 logerr("update_router_list: T_ERROR_ACK" 1608 " ctlbuf.len %d\n", ctlbuf.len); 1609 return (_B_FALSE); 1610 } 1611 logerr("update_router_list: T_ERROR_ACK:" 1612 " TLI_error = 0x%lx, UNIX_error = 0x%lx\n", 1613 tea->TLI_error, tea->UNIX_error); 1614 return (_B_FALSE); 1615 1616 case T_OPTMGMT_ACK: 1617 toa = &buf.uprim.optmgmt_ack; 1618 optp = (struct opthdr *)&toa[1]; 1619 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + 1620 sizeof (struct opthdr))) { 1621 logerr("update_router_list: ctlbuf.len %d\n", 1622 ctlbuf.len); 1623 return (_B_FALSE); 1624 } 1625 if (toa->MGMT_flags != T_SUCCESS) { 1626 logerr("update_router_list: MGMT_flags 0x%lx\n", 1627 toa->MGMT_flags); 1628 return (_B_FALSE); 1629 } 1630 break; 1631 1632 default: 1633 logerr("update_router_list: unknown primitive %ld\n", 1634 prim); 1635 return (_B_FALSE); 1636 } 1637 1638 /* Process the T_OPTMGMT_ACK below */ 1639 assert(prim == T_OPTMGMT_ACK); 1640 1641 switch (status) { 1642 case 0: 1643 /* 1644 * We have reached the end of this T_OPTMGMT_ACK 1645 * message. If this is the last message i.e EOD, 1646 * return, else process the next T_OPTMGMT_ACK msg. 1647 */ 1648 if (optp->len == 0 && optp->name == 0 && 1649 optp->level == 0) { 1650 /* 1651 * This is the EOD message. Return 1652 */ 1653 return (_B_TRUE); 1654 } 1655 continue; 1656 1657 case MORECTL: 1658 case MORECTL | MOREDATA: 1659 /* 1660 * This should not happen. 
We should be able to read 1661 * the control portion in a single getmsg. 1662 */ 1663 logerr("update_router_list: MORECTL\n"); 1664 return (_B_FALSE); 1665 1666 case MOREDATA: 1667 databuf.maxlen = optp->len; 1668 /* malloc of 0 bytes is ok */ 1669 databuf.buf = malloc((size_t)optp->len); 1670 if (databuf.maxlen != 0 && databuf.buf == NULL) { 1671 logperror("update_router_list: malloc"); 1672 return (_B_FALSE); 1673 } 1674 databuf.len = 0; 1675 flags = 0; 1676 for (;;) { 1677 if (getmsg(fd, NULL, &databuf, &flags) >= 0) 1678 break; 1679 if (errno == EINTR) 1680 continue; 1681 1682 logperror("update_router_list: getmsg(data)"); 1683 free(databuf.buf); 1684 return (_B_FALSE); 1685 } 1686 1687 if (optp->level == MIB2_IP && 1688 optp->name == MIB2_IP_ROUTE) { 1689 /* LINTED */ 1690 ire_process_v4((mib2_ipRouteEntry_t *) 1691 databuf.buf, databuf.len); 1692 } else if (optp->level == MIB2_IP6 && 1693 optp->name == MIB2_IP6_ROUTE) { 1694 /* LINTED */ 1695 ire_process_v6((mib2_ipv6RouteEntry_t *) 1696 databuf.buf, databuf.len); 1697 } 1698 free(databuf.buf); 1699 } 1700 } 1701 /* NOTREACHED */ 1702 } 1703 1704 1705 /* 1706 * Convert octet `octp' to a phyint name and store in `ifname' 1707 */ 1708 static void 1709 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) 1710 { 1711 char *cp; 1712 size_t len = MIN(octp->o_length, ifsize - 1); 1713 1714 (void) strncpy(ifname, octp->o_bytes, len); 1715 ifname[len] = '\0'; 1716 1717 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) 1718 *cp = '\0'; 1719 } 1720 1721 /* 1722 * Examine the IPv4 routing table `buf' for possible targets. For each 1723 * possible target, if it's on the same subnet an interface route, pass 1724 * it to router_add_common() for further consideration. 
1725 */ 1726 static void 1727 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1728 { 1729 char ifname[LIFNAMSIZ]; 1730 mib2_ipRouteEntry_t *rp, *rp1, *endp; 1731 struct in_addr nexthop_v4; 1732 struct in6_addr nexthop; 1733 1734 if (len == 0) 1735 return; 1736 assert((len % sizeof (mib2_ipRouteEntry_t)) == 0); 1737 1738 endp = buf + (len / sizeof (mib2_ipRouteEntry_t)); 1739 1740 /* 1741 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1742 * cross-reference them with the interface routes to determine if 1743 * they're possible probe targets. 1744 */ 1745 for (rp = buf; rp < endp; rp++) { 1746 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1747 continue; 1748 1749 /* Get the nexthop address. */ 1750 nexthop_v4.s_addr = rp->ipRouteNextHop; 1751 1752 /* 1753 * Rescan the routing table looking for interface routes that 1754 * are on the same subnet, and try to add them. If they're 1755 * not relevant (e.g., the interface route isn't part of an 1756 * IPMP group, router_add_common() will discard). 1757 */ 1758 for (rp1 = buf; rp1 < endp; rp1++) { 1759 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || 1760 rp1->ipRouteIfIndex.o_length == 0) 1761 continue; 1762 1763 if ((rp1->ipRouteDest & rp1->ipRouteMask) != 1764 (nexthop_v4.s_addr & rp1->ipRouteMask)) 1765 continue; 1766 1767 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); 1768 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1769 router_add_common(AF_INET, ifname, nexthop); 1770 } 1771 } 1772 } 1773 1774 void 1775 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1776 { 1777 struct phyint_instance *pii; 1778 struct phyint *pi; 1779 1780 if (debug & D_TARGET) 1781 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1782 1783 /* 1784 * Retrieve the phyint instance; bail if it's not known to us yet. 1785 */ 1786 pii = phyint_inst_lookup(af, ifname); 1787 if (pii == NULL) 1788 return; 1789 1790 /* 1791 * Don't use our own addresses as targets. 
1792 */ 1793 if (own_address(nexthop)) 1794 return; 1795 1796 /* 1797 * If the phyint is part a named group, then add the address to all 1798 * members of the group; note that this is suboptimal in the IPv4 case 1799 * as it has already been added to all matching interfaces in 1800 * ire_process_v4(). Otherwise, add the address only to the phyint 1801 * itself, since other phyints in the anongroup may not be on the same 1802 * subnet. 1803 */ 1804 pi = pii->pii_phyint; 1805 if (pi->pi_group == phyint_anongroup) { 1806 target_add(pii, nexthop, _B_TRUE); 1807 } else { 1808 pi = pi->pi_group->pg_phyint; 1809 for (; pi != NULL; pi = pi->pi_pgnext) 1810 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1811 } 1812 } 1813 1814 /* 1815 * Examine the IPv6 routing table `buf' for possible link-local targets, and 1816 * pass any contenders to router_add_common() for further consideration. 1817 */ 1818 static void 1819 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1820 { 1821 struct lifreq lifr; 1822 char ifname[LIFNAMSIZ]; 1823 char grname[LIFGRNAMSIZ]; 1824 mib2_ipv6RouteEntry_t *rp, *rp1, *endp; 1825 struct in6_addr nexthop_v6; 1826 1827 if (debug & D_TARGET) 1828 logdebug("ire_process_v6(len %d)\n", len); 1829 1830 if (len == 0) 1831 return; 1832 1833 assert((len % sizeof (mib2_ipv6RouteEntry_t)) == 0); 1834 endp = buf + (len / sizeof (mib2_ipv6RouteEntry_t)); 1835 1836 /* 1837 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1838 * cross-reference them with the interface routes to determine if 1839 * they're possible probe targets. 1840 */ 1841 for (rp = buf; rp < endp; rp++) { 1842 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || 1843 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) 1844 continue; 1845 1846 /* Get the nexthop address. */ 1847 nexthop_v6 = rp->ipv6RouteNextHop; 1848 1849 /* 1850 * The interface name should always exist for link-locals; 1851 * we use it to map this entry to an IPMP group name. 
1852 */ 1853 if (rp->ipv6RouteIfIndex.o_length == 0) 1854 continue; 1855 1856 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); 1857 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || 1858 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { 1859 continue; 1860 } 1861 1862 /* 1863 * Rescan the list of routes for interface routes, and add the 1864 * above target to any interfaces in the same IPMP group. 1865 */ 1866 for (rp1 = buf; rp1 < endp; rp1++) { 1867 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || 1868 rp1->ipv6RouteIfIndex.o_length == 0) { 1869 continue; 1870 } 1871 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); 1872 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); 1873 1874 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && 1875 strcmp(lifr.lifr_groupname, grname) == 0) { 1876 router_add_common(AF_INET6, ifname, nexthop_v6); 1877 } 1878 } 1879 } 1880 } 1881 1882 /* 1883 * Build a list of target routers, by scanning the routing tables. 1884 * It is assumed that interface routes exist, to reach the routers. 1885 */ 1886 static void 1887 init_router_targets(void) 1888 { 1889 struct target *tg; 1890 struct target *next_tg; 1891 struct phyint_instance *pii; 1892 struct phyint *pi; 1893 1894 if (force_mcast) 1895 return; 1896 1897 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1898 pi = pii->pii_phyint; 1899 /* 1900 * Set tg_in_use to false only for router targets. 
1901 */ 1902 if (!pii->pii_targets_are_routers) 1903 continue; 1904 1905 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1906 tg->tg_in_use = 0; 1907 } 1908 1909 if (mibfd < 0) { 1910 mibfd = open("/dev/ip", O_RDWR); 1911 if (mibfd < 0) { 1912 logperror("mibopen: ip open"); 1913 exit(1); 1914 } 1915 } 1916 1917 if (!update_router_list(mibfd)) { 1918 (void) close(mibfd); 1919 mibfd = -1; 1920 } 1921 1922 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1923 pi = pii->pii_phyint; 1924 if (!pii->pii_targets_are_routers) 1925 continue; 1926 1927 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1928 next_tg = tg->tg_next; 1929 /* 1930 * If the group has failed, it's likely the route was 1931 * removed by an application affected by that failure. 1932 * In that case, we keep the target so that we can 1933 * reliably repair, at which point we'll refresh the 1934 * target list again. 1935 */ 1936 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) 1937 target_delete(tg); 1938 } 1939 } 1940 } 1941 1942 /* 1943 * Attempt to assign host targets to any interfaces that do not currently 1944 * have probe targets by sharing targets with other interfaces in the group. 1945 */ 1946 static void 1947 init_host_targets(void) 1948 { 1949 struct phyint_instance *pii; 1950 struct phyint_group *pg; 1951 1952 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1953 pg = pii->pii_phyint->pi_group; 1954 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1955 dup_host_targets(pii); 1956 } 1957 } 1958 1959 /* 1960 * Duplicate host targets from other phyints of the group to 1961 * the phyint instance 'desired_pii'. 
1962 */ 1963 static void 1964 dup_host_targets(struct phyint_instance *desired_pii) 1965 { 1966 int af; 1967 struct phyint *pi; 1968 struct phyint_instance *pii; 1969 struct target *tg; 1970 1971 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 1972 1973 af = desired_pii->pii_af; 1974 1975 /* 1976 * For every phyint in the same group as desired_pii, check if 1977 * it has any host targets. If so add them to desired_pii. 1978 */ 1979 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 1980 pii = PHYINT_INSTANCE(pi, af); 1981 /* 1982 * We know that we don't have targets on this phyint instance 1983 * since we have been called. But we still check for 1984 * pii_targets_are_routers because another phyint instance 1985 * could have router targets, since IFF_NOFAILOVER addresses 1986 * on different phyint instances may belong to different 1987 * subnets. 1988 */ 1989 if ((pii == NULL) || (pii == desired_pii) || 1990 pii->pii_targets_are_routers) 1991 continue; 1992 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1993 target_create(desired_pii, tg->tg_address, _B_FALSE); 1994 } 1995 } 1996 } 1997 1998 static void 1999 usage(char *cmd) 2000 { 2001 (void) fprintf(stderr, "usage: %s\n", cmd); 2002 } 2003 2004 2005 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 2006 2007 /* Get an option from the /etc/default/mpathd file */ 2008 static char * 2009 getdefault(char *name) 2010 { 2011 char namebuf[BUFSIZ]; 2012 char *value = NULL; 2013 2014 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 2015 char *cp; 2016 int flags; 2017 2018 /* 2019 * ignore case 2020 */ 2021 flags = defcntl(DC_GETFLAGS, 0); 2022 TURNOFF(flags, DC_CASE); 2023 (void) defcntl(DC_SETFLAGS, flags); 2024 2025 /* Add "=" to the name */ 2026 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 2027 (void) strncat(namebuf, "=", 2); 2028 2029 if ((cp = defread(namebuf)) != NULL) 2030 value = strdup(cp); 2031 2032 /* close */ 2033 (void) defopen((char *)NULL); 2034 } 2035 return 
(value); 2036 } 2037 2038 2039 /* 2040 * Command line options below 2041 */ 2042 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 2043 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ 2044 static boolean_t adopt = _B_FALSE; 2045 static boolean_t foreground = _B_FALSE; 2046 2047 int 2048 main(int argc, char *argv[]) 2049 { 2050 int i; 2051 int c; 2052 struct phyint *pi; 2053 struct phyint_instance *pii; 2054 char *value; 2055 2056 argv0 = argv; /* Saved for re-exec on SIGHUP */ 2057 srandom(gethostid()); /* Initialize the random number generator */ 2058 2059 /* 2060 * NOTE: The messages output by in.mpathd are not suitable for 2061 * translation, so we do not call textdomain(). 2062 */ 2063 (void) setlocale(LC_ALL, ""); 2064 2065 /* 2066 * Get the user specified value of 'failure detection time' 2067 * from /etc/default/mpathd 2068 */ 2069 value = getdefault("FAILURE_DETECTION_TIME"); 2070 if (value != NULL) { 2071 user_failure_detection_time = 2072 (int)strtol((char *)value, NULL, 0); 2073 2074 if (user_failure_detection_time <= 0) { 2075 user_failure_detection_time = FAILURE_DETECTION_TIME; 2076 logerr("Invalid failure detection time %s, assuming " 2077 "default of %d ms\n", value, 2078 user_failure_detection_time); 2079 2080 } else if (user_failure_detection_time < 2081 MIN_FAILURE_DETECTION_TIME) { 2082 user_failure_detection_time = 2083 MIN_FAILURE_DETECTION_TIME; 2084 logerr("Too small failure detection time of %s, " 2085 "assuming minimum of %d ms\n", value, 2086 user_failure_detection_time); 2087 } 2088 free(value); 2089 } else { 2090 /* User has not specified the parameter, Use default value */ 2091 user_failure_detection_time = FAILURE_DETECTION_TIME; 2092 } 2093 2094 /* 2095 * This gives the frequency at which probes will be sent. 2096 * When fdt ms elapses, we should be able to determine 2097 * whether 5 consecutive probes have failed or not. 
2098 * 1 probe will be sent in every user_probe_interval ms, 2099 * randomly anytime in the (0.5 - 1.0) 2nd half of every 2100 * user_probe_interval. Thus when we send out probe 'n' we 2101 * can be sure that probe 'n - 2' is lost, if we have not 2102 * got the ack. (since the probe interval is > crtt). But 2103 * probe 'n - 1' may be a valid unacked probe, since the 2104 * time between 2 successive probes could be as small as 2105 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 2106 */ 2107 user_probe_interval = user_failure_detection_time / 2108 (NUM_PROBE_FAILS + 2); 2109 2110 /* 2111 * Get the user specified value of failback_enabled from 2112 * /etc/default/mpathd 2113 */ 2114 value = getdefault("FAILBACK"); 2115 if (value != NULL) { 2116 if (strcasecmp(value, "yes") == 0) 2117 failback_enabled = _B_TRUE; 2118 else if (strcasecmp(value, "no") == 0) 2119 failback_enabled = _B_FALSE; 2120 else 2121 logerr("Invalid value for FAILBACK %s\n", value); 2122 free(value); 2123 } else { 2124 failback_enabled = _B_TRUE; 2125 } 2126 2127 /* 2128 * Get the user specified value of track_all_phyints from 2129 * /etc/default/mpathd. The sense is reversed in 2130 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 
2131 */ 2132 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2133 if (value != NULL) { 2134 if (strcasecmp(value, "yes") == 0) 2135 track_all_phyints = _B_FALSE; 2136 else if (strcasecmp(value, "no") == 0) 2137 track_all_phyints = _B_TRUE; 2138 else 2139 logerr("Invalid value for " 2140 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2141 free(value); 2142 } else { 2143 track_all_phyints = _B_FALSE; 2144 } 2145 2146 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2147 switch (c) { 2148 case 'a': 2149 adopt = _B_TRUE; 2150 break; 2151 case 'm': 2152 force_mcast = _B_TRUE; 2153 break; 2154 case 'd': 2155 debug = D_ALL; 2156 foreground = _B_TRUE; 2157 break; 2158 case 'D': 2159 i = (int)strtol(optarg, NULL, 0); 2160 if (i == 0) { 2161 (void) fprintf(stderr, "Bad debug flags: %s\n", 2162 optarg); 2163 exit(1); 2164 } 2165 debug |= i; 2166 foreground = _B_TRUE; 2167 break; 2168 case 'l': 2169 /* 2170 * Turn off link state notification handling. 2171 * Undocumented command line flag, for debugging 2172 * purposes. 2173 */ 2174 handle_link_notifications = _B_FALSE; 2175 break; 2176 default: 2177 usage(argv[0]); 2178 exit(1); 2179 } 2180 } 2181 2182 /* 2183 * The sockets for the loopback command interface should be listening 2184 * before we fork and exit in daemonize(). This way, whoever started us 2185 * can use the loopback interface as soon as they get a zero exit 2186 * status. 2187 */ 2188 lsock_v4 = setup_listener(AF_INET); 2189 lsock_v6 = setup_listener(AF_INET6); 2190 2191 if (lsock_v4 < 0 && lsock_v6 < 0) { 2192 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2193 exit(1); 2194 } 2195 2196 if (!foreground) { 2197 if (!daemonize()) { 2198 logerr("cannot daemonize\n"); 2199 exit(EXIT_FAILURE); 2200 } 2201 initlog(); 2202 } 2203 2204 /* 2205 * Initializations: 2206 * 1. Create ifsock* sockets. These are used for performing SIOC* 2207 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2208 * 2. 
Initialize a pipe for handling/recording signal events. 2209 * 3. Create the routing sockets, used for listening 2210 * to routing / interface changes. 2211 * 4. phyint_init() - Initialize physical interface state 2212 * (in mpd_tables.c). Must be done before creating interfaces, 2213 * which timer_init() does indirectly. 2214 * 5. timer_init() - Initialize timer related stuff 2215 * 6. initifs() - Initialize our database of all known interfaces 2216 * 7. init_router_targets() - Initialize our database of all known 2217 * router targets. 2218 */ 2219 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2220 if (ifsock_v4 < 0) { 2221 logperror("main: IPv4 socket open"); 2222 exit(1); 2223 } 2224 2225 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2226 if (ifsock_v6 < 0) { 2227 logperror("main: IPv6 socket open"); 2228 exit(1); 2229 } 2230 2231 setup_eventpipe(); 2232 2233 rtsock_v4 = setup_rtsock(AF_INET); 2234 rtsock_v6 = setup_rtsock(AF_INET6); 2235 2236 if (phyint_init() == -1) { 2237 logerr("cannot initialize physical interface structures"); 2238 exit(1); 2239 } 2240 2241 timer_init(); 2242 2243 initifs(); 2244 2245 /* 2246 * If we're operating in "adopt" mode and no interfaces need to be 2247 * tracked, shut down (ifconfig(1M) will restart us on demand if 2248 * interfaces are subsequently put into multipathing groups). 2249 */ 2250 if (adopt && phyint_instances == NULL) 2251 exit(0); 2252 2253 /* 2254 * Main body. Keep listening for activity on any of the sockets 2255 * that we are monitoring and take appropriate action as necessary. 2256 * signals are also handled synchronously. 
2257 */ 2258 for (;;) { 2259 if (poll(pollfds, pollfd_num, -1) < 0) { 2260 if (errno == EINTR) 2261 continue; 2262 logperror("main: poll"); 2263 exit(1); 2264 } 2265 for (i = 0; i < pollfd_num; i++) { 2266 if ((pollfds[i].fd == -1) || 2267 !(pollfds[i].revents & POLLIN)) 2268 continue; 2269 if (pollfds[i].fd == eventpipe_read) { 2270 in_signal(eventpipe_read); 2271 break; 2272 } 2273 if (pollfds[i].fd == rtsock_v4 || 2274 pollfds[i].fd == rtsock_v6) { 2275 process_rtsock(rtsock_v4, rtsock_v6); 2276 break; 2277 } 2278 2279 for (pii = phyint_instances; pii != NULL; 2280 pii = pii->pii_next) { 2281 if (pollfds[i].fd == pii->pii_probe_sock) { 2282 if (pii->pii_af == AF_INET) 2283 in_data(pii); 2284 else 2285 in6_data(pii); 2286 break; 2287 } 2288 } 2289 2290 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2291 if (pi->pi_notes != 0 && 2292 pollfds[i].fd == dlpi_fd(pi->pi_dh)) { 2293 (void) dlpi_recv(pi->pi_dh, NULL, NULL, 2294 NULL, NULL, 0, NULL); 2295 break; 2296 } 2297 } 2298 2299 if (pollfds[i].fd == lsock_v4) 2300 loopback_cmd(lsock_v4, AF_INET); 2301 else if (pollfds[i].fd == lsock_v6) 2302 loopback_cmd(lsock_v6, AF_INET6); 2303 } 2304 } 2305 /* NOTREACHED */ 2306 return (EXIT_SUCCESS); 2307 } 2308 2309 static int 2310 setup_listener(int af) 2311 { 2312 int sock; 2313 int on; 2314 int len; 2315 int ret; 2316 struct sockaddr_storage laddr; 2317 struct sockaddr_in *sin; 2318 struct sockaddr_in6 *sin6; 2319 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2320 2321 assert(af == AF_INET || af == AF_INET6); 2322 2323 sock = socket(af, SOCK_STREAM, 0); 2324 if (sock < 0) { 2325 logperror("setup_listener: socket"); 2326 exit(1); 2327 } 2328 2329 on = 1; 2330 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2331 sizeof (on)) < 0) { 2332 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2333 exit(1); 2334 } 2335 2336 bzero(&laddr, sizeof (laddr)); 2337 laddr.ss_family = af; 2338 2339 if (af == AF_INET) { 2340 sin = (struct sockaddr_in *)&laddr; 
2341 sin->sin_port = htons(MPATHD_PORT); 2342 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2343 len = sizeof (struct sockaddr_in); 2344 } else { 2345 sin6 = (struct sockaddr_in6 *)&laddr; 2346 sin6->sin6_port = htons(MPATHD_PORT); 2347 sin6->sin6_addr = loopback_addr; 2348 len = sizeof (struct sockaddr_in6); 2349 } 2350 2351 ret = bind(sock, (struct sockaddr *)&laddr, len); 2352 if (ret < 0) { 2353 if (errno == EADDRINUSE) { 2354 /* 2355 * Another instance of mpathd may be already active. 2356 */ 2357 logerr("main: is another instance of in.mpathd " 2358 "already active?\n"); 2359 exit(1); 2360 } else { 2361 (void) close(sock); 2362 return (-1); 2363 } 2364 } 2365 if (listen(sock, 30) < 0) { 2366 logperror("main: listen"); 2367 exit(1); 2368 } 2369 if (poll_add(sock) == -1) { 2370 (void) close(sock); 2371 exit(1); 2372 } 2373 2374 return (sock); 2375 } 2376 2377 /* 2378 * Table of commands and their expected size; used by loopback_cmd(). 2379 */ 2380 static struct { 2381 const char *name; 2382 unsigned int size; 2383 } commands[] = { 2384 { "MI_PING", sizeof (uint32_t) }, 2385 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2386 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2387 { "MI_QUERY", sizeof (mi_query_t) } 2388 }; 2389 2390 /* 2391 * Commands received over the loopback interface come here (via libipmp). 
2392 */ 2393 static void 2394 loopback_cmd(int sock, int family) 2395 { 2396 int newfd; 2397 ssize_t len; 2398 boolean_t is_priv = _B_FALSE; 2399 struct sockaddr_storage peer; 2400 struct sockaddr_in *peer_sin; 2401 struct sockaddr_in6 *peer_sin6; 2402 socklen_t peerlen; 2403 union mi_commands mpi; 2404 char abuf[INET6_ADDRSTRLEN]; 2405 uint_t cmd; 2406 int retval; 2407 2408 peerlen = sizeof (peer); 2409 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2410 if (newfd < 0) { 2411 logperror("loopback_cmd: accept"); 2412 return; 2413 } 2414 2415 switch (family) { 2416 case AF_INET: 2417 /* 2418 * Validate the address and port to make sure that 2419 * non privileged processes don't connect and start 2420 * talking to us. 2421 */ 2422 if (peerlen != sizeof (struct sockaddr_in)) { 2423 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2424 (void) close(newfd); 2425 return; 2426 } 2427 peer_sin = (struct sockaddr_in *)&peer; 2428 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; 2429 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2430 abuf, sizeof (abuf)); 2431 2432 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { 2433 logerr("Attempt to connect from addr %s port %d\n", 2434 abuf, ntohs(peer_sin->sin_port)); 2435 (void) close(newfd); 2436 return; 2437 } 2438 break; 2439 2440 case AF_INET6: 2441 if (peerlen != sizeof (struct sockaddr_in6)) { 2442 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2443 (void) close(newfd); 2444 return; 2445 } 2446 /* 2447 * Validate the address and port to make sure that 2448 * non privileged processes don't connect and start 2449 * talking to us. 
2450 */ 2451 peer_sin6 = (struct sockaddr_in6 *)&peer; 2452 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; 2453 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2454 sizeof (abuf)); 2455 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { 2456 logerr("Attempt to connect from addr %s port %d\n", 2457 abuf, ntohs(peer_sin6->sin6_port)); 2458 (void) close(newfd); 2459 return; 2460 } 2461 2462 default: 2463 logdebug("loopback_cmd: family %d\n", family); 2464 (void) close(newfd); 2465 return; 2466 } 2467 2468 /* 2469 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2470 * all supported commands 2471 */ 2472 len = read(newfd, &mpi, sizeof (mpi)); 2473 2474 /* 2475 * In theory, we can receive any sized message for a stream socket, 2476 * but we don't expect that to happen for a small message over a 2477 * loopback connection. 2478 */ 2479 if (len < sizeof (uint32_t)) { 2480 logerr("loopback_cmd: bad command format or read returns " 2481 "partial data %d\n", len); 2482 (void) close(newfd); 2483 return; 2484 } 2485 2486 cmd = mpi.mi_command; 2487 if (cmd >= MI_NCMD) { 2488 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2489 (void) close(newfd); 2490 return; 2491 } 2492 2493 /* 2494 * Only MI_PING and MI_QUERY can come from unprivileged sources. 2495 */ 2496 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { 2497 logerr("Unprivileged request from %s for privileged " 2498 "command %s\n", abuf, commands[cmd].name); 2499 (void) close(newfd); 2500 return; 2501 } 2502 2503 if (len < commands[cmd].size) { 2504 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2505 commands[cmd].name, commands[cmd].size, len); 2506 (void) close(newfd); 2507 return; 2508 } 2509 2510 retval = process_cmd(newfd, &mpi); 2511 if (retval != IPMP_SUCCESS) { 2512 logerr("failed processing %s: %s\n", commands[cmd].name, 2513 ipmp_errmsg(retval)); 2514 } 2515 (void) close(newfd); 2516 } 2517 2518 /* 2519 * Process the commands received via libipmp. 
2520 */ 2521 static unsigned int 2522 process_cmd(int newfd, union mi_commands *mpi) 2523 { 2524 struct phyint *pi; 2525 struct mi_offline *mio; 2526 struct mi_undo_offline *miu; 2527 unsigned int retval; 2528 2529 switch (mpi->mi_command) { 2530 case MI_PING: 2531 return (send_result(newfd, IPMP_SUCCESS, 0)); 2532 2533 case MI_OFFLINE: 2534 mio = &mpi->mi_ocmd; 2535 2536 pi = phyint_lookup(mio->mio_ifname); 2537 if (pi == NULL) 2538 return (send_result(newfd, IPMP_EUNKIF, 0)); 2539 2540 retval = phyint_offline(pi, mio->mio_min_redundancy); 2541 if (retval == IPMP_FAILURE) 2542 return (send_result(newfd, IPMP_FAILURE, errno)); 2543 2544 return (send_result(newfd, retval, 0)); 2545 2546 case MI_UNDO_OFFLINE: 2547 miu = &mpi->mi_ucmd; 2548 2549 pi = phyint_lookup(miu->miu_ifname); 2550 if (pi == NULL) 2551 return (send_result(newfd, IPMP_EUNKIF, 0)); 2552 2553 retval = phyint_undo_offline(pi); 2554 if (retval == IPMP_FAILURE) 2555 return (send_result(newfd, IPMP_FAILURE, errno)); 2556 2557 return (send_result(newfd, retval, 0)); 2558 2559 case MI_QUERY: 2560 return (process_query(newfd, &mpi->mi_qcmd)); 2561 2562 default: 2563 break; 2564 } 2565 2566 return (send_result(newfd, IPMP_EPROTO, 0)); 2567 } 2568 2569 /* 2570 * Process the query request pointed to by `miq' and send a reply on file 2571 * descriptor `fd'. Returns an IPMP error code. 
2572 */ 2573 static unsigned int 2574 process_query(int fd, mi_query_t *miq) 2575 { 2576 ipmp_addrinfo_t *adinfop; 2577 ipmp_addrinfolist_t *adlp; 2578 ipmp_groupinfo_t *grinfop; 2579 ipmp_groupinfolist_t *grlp; 2580 ipmp_grouplist_t *grlistp; 2581 ipmp_ifinfo_t *ifinfop; 2582 ipmp_ifinfolist_t *iflp; 2583 ipmp_snap_t *snap; 2584 unsigned int retval; 2585 2586 switch (miq->miq_inforeq) { 2587 case IPMP_ADDRINFO: 2588 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, 2589 &adinfop); 2590 if (retval != IPMP_SUCCESS) 2591 return (send_result(fd, retval, errno)); 2592 2593 retval = send_result(fd, IPMP_SUCCESS, 0); 2594 if (retval == IPMP_SUCCESS) 2595 retval = send_addrinfo(fd, adinfop); 2596 2597 ipmp_freeaddrinfo(adinfop); 2598 return (retval); 2599 2600 case IPMP_GROUPLIST: 2601 retval = getgrouplist(&grlistp); 2602 if (retval != IPMP_SUCCESS) 2603 return (send_result(fd, retval, errno)); 2604 2605 retval = send_result(fd, IPMP_SUCCESS, 0); 2606 if (retval == IPMP_SUCCESS) 2607 retval = send_grouplist(fd, grlistp); 2608 2609 ipmp_freegrouplist(grlistp); 2610 return (retval); 2611 2612 case IPMP_GROUPINFO: 2613 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2614 retval = getgroupinfo(miq->miq_grname, &grinfop); 2615 if (retval != IPMP_SUCCESS) 2616 return (send_result(fd, retval, errno)); 2617 2618 retval = send_result(fd, IPMP_SUCCESS, 0); 2619 if (retval == IPMP_SUCCESS) 2620 retval = send_groupinfo(fd, grinfop); 2621 2622 ipmp_freegroupinfo(grinfop); 2623 return (retval); 2624 2625 case IPMP_IFINFO: 2626 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2627 retval = getifinfo(miq->miq_ifname, &ifinfop); 2628 if (retval != IPMP_SUCCESS) 2629 return (send_result(fd, retval, errno)); 2630 2631 retval = send_result(fd, IPMP_SUCCESS, 0); 2632 if (retval == IPMP_SUCCESS) 2633 retval = send_ifinfo(fd, ifinfop); 2634 2635 ipmp_freeifinfo(ifinfop); 2636 return (retval); 2637 2638 case IPMP_SNAP: 2639 /* 2640 * Before taking the snapshot, sync with the kernel. 
2641 */ 2642 initifs(); 2643 2644 retval = getsnap(&snap); 2645 if (retval != IPMP_SUCCESS) 2646 return (send_result(fd, retval, errno)); 2647 2648 retval = send_result(fd, IPMP_SUCCESS, 0); 2649 if (retval != IPMP_SUCCESS) 2650 goto out; 2651 2652 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2653 if (retval != IPMP_SUCCESS) 2654 goto out; 2655 2656 retval = send_grouplist(fd, snap->sn_grlistp); 2657 if (retval != IPMP_SUCCESS) 2658 goto out; 2659 2660 iflp = snap->sn_ifinfolistp; 2661 for (; iflp != NULL; iflp = iflp->ifl_next) { 2662 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2663 if (retval != IPMP_SUCCESS) 2664 goto out; 2665 } 2666 2667 grlp = snap->sn_grinfolistp; 2668 for (; grlp != NULL; grlp = grlp->grl_next) { 2669 retval = send_groupinfo(fd, grlp->grl_grinfop); 2670 if (retval != IPMP_SUCCESS) 2671 goto out; 2672 } 2673 2674 adlp = snap->sn_adinfolistp; 2675 for (; adlp != NULL; adlp = adlp->adl_next) { 2676 retval = send_addrinfo(fd, adlp->adl_adinfop); 2677 if (retval != IPMP_SUCCESS) 2678 goto out; 2679 } 2680 out: 2681 ipmp_snap_free(snap); 2682 return (retval); 2683 2684 default: 2685 break; 2686 2687 } 2688 return (send_result(fd, IPMP_EPROTO, 0)); 2689 } 2690 2691 /* 2692 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2693 * Returns an IPMP error code. 
2694 */ 2695 static unsigned int 2696 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2697 { 2698 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2699 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; 2700 unsigned int retval; 2701 2702 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2703 if (retval != IPMP_SUCCESS) 2704 return (retval); 2705 2706 retval = ipmp_writetlv(fd, IPMP_IFLIST, 2707 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); 2708 if (retval != IPMP_SUCCESS) 2709 return (retval); 2710 2711 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2712 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); 2713 } 2714 2715 /* 2716 * Send the interface information pointed to by `ifinfop' on file descriptor 2717 * `fd'. Returns an IPMP error code. 2718 */ 2719 static unsigned int 2720 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2721 { 2722 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; 2723 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; 2724 unsigned int retval; 2725 2726 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); 2727 if (retval != IPMP_SUCCESS) 2728 return (retval); 2729 2730 retval = ipmp_writetlv(fd, IPMP_ADDRLIST, 2731 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); 2732 if (retval != IPMP_SUCCESS) 2733 return (retval); 2734 2735 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2736 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); 2737 } 2738 2739 /* 2740 * Send the address information pointed to by `adinfop' on file descriptor 2741 * `fd'. Returns an IPMP error code. 2742 */ 2743 static unsigned int 2744 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) 2745 { 2746 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); 2747 } 2748 2749 /* 2750 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2751 * Returns an IPMP error code. 
2752 */ 2753 static unsigned int 2754 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2755 { 2756 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2757 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2758 } 2759 2760 /* 2761 * Initialize an mi_result_t structure using `error' and `syserror' and 2762 * send it on file descriptor `fd'. Returns an IPMP error code. 2763 */ 2764 static unsigned int 2765 send_result(int fd, unsigned int error, int syserror) 2766 { 2767 mi_result_t me; 2768 2769 me.me_mpathd_error = error; 2770 if (error == IPMP_FAILURE) 2771 me.me_sys_error = syserror; 2772 else 2773 me.me_sys_error = 0; 2774 2775 return (ipmp_write(fd, &me, sizeof (me))); 2776 } 2777 2778 /* 2779 * Daemonize the process. 2780 */ 2781 static boolean_t 2782 daemonize(void) 2783 { 2784 switch (fork()) { 2785 case -1: 2786 return (_B_FALSE); 2787 2788 case 0: 2789 /* 2790 * Lose our controlling terminal, and become both a session 2791 * leader and a process group leader. 2792 */ 2793 if (setsid() == -1) 2794 return (_B_FALSE); 2795 2796 /* 2797 * Under POSIX, a session leader can accidentally (through 2798 * open(2)) acquire a controlling terminal if it does not 2799 * have one. Just to be safe, fork() again so we are not a 2800 * session leader. 2801 */ 2802 switch (fork()) { 2803 case -1: 2804 return (_B_FALSE); 2805 2806 case 0: 2807 (void) chdir("/"); 2808 (void) umask(022); 2809 (void) fdwalk(closefunc, NULL); 2810 break; 2811 2812 default: 2813 _exit(EXIT_SUCCESS); 2814 } 2815 break; 2816 2817 default: 2818 _exit(EXIT_SUCCESS); 2819 } 2820 2821 return (_B_TRUE); 2822 } 2823 2824 /* 2825 * The parent has created some fds before forking on purpose, keep them open. 2826 */ 2827 static int 2828 closefunc(void *not_used, int fd) 2829 /* ARGSUSED */ 2830 { 2831 if (fd != lsock_v4 && fd != lsock_v6) 2832 (void) close(fd); 2833 return (0); 2834 } 2835 2836 /* LOGGER */ 2837 2838 #include <syslog.h> 2839 2840 /* 2841 * Logging routines. 
All routines log to syslog, unless the daemon is 2842 * running in the foreground, in which case the logging goes to stderr. 2843 * 2844 * The following routines are available: 2845 * 2846 * logdebug(): A printf-like function for outputting debug messages 2847 * (messages at LOG_DEBUG) that are only of use to developers. 2848 * 2849 * logtrace(): A printf-like function for outputting tracing messages 2850 * (messages at LOG_INFO) from the daemon. This is typically used 2851 * to log the receipt of interesting network-related conditions. 2852 * 2853 * logerr(): A printf-like function for outputting error messages 2854 * (messages at LOG_ERR) from the daemon. 2855 * 2856 * logperror*(): A set of functions used to output error messages 2857 * (messages at LOG_ERR); these automatically append strerror(errno) 2858 * and a newline to the message passed to them. 2859 * 2860 * NOTE: since the logging functions write to syslog, the messages passed 2861 * to them are not eligible for localization. Thus, gettext() must 2862 * *not* be used. 2863 */ 2864 2865 static int logging = 0; 2866 2867 static void 2868 initlog(void) 2869 { 2870 logging++; 2871 openlog("in.mpathd", LOG_PID, LOG_DAEMON); 2872 } 2873 2874 /* PRINTFLIKE2 */ 2875 void 2876 logmsg(int pri, const char *fmt, ...) 
2877 { 2878 va_list ap; 2879 2880 va_start(ap, fmt); 2881 2882 if (logging) 2883 vsyslog(pri, fmt, ap); 2884 else 2885 (void) vfprintf(stderr, fmt, ap); 2886 va_end(ap); 2887 } 2888 2889 /* PRINTFLIKE1 */ 2890 void 2891 logperror(const char *str) 2892 { 2893 if (logging) 2894 syslog(LOG_ERR, "%s: %m\n", str); 2895 else 2896 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 2897 } 2898 2899 void 2900 logperror_pii(struct phyint_instance *pii, const char *str) 2901 { 2902 if (logging) { 2903 syslog(LOG_ERR, "%s (%s %s): %m\n", 2904 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 2905 } else { 2906 (void) fprintf(stderr, "%s (%s %s): %s\n", 2907 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 2908 strerror(errno)); 2909 } 2910 } 2911 2912 void 2913 logperror_li(struct logint *li, const char *str) 2914 { 2915 struct phyint_instance *pii = li->li_phyint_inst; 2916 2917 if (logging) { 2918 syslog(LOG_ERR, "%s (%s %s): %m\n", 2919 str, AF_STR(pii->pii_af), li->li_name); 2920 } else { 2921 (void) fprintf(stderr, "%s (%s %s): %s\n", 2922 str, AF_STR(pii->pii_af), li->li_name, 2923 strerror(errno)); 2924 } 2925 } 2926 2927 void 2928 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 2929 { 2930 if (polled) 2931 (void) poll_remove(pii->pii_probe_sock); 2932 (void) close(pii->pii_probe_sock); 2933 pii->pii_probe_sock = -1; 2934 pii->pii_basetime_inited = 0; 2935 } 2936 2937 boolean_t 2938 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, 2939 struct sockaddr_storage *ssp) 2940 { 2941 addrlist_t *addrp; 2942 2943 if ((addrp = malloc(sizeof (addrlist_t))) == NULL) 2944 return (_B_FALSE); 2945 2946 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); 2947 addrp->al_flags = flags; 2948 addrp->al_addr = *ssp; 2949 addrp->al_next = *addrsp; 2950 *addrsp = addrp; 2951 return (_B_TRUE); 2952 } 2953 2954 void 2955 addrlist_free(addrlist_t **addrsp) 2956 { 2957 addrlist_t *addrp, *next_addrp; 2958 2959 for (addrp = *addrsp; addrp != NULL; 
addrp = next_addrp) { 2960 next_addrp = addrp->al_next; 2961 free(addrp); 2962 } 2963 *addrsp = NULL; 2964 } 2965