/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * Copyright 2021 Tintri by DDN, Inc. All rights reserved.
 */

#include "mpd_defs.h"
#include "mpd_tables.h"

int debug = 0;				/* Debug flag */
/*
 * Descriptor table maintained by poll_add() / poll_remove().  Unused
 * entries have fd == -1.
 */
static int pollfd_num = 0;		/* Num. of poll descriptors */
static struct pollfd *pollfds = NULL;	/* Array of poll descriptors */
					/* All times below in ms */
int user_failure_detection_time;	/* user specified failure detection */
					/* time (fdt) */
int user_probe_interval;		/* derived from user specified fdt */

/*
 * Structure to store mib2 information returned by the kernel.
 * This is used to process routing table information.
 */
typedef struct mib_item_s {
	struct mib_item_s *mi_next;	/* link to another mib_item_s */
	struct opthdr mi_opthdr;	/* option header kept with the item */
	void *mi_valp;			/* untyped pointer to the item data */
} mib_item_t;

static int rtsock_v4;			/* AF_INET routing socket */
static int rtsock_v6;			/* AF_INET6 routing socket */
int ifsock_v4 = -1;			/* IPv4 socket for ioctls */
int ifsock_v6 = -1;			/* IPv6 socket for ioctls */
static int lsock_v4;			/* Listen socket to detect mpathd */
static int lsock_v6;			/* Listen socket to detect mpathd */
static int mibfd = -1;			/* fd to get mib info */
static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */

static uint_t last_initifs_time;	/* Time when initifs was last run */
static char **argv0;			/* Saved for re-exec on SIGHUP */
/*
 * NOTE(review): presumably controls whether link up/down notifications
 * are acted upon; it is not referenced in the visible part of this file --
 * confirm against its users.
 */
boolean_t handle_link_notifications = _B_TRUE;
static int ipRouteEntrySize;		/* Size of IPv4 route entry */
static int ipv6RouteEntrySize;		/* Size of IPv6 route entry */

/*
 * Forward declarations of the functions that are private to this file.
 */
static void initlog(void);
static void run_timeouts(void);
static void initifs(void);
static void check_if_removed(struct phyint_instance *pii);
static void select_test_ifs(void);
static void update_router_list(mib_item_t *item);
static void mib_get_constants(mib_item_t *item);
static int mibwalk(void (*proc)(mib_item_t *));
static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len);
static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len);
static void router_add_common(int af, char *ifname,
    struct in6_addr nexthop);
static void init_router_targets();
static void cleanup(void);
static int setup_listener(int af);
static void check_config(void);
static void check_testconfig(void);
static void check_addr_unique(struct phyint_instance *,
    struct sockaddr_storage *);
static void init_host_targets(void);
static void dup_host_targets(struct phyint_instance *desired_pii);
static void loopback_cmd(int sock, int family);
static boolean_t daemonize(void);
static int closefunc(void *, int);
static
unsigned int process_cmd(int newfd, union mi_commands *mpi); 89 static unsigned int process_query(int fd, mi_query_t *miq); 90 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); 91 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 92 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 93 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 94 static unsigned int send_result(int fd, unsigned int error, int syserror); 95 96 addrlist_t *localaddrs; 97 98 /* 99 * Return the current time in milliseconds (from an arbitrary reference) 100 * truncated to fit into an int. Truncation is ok since we are interested 101 * only in differences and not the absolute values. 102 */ 103 uint_t 104 getcurrenttime(void) 105 { 106 uint_t cur_time; /* In ms */ 107 108 /* 109 * Use of a non-user-adjustable source of time is 110 * required. However millisecond precision is sufficient. 111 * divide by 10^6 112 */ 113 cur_time = (uint_t)(gethrtime() / 1000000LL); 114 return (cur_time); 115 } 116 117 uint64_t 118 getcurrentsec(void) 119 { 120 return (gethrtime() / NANOSEC); 121 } 122 123 /* 124 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 
125 */ 126 int 127 poll_add(int fd) 128 { 129 int i; 130 int new_num; 131 struct pollfd *newfds; 132 retry: 133 /* Check if already present */ 134 for (i = 0; i < pollfd_num; i++) { 135 if (pollfds[i].fd == fd) 136 return (0); 137 } 138 /* Check for empty spot already present */ 139 for (i = 0; i < pollfd_num; i++) { 140 if (pollfds[i].fd == -1) { 141 pollfds[i].fd = fd; 142 return (0); 143 } 144 } 145 146 /* Allocate space for 32 more fds and initialize to -1 */ 147 new_num = pollfd_num + 32; 148 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 149 if (newfds == NULL) { 150 logperror("poll_add: realloc"); 151 return (-1); 152 } 153 for (i = pollfd_num; i < new_num; i++) { 154 newfds[i].fd = -1; 155 newfds[i].events = POLLIN; 156 } 157 pollfd_num = new_num; 158 pollfds = newfds; 159 goto retry; 160 } 161 162 /* 163 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 164 */ 165 int 166 poll_remove(int fd) 167 { 168 int i; 169 170 /* Check if already present */ 171 for (i = 0; i < pollfd_num; i++) { 172 if (pollfds[i].fd == fd) { 173 pollfds[i].fd = -1; 174 return (0); 175 } 176 } 177 return (-1); 178 } 179 180 /* 181 * Extract information about the phyint instance. If the phyint instance still 182 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 183 * will use it to detect phyint instances that don't exist any longer and 184 * remove them, from our database of phyint instances. 
* Return value:
 *	returns true if the phyint instance exists in the kernel,
 *	returns false otherwise
 *
 * Parameters:
 *	af	address family of the instance (passed to the lookup and
 *		init routines and printed via AF_STR())
 *	name	phyint name used for the lookup
 *	pii_p	out parameter; always written.  Receives the instance that
 *		was found or (re)created, which may be NULL.
 */
static boolean_t
pii_process(int af, char *name, struct phyint_instance **pii_p)
{
	int err;
	struct phyint_instance *pii;
	struct phyint_instance *pii_other;

	if (debug & D_PHYINT)
		logdebug("pii_process(%s %s)\n", AF_STR(af), name);

	pii = phyint_inst_lookup(af, name);
	if (pii == NULL) {
		/*
		 * Phyint instance does not exist in our tables,
		 * create new phyint instance
		 */
		pii = phyint_inst_init_from_k(af, name);
	} else {
		/* Phyint exists in our tables */
		err = phyint_inst_update_from_k(pii);

		switch (err) {
		case PI_IOCTL_ERROR:
			/* Some ioctl error. don't change anything */
			pii->pii_in_use = 1;
			break;

		case PI_GROUP_CHANGED:
		case PI_IFINDEX_CHANGED:
			/*
			 * Interface index or group membership has changed.
			 * Delete the old state and recreate based on the new
			 * state (it may no longer be in a group).
			 * Both this instance and its counterpart returned by
			 * phyint_inst_other() are deleted; only this one is
			 * recreated here, and the result is checked for NULL
			 * below.
			 */
			pii_other = phyint_inst_other(pii);
			if (pii_other != NULL)
				phyint_inst_delete(pii_other);
			phyint_inst_delete(pii);
			pii = phyint_inst_init_from_k(af, name);
			break;

		case PI_DELETED:
			/* Phyint instance has disappeared from kernel */
			pii->pii_in_use = 0;
			break;

		case PI_OK:
			/* Phyint instance exists and is fine */
			pii->pii_in_use = 1;
			break;

		default:
			/*
			 * Unknown status.  pii_in_use is left as the caller
			 * set it.
			 */
			logerr("pii_process: Unknown status %d\n", err);
			break;
		}
	}

	/* Report the (possibly new, possibly NULL) instance to the caller. */
	*pii_p = pii;
	if (pii != NULL)
		return (pii->pii_in_use ? _B_TRUE : _B_FALSE);
	else
		return (_B_FALSE);
}

/*
 * Scan all interfaces to detect changes as well as new and deleted interfaces
 *
 * The scan proceeds in this order:
 *   1. record the scan time and discard the local address list;
 *   2. clear the in-use marks on every phyint instance, logint and group;
 *   3. fetch the kernel's address list (SIOCGLIFNUM + SIOCGLIFCONF),
 *	retrying if the list may have been truncated;
 *   4. for each address, refresh the local address list and either the
 *	owning IPMP group or the owning phyint instance and logint;
 *   5. delete whatever was not marked in-use again, refresh group state,
 *	reselect test addresses and process link state changes.
 *
 * NOTE(review): the early returns on ioctl/calloc failure happen after
 * step 2, so the in-use marks stay cleared and step 5 is skipped for that
 * scan.
 */
static void
initifs()
{
	int i, nlifr;
	int af;
	char *cp;
	char *buf;
	int sockfd;
	uint64_t flags;
	struct lifnum lifn;
	struct lifconf lifc;
	struct lifreq lifreq;
	struct lifreq *lifr;
	struct logint *li;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	struct phyint_group *pg, *next_pg;
	char pi_name[LIFNAMSIZ + 1];

	if (debug & D_PHYINT)
		logdebug("initifs: Scanning interfaces\n");

	/* run_timeouts() compares against this to decide when to rescan. */
	last_initifs_time = getcurrenttime();

	/*
	 * Free the existing local address list; we'll build a new list below.
	 */
	addrlist_free(&localaddrs);

	/*
	 * Mark the interfaces so that we can find phyints and logints
	 * which have disappeared from the kernel. pii_process() and
	 * logint_init_from_k() will set {pii,li}_in_use when they find
	 * the interface in the kernel. Also, clear dupaddr bit on probe
	 * logint. check_addr_unique() will set the dupaddr bit on the
	 * probe logint, if the testaddress is not unique.
	 */
	for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) {
		pii->pii_in_use = 0;
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			li->li_in_use = 0;
			if (pii->pii_probe_logint == li)
				li->li_dupaddr = 0;
		}
	}

	/*
	 * As above, mark groups so that we can detect IPMP interfaces which
	 * have been removed from the kernel. Also, delete the group address
	 * list since we'll iteratively recreate it below.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		pg->pg_in_use = _B_FALSE;
		addrlist_free(&pg->pg_addrs);
	}

	lifn.lifn_family = AF_UNSPEC;
	lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
again:
	if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) {
		logperror("initifs: ioctl (get interface count)");
		return;
	}
	/*
	 * Pad the interface count to detect when additional interfaces have
	 * been configured between SIOCGLIFNUM and SIOCGLIFCONF.
	 */
	lifn.lifn_count += 4;

	if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) {
		logperror("initifs: calloc");
		return;
	}

	/* Same family and flags as the count request above. */
	lifc.lifc_family = AF_UNSPEC;
	lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP;
	lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq);
	lifc.lifc_buf = buf;

	if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) {
		logperror("initifs: ioctl (get interface configuration)");
		free(buf);
		return;
	}

	/*
	 * If every lifr_req slot is taken, then additional interfaces must
	 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF.
	 * Recalculate to make sure we didn't miss any interfaces.
	 */
	nlifr = lifc.lifc_len / sizeof (struct lifreq);
	if (nlifr >= lifn.lifn_count) {
		free(buf);
		goto again;
	}

	/*
	 * Walk through the lifreqs returned by SIOCGLIFCONF, and refresh the
	 * global list of addresses, phyint groups, phyints, and logints.
	 */
	for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) {
		af = lifr->lifr_addr.ss_family;
		/* Anything that is not AF_INET is queried on the v6 socket. */
		sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6;
		(void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ);

		/* ENXIO is skipped silently; other errors are logged. */
		if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) {
			if (errno != ENXIO)
				logperror("initifs: ioctl (SIOCGLIFFLAGS)");
			continue;
		}
		flags = lifreq.lifr_flags;

		/*
		 * If the address is IFF_UP, add it to the local address list.
		 * (We ignore addresses that aren't IFF_UP since another node
		 * might legitimately have that address IFF_UP.)
		 */
		if (flags & IFF_UP) {
			(void) addrlist_add(&localaddrs, lifr->lifr_name, flags,
			    &lifr->lifr_addr);
		}

		/*
		 * If this address is on an IPMP meta-interface, update our
		 * phyint_group information (either by recording that group
		 * still exists or creating a new group), and track what
		 * group the address is part of.
		 */
		if (flags & IFF_IPMP) {
			if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) {
				if (errno != ENXIO)
					logperror("initifs: ioctl "
					    "(SIOCGLIFGROUPNAME)");
				continue;
			}

			pg = phyint_group_lookup(lifreq.lifr_groupname);
			if (pg == NULL) {
				pg = phyint_group_create(lifreq.lifr_groupname);
				if (pg == NULL) {
					logerr("initifs: cannot create group "
					    "%s\n", lifreq.lifr_groupname);
					continue;
				}
				phyint_group_insert(pg);
			}
			pg->pg_in_use = _B_TRUE;

			/*
			 * Add this to the group's list of data addresses.
			 */
			if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags,
			    &lifr->lifr_addr)) {
				logerr("initifs: insufficient memory to track "
				    "data address information for %s\n",
				    lifr->lifr_name);
			}
			/* IPMP addresses never reach the phyint code below. */
			continue;
		}

		/*
		 * This isn't an address on an IPMP meta-interface, so it's
		 * either on an underlying interface or not related to any
		 * group. Update our phyint and logint information (via
		 * pii_process() and logint_init_from_k()) -- but first,
		 * convert the logint name to a phyint name so we can call
		 * pii_process().
		 */
		(void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name));
		if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL)
			*cp = '\0';

		if (pii_process(af, pi_name, &pii)) {
			/* The phyint is fine. So process the logint */
			logint_init_from_k(pii, lifr->lifr_name);
			check_addr_unique(pii, &lifr->lifr_addr);
		}
	}
	free(buf);

	/*
	 * Scan for groups, phyints and logints that have disappeared from the
	 * kernel, and delete them.
	 * The successor is saved before each call because the current element
	 * may be deleted.
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		check_if_removed(pii);
	}

	for (pg = phyint_groups; pg != NULL; pg = next_pg) {
		next_pg = pg->pg_next;
		if (!pg->pg_in_use) {
			phyint_group_delete(pg);
			continue;
		}
		/*
		 * Refresh the group's state.  This is necessary since the
		 * group's state is defined by the set of usable interfaces in
		 * the group, and an interface is considered unusable if all
		 * of its addresses are down.  When an address goes down/up,
		 * the RTM_DELADDR/RTM_NEWADDR brings us through here.
		 */
		phyint_group_refresh_state(pg);
	}

	/*
	 * Select a test address for sending probes on each phyint instance
	 */
	select_test_ifs();

	/*
	 * Handle link up/down notifications.
	 */
	process_link_state_changes();
}

/*
 * Check that a given test address is unique across all of the interfaces in a
 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding
 * to such an (IFF_NOFAILOVER) address can produce unexpected results.)
 * Any issues will be reported by check_testconfig().
478 */ 479 static void 480 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 481 { 482 struct phyint *pi; 483 struct phyint_group *pg; 484 struct in6_addr addr; 485 struct phyint_instance *pii; 486 struct sockaddr_in *sin; 487 488 if (ss->ss_family == AF_INET) { 489 sin = (struct sockaddr_in *)ss; 490 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 491 } else { 492 assert(ss->ss_family == AF_INET6); 493 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 494 } 495 496 /* 497 * For anonymous groups, every interface is assumed to be on its own 498 * link, so there is no chance of overlapping addresses. 499 */ 500 pg = ourpii->pii_phyint->pi_group; 501 if (pg == phyint_anongroup) 502 return; 503 504 /* 505 * Walk the list of phyint instances in the group and check for test 506 * addresses matching ours. Of course, we skip ourself. 507 */ 508 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 509 pii = PHYINT_INSTANCE(pi, ss->ss_family); 510 if (pii == NULL || pii == ourpii || 511 pii->pii_probe_logint == NULL) 512 continue; 513 514 /* 515 * If this test address is not unique, set the dupaddr bit. 516 */ 517 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 518 pii->pii_probe_logint->li_dupaddr = 1; 519 } 520 } 521 522 /* 523 * Stop probing an interface. Called when an interface is offlined. 524 * The probe socket is closed on each interface instance, and the 525 * interface state set to PI_OFFLINE. 
526 */ 527 void 528 stop_probing(struct phyint *pi) 529 { 530 struct phyint_instance *pii; 531 532 pii = pi->pi_v4; 533 if (pii != NULL) { 534 if (pii->pii_probe_sock != -1) 535 close_probe_socket(pii, _B_TRUE); 536 pii->pii_probe_logint = NULL; 537 } 538 539 pii = pi->pi_v6; 540 if (pii != NULL) { 541 if (pii->pii_probe_sock != -1) 542 close_probe_socket(pii, _B_TRUE); 543 pii->pii_probe_logint = NULL; 544 } 545 546 phyint_chstate(pi, PI_OFFLINE); 547 } 548 549 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 550 551 /* 552 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 553 * IFF_UP must also be set so that the associated address can be used as a 554 * source address. Further, we must be able to exchange packets with local 555 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 556 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 557 */ 558 static int 559 rate_testflags(uint64_t flags) 560 { 561 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 562 return (BAD_TESTFLAGS); 563 564 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 565 return (BAD_TESTFLAGS); 566 567 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 568 return (BEST_TESTFLAGS); 569 570 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 571 return (BEST_TESTFLAGS); 572 573 return (OK_TESTFLAGS); 574 } 575 576 /* 577 * Attempt to select a test address for each phyint instance. 578 * Call phyint_inst_sockinit() to complete the initializations. 
*/
static void
select_test_ifs(void)
{
	struct phyint *pi;
	struct phyint_instance *pii;
	struct phyint_instance *next_pii;
	struct logint *li;
	struct logint *probe_logint;
	boolean_t target_scan_reqd = _B_FALSE;
	int rating;

	if (debug & D_PHYINT)
		logdebug("select_test_ifs\n");

	/*
	 * For each phyint instance, do the test address selection
	 * (next_pii is saved first because the instance may be deleted
	 * further down if its socket cannot be initialized).
	 */
	for (pii = phyint_instances; pii != NULL; pii = next_pii) {
		next_pii = pii->pii_next;
		probe_logint = NULL;

		/*
		 * An interface that is offline should not be probed.
		 * IFF_OFFLINE interfaces should always be PI_OFFLINE
		 * unless some other entity has set the offline flag.
		 */
		if (pii->pii_phyint->pi_flags & IFF_OFFLINE) {
			if (pii->pii_phyint->pi_state != PI_OFFLINE) {
				logerr("shouldn't be probing offline"
				    " interface %s (state is: %u)."
				    " Stopping probes.\n",
				    pii->pii_phyint->pi_name,
				    pii->pii_phyint->pi_state);
				stop_probing(pii->pii_phyint);
			}
			continue;
		} else {
			/*
			 * If something cleared IFF_OFFLINE (e.g., by accident
			 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is
			 * inherently racy), the phyint may still be offline.
			 * Just ignore it.
			 */
			if (pii->pii_phyint->pi_state == PI_OFFLINE)
				continue;
		}

		li = pii->pii_probe_logint;
		if (li != NULL) {
			/*
			 * We've already got a test address; only proceed
			 * if it's suboptimal.
			 */
			if (rate_testflags(li->li_flags) == BEST_TESTFLAGS)
				continue;
		}

		/*
		 * Walk the logints of this phyint instance, and select
		 * the best available test address
		 */
		for (li = pii->pii_logint; li != NULL; li = li->li_next) {
			/*
			 * Skip 0.0.0.0 addresses, as those are never
			 * actually usable.
			 */
			if (pii->pii_af == AF_INET &&
			    IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr))
				continue;

			/*
			 * Skip any IPv6 logints that are not link-local,
			 * since we should always have a link-local address
			 * anyway and in6_data() expects link-local replies.
			 */
			if (pii->pii_af == AF_INET6 &&
			    !IN6_IS_ADDR_LINKLOCAL(&li->li_addr))
				continue;

			/*
			 * Rate the testflags. If we've found an optimal
			 * match, then break out; otherwise, record the most
			 * recent OK one.
			 */
			rating = rate_testflags(li->li_flags);
			if (rating == BAD_TESTFLAGS)
				continue;

			probe_logint = li;
			if (rating == BEST_TESTFLAGS)
				break;
		}

		/*
		 * If the probe logint has changed, ditch the old one.
		 */
		if (pii->pii_probe_logint != NULL &&
		    pii->pii_probe_logint != probe_logint) {
			if (pii->pii_probe_sock != -1)
				close_probe_socket(pii, _B_TRUE);
			pii->pii_probe_logint = NULL;
		}

		if (probe_logint == NULL) {
			/*
			 * We don't have a test address; zero out the probe
			 * stats array since it is no longer relevant.
			 * Optimize by checking if it is already zeroed out.
			 */
			int pr_ndx;

			pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next);
			if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) {
				clear_pii_probe_stats(pii);
				reset_crtt_all(pii->pii_phyint);
			}
			continue;
		} else if (probe_logint == pii->pii_probe_logint) {
			/*
			 * If we didn't find any new test addr, go to the
			 * next phyint.
			 */
			continue;
		}

		/*
		 * The phyint is either being assigned a new testaddr
		 * or is being assigned a testaddr for the 1st time.
		 * Need to initialize the phyint socket
		 * If that fails the whole instance is deleted.
		 */
		pii->pii_probe_logint = probe_logint;
		if (!phyint_inst_sockinit(pii)) {
			if (debug & D_PHYINT) {
				logdebug("select_test_ifs: "
				    "phyint_sockinit failed\n");
			}
			phyint_inst_delete(pii);
			continue;
		}

		/*
		 * This phyint instance is now enabled for probes; this
		 * impacts our state machine in two ways:
		 *
		 * 1. If we're probe *capable* as well (i.e., we have
		 *    probe targets) and the interface is in PI_NOTARGETS,
		 *    then transition to PI_RUNNING.
		 *
		 * 2. If we're not probe capable, and the other phyint
		 *    instance is also not probe capable, and we were in
		 *    PI_RUNNING, then transition to PI_NOTARGETS.
		 *
		 * Also see the state diagram in mpd_probe.c.
		 *
		 * NOTE(review): phyint_inst_other() is passed straight to
		 * PROBE_CAPABLE(), and pii_process() treats its result as
		 * possibly NULL; presumably the macro tolerates NULL --
		 * confirm against its definition.
		 */
		if (PROBE_CAPABLE(pii)) {
			if (pii->pii_phyint->pi_state == PI_NOTARGETS)
				phyint_chstate(pii->pii_phyint, PI_RUNNING);
		} else if (!PROBE_CAPABLE(phyint_inst_other(pii))) {
			if (pii->pii_phyint->pi_state == PI_RUNNING)
				phyint_chstate(pii->pii_phyint, PI_NOTARGETS);
		}

		/*
		 * If no targets are currently known for this phyint
		 * we need to call init_router_targets. Since
		 * init_router_targets() initializes the list of targets
		 * for all phyints it is done below the loop.
		 */
		if (pii->pii_targets == NULL)
			target_scan_reqd = _B_TRUE;

		/*
		 * Start the probe timer for this instance.
		 */
		if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) {
			start_timer(pii);
			pii->pii_basetime_inited = 1;
		}
	}

	/*
	 * Scan the interface list for any interfaces that are PI_FAILED or
	 * PI_NOTARGETS but no longer enabled to send probes, and call
	 * phyint_check_for_repair() to see if the link state indicates that
	 * the interface should be repaired.  Also see the state diagram in
	 * mpd_probe.c.
	 */
	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) &&
		    (pi->pi_state == PI_FAILED ||
		    pi->pi_state == PI_NOTARGETS)) {
			phyint_check_for_repair(pi);
		}
	}

	check_testconfig();

	/*
	 * Try to populate the target list. init_router_targets populates
	 * the target list from the routing table. If our target list is
	 * still empty, init_host_targets adds host targets based on the
	 * host target list of other phyints in the group.
	 */
	if (target_scan_reqd) {
		init_router_targets();
		init_host_targets();
	}
}

/*
 * Check test address configuration, and log notices/errors if appropriate.
 * Note that this function only logs pre-existing conditions (e.g., that
 * probe-based failure detection is disabled).
 *
 * The per-phyint flags pi_taddrmsg_printed and pi_duptaddrmsg_printed
 * ensure that each condition is logged once when it arises and once more
 * when it clears.  Phyints with IFF_OFFLINE set are skipped entirely.
 */
static void
check_testconfig(void)
{
	struct phyint *pi;
	struct logint *li;
	char abuf[INET6_ADDRSTRLEN];
	int pri;

	for (pi = phyints; pi != NULL; pi = pi->pi_next) {
		if (pi->pi_flags & IFF_OFFLINE)
			continue;

		/*
		 * Probing is enabled on at least one instance.  If either
		 * problem had been reported earlier, log the recovery (at
		 * LOG_ERR if the earlier report was a duplicate address,
		 * else LOG_INFO) and reset both flags.
		 */
		if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) {
			if (pi->pi_taddrmsg_printed ||
			    pi->pi_duptaddrmsg_printed) {
				if (pi->pi_duptaddrmsg_printed)
					pri = LOG_ERR;
				else
					pri = LOG_INFO;
				logmsg(pri, "Test address now configured on "
				    "interface %s; enabling probe-based "
				    "failure detection on it\n", pi->pi_name);
				pi->pi_taddrmsg_printed = 0;
				pi->pi_duptaddrmsg_printed = 0;
			}
			continue;
		}

		/*
		 * Probing is not enabled.  Look for a probe logint flagged
		 * as a duplicate; if both families have one, the IPv6 one
		 * is the one reported.
		 */
		li = NULL;
		if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL &&
		    pi->pi_v4->pii_probe_logint->li_dupaddr)
			li = pi->pi_v4->pii_probe_logint;

		if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL &&
		    pi->pi_v6->pii_probe_logint->li_dupaddr)
			li = pi->pi_v6->pii_probe_logint;

		if (li != NULL && li->li_dupaddr) {
			if (pi->pi_duptaddrmsg_printed)
				continue;
			logerr("Test address %s is not unique in group; "
			    "disabling probe-based failure detection on %s\n",
			    pr_addr(li->li_phyint_inst->pii_af,
			    li->li_addr, abuf, sizeof (abuf)), pi->pi_name);
			pi->pi_duptaddrmsg_printed = 1;
			continue;
		}

		/*
		 * No test address at all.  Stay quiet until the current
		 * time (in seconds) reaches the phyint's pi_taddrthresh.
		 */
		if (getcurrentsec() < pi->pi_taddrthresh)
			continue;

		if (!pi->pi_taddrmsg_printed) {
			logtrace("No test address configured on interface %s; "
			    "disabling probe-based failure detection on it\n",
			    pi->pi_name);
			pi->pi_taddrmsg_printed = 1;
		}
	}
}

/*
 * Check phyint group configuration, to detect any inconsistencies,
 * and log an error message. This is called from run_timeouts() each time
 * it performs the periodic initifs() rescan (every 20 secs according to
 * the original comment; the period itself is IF_SCAN_INTERVAL). But the
 * error message is displayed once. If the inconsistency is resolved by the
 * admin, a recovery message is displayed once.
 */
static void
check_config(void)
{
	struct phyint_group *pg;
	struct phyint *pi;
	boolean_t v4_in_group;
	boolean_t v6_in_group;

	/*
	 * All phyints of a group must be homogeneous to ensure that they can
	 * take over for one another.  If any phyint in a group has IPv4
	 * plumbed, check that all phyints have IPv4 plumbed.  Do a similar
	 * check for IPv6.
	 */
	for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) {
		if (pg == phyint_anongroup)
			continue;

		v4_in_group = _B_FALSE;
		v6_in_group = _B_FALSE;
		/*
		 * 1st pass. Determine if at least 1 phyint in the group
		 * has IPv4 plumbed and if so set v4_in_group to true.
		 * Repeat similarly for IPv6.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_v4 != NULL)
				v4_in_group = _B_TRUE;
			if (pi->pi_v6 != NULL)
				v6_in_group = _B_TRUE;
		}

		/*
		 * 2nd pass. If v4_in_group is true, check that phyint
		 * has IPv4 plumbed. Repeat similarly for IPv6. Print
		 * out a message the 1st time only.
		 * Offline phyints are not checked.  A single flag,
		 * pi_cfgmsg_printed, covers both the IPv4 and the IPv6
		 * message.
		 */
		for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) {
			if (pi->pi_flags & IFF_OFFLINE)
				continue;

			if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("IP interface %s in group %s is"
					    " not plumbed for IPv4, affecting"
					    " IPv4 connectivity\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else if (v6_in_group == _B_TRUE &&
			    pi->pi_v6 == NULL) {
				if (!pi->pi_cfgmsg_printed) {
					logerr("IP interface %s in group %s is"
					    " not plumbed for IPv6, affecting"
					    " IPv6 connectivity\n",
					    pi->pi_name,
					    pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 1;
				}
			} else {
				/*
				 * The phyint matches the group configuration,
				 * if we have reached this point. If it was
				 * improperly configured earlier, log an
				 * error recovery message
				 */
				if (pi->pi_cfgmsg_printed) {
					logerr("IP interface %s is now"
					    " consistent with group %s "
					    " and connectivity is restored\n",
					    pi->pi_name, pi->pi_group->pg_name);
					pi->pi_cfgmsg_printed = 0;
				}
			}

		}
	}
}

/*
 * Timer mechanism using relative time (in milliseconds) from the
 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds
 * will fire after TIMER_INFINITY milliseconds.
 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for
 * time values. Hence 2 consecutive timer events cannot be spaced farther
 * than 0x7fffffff.
We call this TIMER_INFINITY, and it is the maximum value 948 * that can be passed for the delay parameter of timer_schedule() 949 */ 950 static uint_t timer_next; /* Currently scheduled timeout */ 951 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 952 953 static void 954 timer_init(void) 955 { 956 timer_next = getcurrenttime() + TIMER_INFINITY; 957 /* 958 * The call to run_timeouts() will get the timer started 959 * Since there are no phyints at this point, the timer will 960 * be set for IF_SCAN_INTERVAL ms. 961 */ 962 run_timeouts(); 963 } 964 965 /* 966 * Make sure the next SIGALRM occurs delay milliseconds from the current 967 * time if not earlier. We are interested only in time differences. 968 */ 969 void 970 timer_schedule(uint_t delay) 971 { 972 uint_t now; 973 struct itimerval itimerval; 974 975 if (debug & D_TIMER) 976 logdebug("timer_schedule(%u)\n", delay); 977 978 assert(delay <= TIMER_INFINITY); 979 980 now = getcurrenttime(); 981 if (delay == 0) { 982 /* Minimum allowed delay */ 983 delay = 1; 984 } 985 /* Will this timer occur before the currently scheduled SIGALRM? 
*/ 986 if (timer_active && TIME_GE(now + delay, timer_next)) { 987 if (debug & D_TIMER) { 988 logdebug("timer_schedule(%u) - no action: " 989 "now %u next %u\n", delay, now, timer_next); 990 } 991 return; 992 } 993 timer_next = now + delay; 994 995 itimerval.it_value.tv_sec = delay / 1000; 996 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 997 itimerval.it_interval.tv_sec = 0; 998 itimerval.it_interval.tv_usec = 0; 999 if (debug & D_TIMER) { 1000 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 1001 delay, itimerval.it_value.tv_sec, 1002 itimerval.it_value.tv_usec); 1003 } 1004 timer_active = _B_TRUE; 1005 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1006 logperror("timer_schedule: setitimer"); 1007 exit(2); 1008 } 1009 } 1010 1011 static void 1012 timer_cancel(void) 1013 { 1014 struct itimerval itimerval; 1015 1016 if (debug & D_TIMER) 1017 logdebug("timer_cancel()\n"); 1018 1019 bzero(&itimerval, sizeof (itimerval)); 1020 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) 1021 logperror("timer_cancel: setitimer"); 1022 } 1023 1024 /* 1025 * Timer has fired. Determine when the next timer event will occur by asking 1026 * all the timer routines. Should not be called from a timer routine. 1027 */ 1028 static void 1029 run_timeouts(void) 1030 { 1031 uint_t next; 1032 uint_t next_event_time; 1033 struct phyint_instance *pii; 1034 struct phyint_instance *next_pii; 1035 static boolean_t timeout_running; 1036 1037 /* assert that recursive timeouts don't happen. 
*/ 1038 assert(!timeout_running); 1039 1040 timeout_running = _B_TRUE; 1041 1042 if (debug & D_TIMER) 1043 logdebug("run_timeouts()\n"); 1044 1045 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1046 initifs(); 1047 check_config(); 1048 } 1049 1050 next = TIMER_INFINITY; 1051 1052 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1053 next_pii = pii->pii_next; 1054 next_event_time = phyint_inst_timer(pii); 1055 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1056 next = next_event_time; 1057 1058 if (debug & D_TIMER) { 1059 logdebug("run_timeouts(%s %s): next scheduled for" 1060 " this phyint inst %u, next scheduled global" 1061 " %u ms\n", 1062 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1063 next_event_time, next); 1064 } 1065 } 1066 1067 /* 1068 * Make sure initifs() is called at least once every 1069 * IF_SCAN_INTERVAL, to make sure that we are in sync 1070 * with the kernel, in case we have missed any routing 1071 * socket messages. 1072 */ 1073 if (next > IF_SCAN_INTERVAL) 1074 next = IF_SCAN_INTERVAL; 1075 1076 if (debug & D_TIMER) 1077 logdebug("run_timeouts: %u ms\n", next); 1078 1079 timer_schedule(next); 1080 timeout_running = _B_FALSE; 1081 } 1082 1083 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1084 static int eventpipe_write = -1; 1085 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ 1086 1087 /* 1088 * Ensure that signals are processed synchronously with the rest of 1089 * the code by just writing a one character signal number on the pipe. 1090 * The poll loop will pick this up and process the signal event. 1091 */ 1092 static void 1093 sig_handler(int signo) 1094 { 1095 uchar_t buf = (uchar_t)signo; 1096 1097 /* 1098 * Don't write to pipe if cleanup has already begun. 
cleanup() 1099 * might have closed the pipe already 1100 */ 1101 if (cleanup_started) 1102 return; 1103 1104 if (eventpipe_write == -1) { 1105 logerr("sig_handler: no pipe found\n"); 1106 return; 1107 } 1108 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1109 logperror("sig_handler: write"); 1110 } 1111 1112 extern struct probes_missed probes_missed; 1113 1114 /* 1115 * Pick up a signal "byte" from the pipe and process it. 1116 */ 1117 static void 1118 in_signal(int fd) 1119 { 1120 uchar_t buf; 1121 uint64_t sent, acked, lost, unacked, unknown; 1122 struct phyint_instance *pii; 1123 int pr_ndx; 1124 1125 switch (read(fd, &buf, sizeof (buf))) { 1126 case -1: 1127 logperror("in_signal: read"); 1128 exit(1); 1129 /* NOTREACHED */ 1130 case 1: 1131 break; 1132 case 0: 1133 logerr("in_signal: read end of file\n"); 1134 exit(1); 1135 /* NOTREACHED */ 1136 default: 1137 logerr("in_signal: read > 1\n"); 1138 exit(1); 1139 } 1140 1141 if (debug & D_TIMER) 1142 logdebug("in_signal() got %d\n", buf); 1143 1144 switch (buf) { 1145 case SIGALRM: 1146 if (debug & D_TIMER) { 1147 uint_t now = getcurrenttime(); 1148 1149 logdebug("in_signal(SIGALRM) delta %u\n", 1150 now - timer_next); 1151 } 1152 timer_active = _B_FALSE; 1153 run_timeouts(); 1154 break; 1155 case SIGUSR1: 1156 logdebug("Printing configuration:\n"); 1157 /* Print out the internal tables */ 1158 phyint_inst_print_all(); 1159 1160 /* 1161 * Print out the accumulated statistics about missed 1162 * probes (happens due to scheduling delay). 1163 */ 1164 logerr("Missed sending total of %d probes spread over" 1165 " %d occurrences\n", probes_missed.pm_nprobes, 1166 probes_missed.pm_ntimes); 1167 1168 /* 1169 * Print out the accumulated statistics about probes 1170 * that were sent. 
1171 */ 1172 for (pii = phyint_instances; pii != NULL; 1173 pii = pii->pii_next) { 1174 unacked = 0; 1175 acked = pii->pii_cum_stats.acked; 1176 lost = pii->pii_cum_stats.lost; 1177 sent = pii->pii_cum_stats.sent; 1178 unknown = pii->pii_cum_stats.unknown; 1179 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1180 switch (pii->pii_probes[pr_ndx].pr_status) { 1181 case PR_ACKED: 1182 acked++; 1183 break; 1184 case PR_LOST: 1185 lost++; 1186 break; 1187 case PR_UNACKED: 1188 unacked++; 1189 break; 1190 } 1191 } 1192 logerr("\nProbe stats on (%s %s)\n" 1193 "Number of probes sent %lld\n" 1194 "Number of probe acks received %lld\n" 1195 "Number of probes/acks lost %lld\n" 1196 "Number of valid unacknowledged probes %lld\n" 1197 "Number of ambiguous probe acks received %lld\n", 1198 AF_STR(pii->pii_af), pii->pii_name, 1199 sent, acked, lost, unacked, unknown); 1200 } 1201 break; 1202 case SIGHUP: 1203 logerr("SIGHUP: restart and reread config file\n"); 1204 /* 1205 * Cancel the interval timer. Needed since setitimer() uses 1206 * alarm() and the time left is inherited across exec(), and 1207 * thus the SIGALRM may be delivered before a handler has been 1208 * setup, causing in.mpathd to erroneously exit. 
1209 */ 1210 timer_cancel(); 1211 cleanup(); 1212 (void) execv(argv0[0], argv0); 1213 _exit(0177); 1214 /* NOTREACHED */ 1215 case SIGINT: 1216 case SIGTERM: 1217 case SIGQUIT: 1218 cleanup(); 1219 exit(0); 1220 /* NOTREACHED */ 1221 default: 1222 logerr("in_signal: unknown signal: %d\n", buf); 1223 } 1224 } 1225 1226 static void 1227 cleanup(void) 1228 { 1229 struct phyint_instance *pii; 1230 struct phyint_instance *next_pii; 1231 1232 /* 1233 * Make sure that we don't write to eventpipe in 1234 * sig_handler() if any signal notably SIGALRM, 1235 * occurs after we close the eventpipe descriptor below 1236 */ 1237 cleanup_started = _B_TRUE; 1238 1239 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1240 next_pii = pii->pii_next; 1241 phyint_inst_delete(pii); 1242 } 1243 1244 (void) close(ifsock_v4); 1245 (void) close(ifsock_v6); 1246 (void) close(rtsock_v4); 1247 (void) close(rtsock_v6); 1248 (void) close(lsock_v4); 1249 (void) close(lsock_v6); 1250 (void) close(0); 1251 (void) close(1); 1252 (void) close(2); 1253 (void) close(mibfd); 1254 (void) close(eventpipe_read); 1255 (void) close(eventpipe_write); 1256 } 1257 1258 /* 1259 * Create pipe for signal delivery and set up signal handlers. 1260 */ 1261 static void 1262 setup_eventpipe(void) 1263 { 1264 int fds[2]; 1265 struct sigaction act; 1266 1267 if ((pipe(fds)) < 0) { 1268 logperror("setup_eventpipe: pipe"); 1269 exit(1); 1270 } 1271 eventpipe_read = fds[0]; 1272 eventpipe_write = fds[1]; 1273 if (poll_add(eventpipe_read) == -1) { 1274 exit(1); 1275 } 1276 1277 act.sa_handler = sig_handler; 1278 act.sa_flags = SA_RESTART; 1279 (void) sigaction(SIGALRM, &act, NULL); 1280 1281 (void) sigset(SIGHUP, sig_handler); 1282 (void) sigset(SIGUSR1, sig_handler); 1283 (void) sigset(SIGTERM, sig_handler); 1284 (void) sigset(SIGINT, sig_handler); 1285 (void) sigset(SIGQUIT, sig_handler); 1286 } 1287 1288 /* 1289 * Create a routing socket for receiving RTM_IFINFO messages. 
1290 */ 1291 static int 1292 setup_rtsock(int af) 1293 { 1294 int s; 1295 int flags; 1296 int aware = RTAW_UNDER_IPMP; 1297 1298 s = socket(PF_ROUTE, SOCK_RAW, af); 1299 if (s == -1) { 1300 logperror("setup_rtsock: socket PF_ROUTE"); 1301 exit(1); 1302 } 1303 1304 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { 1305 logperror("setup_rtsock: setsockopt RT_AWARE"); 1306 (void) close(s); 1307 exit(1); 1308 } 1309 1310 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1311 logperror("setup_rtsock: fcntl F_GETFL"); 1312 (void) close(s); 1313 exit(1); 1314 } 1315 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1316 logperror("setup_rtsock: fcntl F_SETFL"); 1317 (void) close(s); 1318 exit(1); 1319 } 1320 if (poll_add(s) == -1) { 1321 (void) close(s); 1322 exit(1); 1323 } 1324 return (s); 1325 } 1326 1327 /* 1328 * Process an RTM_IFINFO message received on a routing socket. 1329 * The return value indicates whether a full interface scan is required. 1330 * Link up/down notifications are reflected in the IFF_RUNNING flag. 1331 * If just the state of the IFF_RUNNING interface flag has changed, a 1332 * a full interface scan isn't required. 1333 */ 1334 static boolean_t 1335 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1336 { 1337 struct sockaddr_dl *sdl; 1338 struct phyint *pi; 1339 uint64_t old_flags; 1340 struct phyint_instance *pii; 1341 1342 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1343 1344 /* 1345 * Although the sockaddr_dl structure is directly after the 1346 * if_msghdr_t structure. At the time of writing, the size of the 1347 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1348 * to the presence of a timeval structure, which contains longs, 1349 * in the if_data structure. Anyway, we know where the message ends, 1350 * so we work backwards to get the start of the sockaddr_dl structure. 
1351 */ 1352 /*LINTED*/ 1353 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1354 sizeof (struct sockaddr_dl)); 1355 1356 assert(sdl->sdl_family == AF_LINK); 1357 1358 /* 1359 * The interface name is in sdl_data. 1360 * RTM_IFINFO messages are only generated for logical interface 1361 * zero, so there is no colon and logical interface number to 1362 * strip from the name. The name is not null terminated, but 1363 * there should be enough space in sdl_data to add the null. 1364 */ 1365 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1366 if (debug & D_LINKNOTE) 1367 logdebug("process_rtm_ifinfo: phyint name too long\n"); 1368 return (_B_TRUE); 1369 } 1370 sdl->sdl_data[sdl->sdl_nlen] = 0; 1371 1372 pi = phyint_lookup(sdl->sdl_data); 1373 if (pi == NULL) { 1374 if (debug & D_LINKNOTE) 1375 logdebug("process_rtm_ifinfo: phyint lookup failed" 1376 " for %s\n", sdl->sdl_data); 1377 return (_B_TRUE); 1378 } 1379 1380 /* 1381 * We want to try and avoid doing a full interface scan for 1382 * link state notifications from the datalink layer, as indicated 1383 * by the state of the IFF_RUNNING flag. If just the 1384 * IFF_RUNNING flag has changed state, the link state changes 1385 * are processed without a full scan. 1386 * If there is both an IPv4 and IPv6 instance associated with 1387 * the physical interface, we will get an RTM_IFINFO message 1388 * for each instance. If we just maintained a single copy of 1389 * the physical interface flags, it would appear that no flags 1390 * had changed when the second message is processed, leading us 1391 * to believe that the message wasn't generated by a flags change, 1392 * and that a full interface scan is required. 1393 * To get around this problem, two additional copies of the flags 1394 * are kept, one copy for each instance. These are only used in 1395 * this routine. At any one time, all three copies of the flags 1396 * should be identical except for the IFF_RUNNING flag. 
The 1397 * copy of the flags in the "phyint" structure is always up to 1398 * date. 1399 */ 1400 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1401 if (pii == NULL) { 1402 if (debug & D_LINKNOTE) 1403 logdebug("process_rtm_ifinfo: no instance of address " 1404 "family %s for %s\n", AF_STR(type), pi->pi_name); 1405 return (_B_TRUE); 1406 } 1407 1408 old_flags = pii->pii_flags; 1409 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1410 pi->pi_flags = pii->pii_flags; 1411 1412 if (debug & D_LINKNOTE) { 1413 logdebug("process_rtm_ifinfo: %s address family: %s, " 1414 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1415 AF_STR(type), old_flags, pi->pi_flags); 1416 } 1417 1418 /* 1419 * If IFF_STANDBY has changed, indicate that the interface has changed 1420 * types and refresh IFF_INACTIVE if need be. 1421 */ 1422 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) { 1423 phyint_changed(pi); 1424 if (pii->pii_flags & IFF_STANDBY) 1425 phyint_standby_refresh_inactive(pi); 1426 } 1427 1428 /* Has just the IFF_RUNNING flag changed state ? */ 1429 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1430 struct phyint_instance *pii_other; 1431 /* 1432 * It wasn't just a link state change. Update 1433 * the other instance's copy of the flags. 1434 */ 1435 pii_other = phyint_inst_other(pii); 1436 if (pii_other != NULL) 1437 pii_other->pii_flags = pii->pii_flags; 1438 return (_B_TRUE); 1439 } 1440 1441 return (_B_FALSE); 1442 } 1443 1444 /* 1445 * Retrieve as many routing socket messages as possible, and try to 1446 * empty the routing sockets. Initiate full scan of targets or interfaces 1447 * as needed. 1448 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1449 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 
1450 */ 1451 static void 1452 process_rtsock(int rtsock_v4, int rtsock_v6) 1453 { 1454 int nbytes; 1455 int64_t msg[2048 / 8]; 1456 struct rt_msghdr *rtm; 1457 boolean_t need_if_scan = _B_FALSE; 1458 boolean_t need_rt_scan = _B_FALSE; 1459 boolean_t rtm_ifinfo_seen = _B_FALSE; 1460 int type; 1461 1462 /* Read as many messages as possible and try to empty the sockets */ 1463 for (type = AF_INET; ; type = AF_INET6) { 1464 for (;;) { 1465 nbytes = read((type == AF_INET) ? rtsock_v4 : 1466 rtsock_v6, msg, sizeof (msg)); 1467 if (nbytes <= 0) { 1468 /* No more messages */ 1469 break; 1470 } 1471 rtm = (struct rt_msghdr *)msg; 1472 if (rtm->rtm_version != RTM_VERSION) { 1473 logerr("process_rtsock: version %d " 1474 "not understood\n", rtm->rtm_version); 1475 break; 1476 } 1477 1478 if (debug & D_PHYINT) { 1479 logdebug("process_rtsock: message %d\n", 1480 rtm->rtm_type); 1481 } 1482 1483 switch (rtm->rtm_type) { 1484 case RTM_NEWADDR: 1485 case RTM_DELADDR: 1486 /* 1487 * Some logical interface has changed, 1488 * have to scan everything to determine 1489 * what actually changed. 
1490 */ 1491 need_if_scan = _B_TRUE; 1492 break; 1493 1494 case RTM_IFINFO: 1495 rtm_ifinfo_seen = _B_TRUE; 1496 need_if_scan |= process_rtm_ifinfo( 1497 (if_msghdr_t *)rtm, type); 1498 break; 1499 1500 case RTM_ADD: 1501 case RTM_DELETE: 1502 case RTM_CHANGE: 1503 case RTM_OLDADD: 1504 case RTM_OLDDEL: 1505 need_rt_scan = _B_TRUE; 1506 break; 1507 1508 default: 1509 /* Not interesting */ 1510 break; 1511 } 1512 } 1513 if (type == AF_INET6) 1514 break; 1515 } 1516 1517 if (need_if_scan) { 1518 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1519 logdebug("process_rtsock: synchronizing with kernel\n"); 1520 initifs(); 1521 } else if (rtm_ifinfo_seen) { 1522 if (debug & D_LINKNOTE) 1523 logdebug("process_rtsock: " 1524 "link up/down notification(s) seen\n"); 1525 process_link_state_changes(); 1526 } 1527 1528 if (need_rt_scan) 1529 init_router_targets(); 1530 } 1531 1532 /* 1533 * Look if the phyint instance or one of its logints have been removed from 1534 * the kernel and take appropriate action. 1535 * Uses {pii,li}_in_use. 1536 */ 1537 static void 1538 check_if_removed(struct phyint_instance *pii) 1539 { 1540 struct logint *li; 1541 struct logint *next_li; 1542 1543 /* Detect phyints that have been removed from the kernel. */ 1544 if (!pii->pii_in_use) { 1545 logtrace("%s %s has been removed from kernel\n", 1546 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1547 phyint_inst_delete(pii); 1548 } else { 1549 /* Detect logints that have been removed. */ 1550 for (li = pii->pii_logint; li != NULL; li = next_li) { 1551 next_li = li->li_next; 1552 if (!li->li_in_use) { 1553 logint_delete(li); 1554 } 1555 } 1556 } 1557 } 1558 1559 /* 1560 * Parse the supplied mib2 information to extract the routing information 1561 * table. Process the routing table to get the list of known onlink routers 1562 * and update our database. These onlink routers will serve as probe 1563 * targets. 
1564 */ 1565 static void 1566 update_router_list(mib_item_t *item) 1567 { 1568 for (; item != NULL; item = item->mi_next) { 1569 if (item->mi_opthdr.name == 0) 1570 continue; 1571 if (item->mi_opthdr.level == MIB2_IP && 1572 item->mi_opthdr.name == MIB2_IP_ROUTE) { 1573 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp, 1574 item->mi_opthdr.len); 1575 } else if (item->mi_opthdr.level == MIB2_IP6 && 1576 item->mi_opthdr.name == MIB2_IP6_ROUTE) { 1577 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp, 1578 item->mi_opthdr.len); 1579 } 1580 } 1581 } 1582 1583 1584 /* 1585 * Convert octet `octp' to a phyint name and store in `ifname' 1586 */ 1587 static void 1588 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) 1589 { 1590 char *cp; 1591 size_t len = MIN(octp->o_length, ifsize - 1); 1592 1593 (void) strncpy(ifname, octp->o_bytes, len); 1594 ifname[len] = '\0'; 1595 1596 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) 1597 *cp = '\0'; 1598 } 1599 1600 /* 1601 * Examine the IPv4 routing table `buf' for possible targets. For each 1602 * possible target, if it's on the same subnet an interface route, pass 1603 * it to router_add_common() for further consideration. 1604 */ 1605 static void 1606 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1607 { 1608 char ifname[LIFNAMSIZ]; 1609 mib2_ipRouteEntry_t *rp, *rp1, *endp; 1610 struct in_addr nexthop_v4; 1611 struct in6_addr nexthop; 1612 1613 if (debug & D_TARGET) 1614 logdebug("ire_process_v4(len %d)\n", len); 1615 1616 if (len == 0) 1617 return; 1618 1619 assert((len % ipRouteEntrySize) == 0); 1620 endp = buf + (len / ipRouteEntrySize); 1621 1622 /* 1623 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1624 * cross-reference them with the interface routes to determine if 1625 * they're possible probe targets. 1626 */ 1627 for (rp = buf; rp < endp; rp++) { 1628 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1629 continue; 1630 1631 /* Get the nexthop address. 
*/ 1632 nexthop_v4.s_addr = rp->ipRouteNextHop; 1633 1634 /* 1635 * Rescan the routing table looking for interface routes that 1636 * are on the same subnet, and try to add them. If they're 1637 * not relevant (e.g., the interface route isn't part of an 1638 * IPMP group, router_add_common() will discard). 1639 */ 1640 for (rp1 = buf; rp1 < endp; rp1++) { 1641 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || 1642 rp1->ipRouteIfIndex.o_length == 0) 1643 continue; 1644 1645 if ((rp1->ipRouteDest & rp1->ipRouteMask) != 1646 (nexthop_v4.s_addr & rp1->ipRouteMask)) 1647 continue; 1648 1649 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); 1650 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1651 router_add_common(AF_INET, ifname, nexthop); 1652 } 1653 } 1654 } 1655 1656 void 1657 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1658 { 1659 struct phyint_instance *pii; 1660 struct phyint *pi; 1661 1662 if (debug & D_TARGET) 1663 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1664 1665 /* 1666 * Retrieve the phyint instance; bail if it's not known to us yet. 1667 */ 1668 pii = phyint_inst_lookup(af, ifname); 1669 if (pii == NULL) 1670 return; 1671 1672 /* 1673 * Don't use our own addresses as targets. 1674 */ 1675 if (own_address(nexthop)) 1676 return; 1677 1678 /* 1679 * If the phyint is part a named group, then add the address to all 1680 * members of the group; note that this is suboptimal in the IPv4 case 1681 * as it has already been added to all matching interfaces in 1682 * ire_process_v4(). Otherwise, add the address only to the phyint 1683 * itself, since other phyints in the anongroup may not be on the same 1684 * subnet. 
1685 */ 1686 pi = pii->pii_phyint; 1687 if (pi->pi_group == phyint_anongroup) { 1688 target_add(pii, nexthop, _B_TRUE); 1689 } else { 1690 pi = pi->pi_group->pg_phyint; 1691 for (; pi != NULL; pi = pi->pi_pgnext) 1692 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1693 } 1694 } 1695 1696 /* 1697 * Examine the IPv6 routing table `buf' for possible link-local targets, and 1698 * pass any contenders to router_add_common() for further consideration. 1699 */ 1700 static void 1701 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1702 { 1703 struct lifreq lifr; 1704 char ifname[LIFNAMSIZ]; 1705 char grname[LIFGRNAMSIZ]; 1706 mib2_ipv6RouteEntry_t *rp, *rp1, *endp; 1707 struct in6_addr nexthop_v6; 1708 1709 if (debug & D_TARGET) 1710 logdebug("ire_process_v6(len %d)\n", len); 1711 1712 if (len == 0) 1713 return; 1714 1715 assert((len % ipv6RouteEntrySize) == 0); 1716 endp = buf + (len / ipv6RouteEntrySize); 1717 1718 /* 1719 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1720 * cross-reference them with the interface routes to determine if 1721 * they're possible probe targets. 1722 */ 1723 for (rp = buf; rp < endp; rp++) { 1724 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || 1725 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) 1726 continue; 1727 1728 /* Get the nexthop address. */ 1729 nexthop_v6 = rp->ipv6RouteNextHop; 1730 1731 /* 1732 * The interface name should always exist for link-locals; 1733 * we use it to map this entry to an IPMP group name. 1734 */ 1735 if (rp->ipv6RouteIfIndex.o_length == 0) 1736 continue; 1737 1738 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); 1739 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || 1740 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { 1741 continue; 1742 } 1743 1744 /* 1745 * Rescan the list of routes for interface routes, and add the 1746 * above target to any interfaces in the same IPMP group. 
1747 */ 1748 for (rp1 = buf; rp1 < endp; rp1++) { 1749 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || 1750 rp1->ipv6RouteIfIndex.o_length == 0) { 1751 continue; 1752 } 1753 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); 1754 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); 1755 1756 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && 1757 strcmp(lifr.lifr_groupname, grname) == 0) { 1758 router_add_common(AF_INET6, ifname, nexthop_v6); 1759 } 1760 } 1761 } 1762 } 1763 1764 /* 1765 * Build a list of target routers, by scanning the routing tables. 1766 * It is assumed that interface routes exist, to reach the routers. 1767 */ 1768 static void 1769 init_router_targets(void) 1770 { 1771 struct target *tg; 1772 struct target *next_tg; 1773 struct phyint_instance *pii; 1774 struct phyint *pi; 1775 1776 if (force_mcast) 1777 return; 1778 1779 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1780 pi = pii->pii_phyint; 1781 /* 1782 * Set tg_in_use to false only for router targets. 1783 */ 1784 if (!pii->pii_targets_are_routers) 1785 continue; 1786 1787 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1788 tg->tg_in_use = 0; 1789 } 1790 1791 if (mibwalk(update_router_list) == -1) 1792 exit(1); 1793 1794 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1795 pi = pii->pii_phyint; 1796 if (!pii->pii_targets_are_routers) 1797 continue; 1798 1799 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1800 next_tg = tg->tg_next; 1801 /* 1802 * If the group has failed, it's likely the route was 1803 * removed by an application affected by that failure. 1804 * In that case, we keep the target so that we can 1805 * reliably repair, at which point we'll refresh the 1806 * target list again. 
1807 */ 1808 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) 1809 target_delete(tg); 1810 } 1811 } 1812 } 1813 1814 /* 1815 * Attempt to assign host targets to any interfaces that do not currently 1816 * have probe targets by sharing targets with other interfaces in the group. 1817 */ 1818 static void 1819 init_host_targets(void) 1820 { 1821 struct phyint_instance *pii; 1822 struct phyint_group *pg; 1823 1824 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1825 pg = pii->pii_phyint->pi_group; 1826 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1827 dup_host_targets(pii); 1828 } 1829 } 1830 1831 /* 1832 * Duplicate host targets from other phyints of the group to 1833 * the phyint instance 'desired_pii'. 1834 */ 1835 static void 1836 dup_host_targets(struct phyint_instance *desired_pii) 1837 { 1838 int af; 1839 struct phyint *pi; 1840 struct phyint_instance *pii; 1841 struct target *tg; 1842 1843 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 1844 1845 af = desired_pii->pii_af; 1846 1847 /* 1848 * For every phyint in the same group as desired_pii, check if 1849 * it has any host targets. If so add them to desired_pii. 1850 */ 1851 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 1852 pii = PHYINT_INSTANCE(pi, af); 1853 /* 1854 * We know that we don't have targets on this phyint instance 1855 * since we have been called. But we still check for 1856 * pii_targets_are_routers because another phyint instance 1857 * could have router targets, since IFF_NOFAILOVER addresses 1858 * on different phyint instances may belong to different 1859 * subnets. 
1860 */ 1861 if ((pii == NULL) || (pii == desired_pii) || 1862 pii->pii_targets_are_routers) 1863 continue; 1864 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1865 target_create(desired_pii, tg->tg_address, _B_FALSE); 1866 } 1867 } 1868 } 1869 1870 static void 1871 usage(char *cmd) 1872 { 1873 (void) fprintf(stderr, "usage: %s\n", cmd); 1874 } 1875 1876 1877 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 1878 1879 /* Get an option from the /etc/default/mpathd file */ 1880 static char * 1881 getdefault(char *name) 1882 { 1883 char namebuf[BUFSIZ]; 1884 char *value = NULL; 1885 1886 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 1887 char *cp; 1888 int flags; 1889 1890 /* 1891 * ignore case 1892 */ 1893 flags = defcntl(DC_GETFLAGS, 0); 1894 TURNOFF(flags, DC_CASE); 1895 (void) defcntl(DC_SETFLAGS, flags); 1896 1897 /* Add "=" to the name */ 1898 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 1899 (void) strncat(namebuf, "=", 2); 1900 1901 if ((cp = defread(namebuf)) != NULL) 1902 value = strdup(cp); 1903 1904 /* close */ 1905 (void) defopen((char *)NULL); 1906 } 1907 return (value); 1908 } 1909 1910 1911 /* 1912 * Command line options below 1913 */ 1914 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 1915 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ 1916 static boolean_t adopt = _B_FALSE; 1917 static boolean_t foreground = _B_FALSE; 1918 1919 int 1920 main(int argc, char *argv[]) 1921 { 1922 int i; 1923 int c; 1924 struct phyint *pi; 1925 struct phyint_instance *pii; 1926 char *value; 1927 1928 argv0 = argv; /* Saved for re-exec on SIGHUP */ 1929 srandom(gethostid()); /* Initialize the random number generator */ 1930 1931 /* 1932 * NOTE: The messages output by in.mpathd are not suitable for 1933 * translation, so we do not call textdomain(). 
1934 */ 1935 (void) setlocale(LC_ALL, ""); 1936 1937 /* 1938 * Get the user specified value of 'failure detection time' 1939 * from /etc/default/mpathd 1940 */ 1941 value = getdefault("FAILURE_DETECTION_TIME"); 1942 if (value != NULL) { 1943 user_failure_detection_time = 1944 (int)strtol((char *)value, NULL, 0); 1945 1946 if (user_failure_detection_time <= 0) { 1947 user_failure_detection_time = FAILURE_DETECTION_TIME; 1948 logerr("Invalid failure detection time %s, assuming " 1949 "default of %d ms\n", value, 1950 user_failure_detection_time); 1951 1952 } else if (user_failure_detection_time < 1953 MIN_FAILURE_DETECTION_TIME) { 1954 user_failure_detection_time = 1955 MIN_FAILURE_DETECTION_TIME; 1956 logerr("Too small failure detection time of %s, " 1957 "assuming minimum of %d ms\n", value, 1958 user_failure_detection_time); 1959 } 1960 free(value); 1961 } else { 1962 /* User has not specified the parameter, Use default value */ 1963 user_failure_detection_time = FAILURE_DETECTION_TIME; 1964 } 1965 1966 /* 1967 * This gives the frequency at which probes will be sent. 1968 * When fdt ms elapses, we should be able to determine 1969 * whether 5 consecutive probes have failed or not. 1970 * 1 probe will be sent in every user_probe_interval ms, 1971 * randomly anytime in the (0.5 - 1.0) 2nd half of every 1972 * user_probe_interval. Thus when we send out probe 'n' we 1973 * can be sure that probe 'n - 2' is lost, if we have not 1974 * got the ack. (since the probe interval is > crtt). But 1975 * probe 'n - 1' may be a valid unacked probe, since the 1976 * time between 2 successive probes could be as small as 1977 * 0.5 * user_probe_interval. 
Hence the NUM_PROBE_FAILS + 2 1978 */ 1979 user_probe_interval = user_failure_detection_time / 1980 (NUM_PROBE_FAILS + 2); 1981 1982 /* 1983 * Get the user specified value of failback_enabled from 1984 * /etc/default/mpathd 1985 */ 1986 value = getdefault("FAILBACK"); 1987 if (value != NULL) { 1988 if (strcasecmp(value, "yes") == 0) 1989 failback_enabled = _B_TRUE; 1990 else if (strcasecmp(value, "no") == 0) 1991 failback_enabled = _B_FALSE; 1992 else 1993 logerr("Invalid value for FAILBACK %s\n", value); 1994 free(value); 1995 } else { 1996 failback_enabled = _B_TRUE; 1997 } 1998 1999 /* 2000 * Get the user specified value of track_all_phyints from 2001 * /etc/default/mpathd. The sense is reversed in 2002 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2003 */ 2004 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2005 if (value != NULL) { 2006 if (strcasecmp(value, "yes") == 0) 2007 track_all_phyints = _B_FALSE; 2008 else if (strcasecmp(value, "no") == 0) 2009 track_all_phyints = _B_TRUE; 2010 else 2011 logerr("Invalid value for " 2012 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2013 free(value); 2014 } else { 2015 track_all_phyints = _B_FALSE; 2016 } 2017 2018 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2019 switch (c) { 2020 case 'a': 2021 adopt = _B_TRUE; 2022 break; 2023 case 'm': 2024 force_mcast = _B_TRUE; 2025 break; 2026 case 'd': 2027 debug = D_ALL; 2028 foreground = _B_TRUE; 2029 break; 2030 case 'D': 2031 i = (int)strtol(optarg, NULL, 0); 2032 if (i == 0) { 2033 (void) fprintf(stderr, "Bad debug flags: %s\n", 2034 optarg); 2035 exit(1); 2036 } 2037 debug |= i; 2038 foreground = _B_TRUE; 2039 break; 2040 case 'l': 2041 /* 2042 * Turn off link state notification handling. 2043 * Undocumented command line flag, for debugging 2044 * purposes. 
2045 */ 2046 handle_link_notifications = _B_FALSE; 2047 break; 2048 default: 2049 usage(argv[0]); 2050 exit(1); 2051 } 2052 } 2053 2054 /* 2055 * The sockets for the loopback command interface should be listening 2056 * before we fork and exit in daemonize(). This way, whoever started us 2057 * can use the loopback interface as soon as they get a zero exit 2058 * status. 2059 */ 2060 lsock_v4 = setup_listener(AF_INET); 2061 lsock_v6 = setup_listener(AF_INET6); 2062 2063 if (lsock_v4 < 0 && lsock_v6 < 0) { 2064 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2065 exit(1); 2066 } 2067 2068 if (!foreground) { 2069 if (!daemonize()) { 2070 logerr("cannot daemonize\n"); 2071 exit(EXIT_FAILURE); 2072 } 2073 initlog(); 2074 } 2075 2076 /* 2077 * Initializations: 2078 * 1. Create ifsock* sockets. These are used for performing SIOC* 2079 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2080 * 2. Initialize a pipe for handling/recording signal events. 2081 * 3. Create the routing sockets, used for listening 2082 * to routing / interface changes. 2083 * 4. phyint_init() - Initialize physical interface state 2084 * (in mpd_tables.c). Must be done before creating interfaces, 2085 * which timer_init() does indirectly. 2086 * 5. Query kernel for route entry sizes (v4 and v6). 2087 * 6. timer_init() - Initialize timer related stuff 2088 * 7. initifs() - Initialize our database of all known interfaces 2089 * 8. init_router_targets() - Initialize our database of all known 2090 * router targets. 
2091 */ 2092 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2093 if (ifsock_v4 < 0) { 2094 logperror("main: IPv4 socket open"); 2095 exit(1); 2096 } 2097 2098 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2099 if (ifsock_v6 < 0) { 2100 logperror("main: IPv6 socket open"); 2101 exit(1); 2102 } 2103 2104 setup_eventpipe(); 2105 2106 rtsock_v4 = setup_rtsock(AF_INET); 2107 rtsock_v6 = setup_rtsock(AF_INET6); 2108 2109 if (phyint_init() == -1) { 2110 logerr("cannot initialize physical interface structures"); 2111 exit(1); 2112 } 2113 2114 if (mibwalk(mib_get_constants) == -1) 2115 exit(1); 2116 2117 timer_init(); 2118 2119 initifs(); 2120 2121 /* 2122 * If we're operating in "adopt" mode and no interfaces need to be 2123 * tracked, shut down (ifconfig(8) will restart us on demand if 2124 * interfaces are subsequently put into multipathing groups). 2125 */ 2126 if (adopt && phyint_instances == NULL) 2127 exit(0); 2128 2129 /* 2130 * Main body. Keep listening for activity on any of the sockets 2131 * that we are monitoring and take appropriate action as necessary. 2132 * signals are also handled synchronously. 
*/
	for (;;) {
		/* A timeout of -1 blocks until some descriptor is ready. */
		if (poll(pollfds, pollfd_num, -1) < 0) {
			if (errno == EINTR)
				continue;
			logperror("main: poll");
			exit(1);
		}
		for (i = 0; i < pollfd_num; i++) {
			/* Skip unused slots and slots with nothing to read. */
			if ((pollfds[i].fd == -1) ||
			    !(pollfds[i].revents & POLLIN))
				continue;
			/*
			 * Event pipe and routing socket activity abandon the
			 * rest of this scan and return to poll().
			 */
			if (pollfds[i].fd == eventpipe_read) {
				in_signal(eventpipe_read);
				break;
			}
			if (pollfds[i].fd == rtsock_v4 ||
			    pollfds[i].fd == rtsock_v6) {
				process_rtsock(rtsock_v4, rtsock_v6);
				break;
			}

			/*
			 * Probe socket of some phyint instance: dispatch on
			 * the instance's address family.
			 */
			for (pii = phyint_instances; pii != NULL;
			    pii = pii->pii_next) {
				if (pollfds[i].fd == pii->pii_probe_sock) {
					if (pii->pii_af == AF_INET)
						in_data(pii);
					else
						in6_data(pii);
					break;
				}
			}

			/*
			 * DLPI descriptor of a phyint whose pi_notes is
			 * nonzero: dlpi_recv() is called with no buffers.
			 */
			for (pi = phyints; pi != NULL; pi = pi->pi_next) {
				if (pi->pi_notes != 0 &&
				    pollfds[i].fd == dlpi_fd(pi->pi_dh)) {
					(void) dlpi_recv(pi->pi_dh, NULL, NULL,
					    NULL, NULL, 0, NULL);
					break;
				}
			}

			/* Connection pending on one of the listen sockets. */
			if (pollfds[i].fd == lsock_v4)
				loopback_cmd(lsock_v4, AF_INET);
			else if (pollfds[i].fd == lsock_v6)
				loopback_cmd(lsock_v6, AF_INET6);
		}
	}
	/* NOTREACHED */
	return (EXIT_SUCCESS);
}

/*
 * Create a SOCK_STREAM socket for address family `af' (AF_INET or AF_INET6),
 * bind it to the loopback address at MPATHD_PORT, start listening on it and
 * add it to the poll set via poll_add().
 *
 * Returns the listening socket on success.  Returns -1 (after closing the
 * socket) if bind() fails for any reason other than EADDRINUSE.  Every other
 * failure, including EADDRINUSE, terminates the process with exit(1).
 */
static int
setup_listener(int af)
{
	int sock;
	int on;
	int len;
	int ret;
	struct sockaddr_storage laddr;
	struct sockaddr_in *sin;
	struct sockaddr_in6 *sin6;
	struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;

	assert(af == AF_INET || af == AF_INET6);

	sock = socket(af, SOCK_STREAM, 0);
	if (sock < 0) {
		logperror("setup_listener: socket");
		exit(1);
	}

	on = 1;
	if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on,
	    sizeof (on)) < 0) {
		logperror("setup_listener: setsockopt (SO_REUSEADDR)");
		exit(1);
	}

	/* Build the loopback address for the requested family. */
	bzero(&laddr, sizeof (laddr));
	laddr.ss_family = af;

	if (af == AF_INET) {
		sin = (struct sockaddr_in *)&laddr;
		sin->sin_port = htons(MPATHD_PORT);
		sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
		len = sizeof (struct sockaddr_in);
	} else {
		sin6 = (struct sockaddr_in6 *)&laddr;
		sin6->sin6_port = htons(MPATHD_PORT);
		sin6->sin6_addr = loopback_addr;
		len = sizeof (struct sockaddr_in6);
	}

	ret = bind(sock, (struct sockaddr *)&laddr, len);
	if (ret < 0) {
		if (errno == EADDRINUSE) {
			/*
			 * Another instance of mpathd may be already active.
			 */
			logerr("main: is another instance of in.mpathd "
			    "already active?\n");
			exit(1);
		} else {
			/* Any other bind() failure is left to the caller. */
			(void) close(sock);
			return (-1);
		}
	}
	if (listen(sock, 30) < 0) {
		logperror("main: listen");
		exit(1);
	}
	if (poll_add(sock) == -1) {
		(void) close(sock);
		exit(1);
	}

	return (sock);
}

/*
 * Table of commands and their expected size; used by loopback_cmd().
 * loopback_cmd() indexes this table directly with the received command id,
 * so the entry order must match the numeric values of the MI_* commands.
 */
static struct {
	const char *name;
	unsigned int size;
} commands[] = {
	{ "MI_PING", sizeof (uint32_t) },
	{ "MI_OFFLINE", sizeof (mi_offline_t) },
	{ "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) },
	{ "MI_QUERY", sizeof (mi_query_t) }
};

/*
 * Commands received over the loopback interface come here (via libipmp).
2268 */ 2269 static void 2270 loopback_cmd(int sock, int family) 2271 { 2272 int newfd; 2273 ssize_t len; 2274 boolean_t is_priv = _B_FALSE; 2275 struct sockaddr_storage peer; 2276 struct sockaddr_in *peer_sin; 2277 struct sockaddr_in6 *peer_sin6; 2278 socklen_t peerlen; 2279 union mi_commands mpi; 2280 char abuf[INET6_ADDRSTRLEN]; 2281 uint_t cmd; 2282 int retval; 2283 2284 peerlen = sizeof (peer); 2285 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2286 if (newfd < 0) { 2287 logperror("loopback_cmd: accept"); 2288 return; 2289 } 2290 2291 switch (family) { 2292 case AF_INET: 2293 /* 2294 * Validate the address and port to make sure that 2295 * non privileged processes don't connect and start 2296 * talking to us. 2297 */ 2298 if (peerlen != sizeof (struct sockaddr_in)) { 2299 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2300 (void) close(newfd); 2301 return; 2302 } 2303 peer_sin = (struct sockaddr_in *)&peer; 2304 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; 2305 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2306 abuf, sizeof (abuf)); 2307 2308 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { 2309 logerr("Attempt to connect from addr %s port %d\n", 2310 abuf, ntohs(peer_sin->sin_port)); 2311 (void) close(newfd); 2312 return; 2313 } 2314 break; 2315 2316 case AF_INET6: 2317 if (peerlen != sizeof (struct sockaddr_in6)) { 2318 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2319 (void) close(newfd); 2320 return; 2321 } 2322 /* 2323 * Validate the address and port to make sure that 2324 * non privileged processes don't connect and start 2325 * talking to us. 
2326 */ 2327 peer_sin6 = (struct sockaddr_in6 *)&peer; 2328 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; 2329 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2330 sizeof (abuf)); 2331 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { 2332 logerr("Attempt to connect from addr %s port %d\n", 2333 abuf, ntohs(peer_sin6->sin6_port)); 2334 (void) close(newfd); 2335 return; 2336 } 2337 break; 2338 2339 default: 2340 logdebug("loopback_cmd: family %d\n", family); 2341 (void) close(newfd); 2342 return; 2343 } 2344 2345 /* 2346 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2347 * all supported commands 2348 */ 2349 len = read(newfd, &mpi, sizeof (mpi)); 2350 2351 /* 2352 * In theory, we can receive any sized message for a stream socket, 2353 * but we don't expect that to happen for a small message over a 2354 * loopback connection. 2355 */ 2356 if (len < sizeof (uint32_t)) { 2357 logerr("loopback_cmd: bad command format or read returns " 2358 "partial data %d\n", len); 2359 (void) close(newfd); 2360 return; 2361 } 2362 2363 cmd = mpi.mi_command; 2364 if (cmd >= MI_NCMD) { 2365 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2366 (void) close(newfd); 2367 return; 2368 } 2369 2370 /* 2371 * Only MI_PING and MI_QUERY can come from unprivileged sources. 
2372 */ 2373 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { 2374 logerr("Unprivileged request from %s for privileged " 2375 "command %s\n", abuf, commands[cmd].name); 2376 (void) close(newfd); 2377 return; 2378 } 2379 2380 if (len < commands[cmd].size) { 2381 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2382 commands[cmd].name, commands[cmd].size, len); 2383 (void) close(newfd); 2384 return; 2385 } 2386 2387 retval = process_cmd(newfd, &mpi); 2388 if (retval != IPMP_SUCCESS) { 2389 logerr("failed processing %s: %s\n", commands[cmd].name, 2390 ipmp_errmsg(retval)); 2391 } 2392 (void) close(newfd); 2393 } 2394 2395 /* 2396 * Process the commands received via libipmp. 2397 */ 2398 static unsigned int 2399 process_cmd(int newfd, union mi_commands *mpi) 2400 { 2401 struct phyint *pi; 2402 struct mi_offline *mio; 2403 struct mi_undo_offline *miu; 2404 unsigned int retval; 2405 2406 switch (mpi->mi_command) { 2407 case MI_PING: 2408 return (send_result(newfd, IPMP_SUCCESS, 0)); 2409 2410 case MI_OFFLINE: 2411 mio = &mpi->mi_ocmd; 2412 2413 pi = phyint_lookup(mio->mio_ifname); 2414 if (pi == NULL) 2415 return (send_result(newfd, IPMP_EUNKIF, 0)); 2416 2417 retval = phyint_offline(pi, mio->mio_min_redundancy); 2418 if (retval == IPMP_FAILURE) 2419 return (send_result(newfd, IPMP_FAILURE, errno)); 2420 2421 return (send_result(newfd, retval, 0)); 2422 2423 case MI_UNDO_OFFLINE: 2424 miu = &mpi->mi_ucmd; 2425 2426 pi = phyint_lookup(miu->miu_ifname); 2427 if (pi == NULL) 2428 return (send_result(newfd, IPMP_EUNKIF, 0)); 2429 2430 retval = phyint_undo_offline(pi); 2431 if (retval == IPMP_FAILURE) 2432 return (send_result(newfd, IPMP_FAILURE, errno)); 2433 2434 return (send_result(newfd, retval, 0)); 2435 2436 case MI_QUERY: 2437 return (process_query(newfd, &mpi->mi_qcmd)); 2438 2439 default: 2440 break; 2441 } 2442 2443 return (send_result(newfd, IPMP_EPROTO, 0)); 2444 } 2445 2446 /* 2447 * Process the query request pointed to by `miq' and send a 
reply on file 2448 * descriptor `fd'. Returns an IPMP error code. 2449 */ 2450 static unsigned int 2451 process_query(int fd, mi_query_t *miq) 2452 { 2453 ipmp_addrinfo_t *adinfop; 2454 ipmp_addrinfolist_t *adlp; 2455 ipmp_groupinfo_t *grinfop; 2456 ipmp_groupinfolist_t *grlp; 2457 ipmp_grouplist_t *grlistp; 2458 ipmp_ifinfo_t *ifinfop; 2459 ipmp_ifinfolist_t *iflp; 2460 ipmp_snap_t *snap; 2461 unsigned int retval; 2462 2463 switch (miq->miq_inforeq) { 2464 case IPMP_ADDRINFO: 2465 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, 2466 &adinfop); 2467 if (retval != IPMP_SUCCESS) 2468 return (send_result(fd, retval, errno)); 2469 2470 retval = send_result(fd, IPMP_SUCCESS, 0); 2471 if (retval == IPMP_SUCCESS) 2472 retval = send_addrinfo(fd, adinfop); 2473 2474 ipmp_freeaddrinfo(adinfop); 2475 return (retval); 2476 2477 case IPMP_GROUPLIST: 2478 retval = getgrouplist(&grlistp); 2479 if (retval != IPMP_SUCCESS) 2480 return (send_result(fd, retval, errno)); 2481 2482 retval = send_result(fd, IPMP_SUCCESS, 0); 2483 if (retval == IPMP_SUCCESS) 2484 retval = send_grouplist(fd, grlistp); 2485 2486 ipmp_freegrouplist(grlistp); 2487 return (retval); 2488 2489 case IPMP_GROUPINFO: 2490 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2491 retval = getgroupinfo(miq->miq_grname, &grinfop); 2492 if (retval != IPMP_SUCCESS) 2493 return (send_result(fd, retval, errno)); 2494 2495 retval = send_result(fd, IPMP_SUCCESS, 0); 2496 if (retval == IPMP_SUCCESS) 2497 retval = send_groupinfo(fd, grinfop); 2498 2499 ipmp_freegroupinfo(grinfop); 2500 return (retval); 2501 2502 case IPMP_IFINFO: 2503 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2504 retval = getifinfo(miq->miq_ifname, &ifinfop); 2505 if (retval != IPMP_SUCCESS) 2506 return (send_result(fd, retval, errno)); 2507 2508 retval = send_result(fd, IPMP_SUCCESS, 0); 2509 if (retval == IPMP_SUCCESS) 2510 retval = send_ifinfo(fd, ifinfop); 2511 2512 ipmp_freeifinfo(ifinfop); 2513 return (retval); 2514 2515 case IPMP_SNAP: 2516 /* 2517 * Before 
taking the snapshot, sync with the kernel. 2518 */ 2519 initifs(); 2520 2521 retval = getsnap(&snap); 2522 if (retval != IPMP_SUCCESS) 2523 return (send_result(fd, retval, errno)); 2524 2525 retval = send_result(fd, IPMP_SUCCESS, 0); 2526 if (retval != IPMP_SUCCESS) 2527 goto out; 2528 2529 retval = send_grouplist(fd, snap->sn_grlistp); 2530 if (retval != IPMP_SUCCESS) 2531 goto out; 2532 2533 retval = ipmp_writetlv(fd, IPMP_IFCNT, sizeof (uint32_t), 2534 &snap->sn_nif); 2535 if (retval != IPMP_SUCCESS) 2536 goto out; 2537 2538 iflp = snap->sn_ifinfolistp; 2539 for (; iflp != NULL; iflp = iflp->ifl_next) { 2540 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2541 if (retval != IPMP_SUCCESS) 2542 goto out; 2543 } 2544 2545 retval = ipmp_writetlv(fd, IPMP_GROUPCNT, sizeof (uint32_t), 2546 &snap->sn_ngroup); 2547 if (retval != IPMP_SUCCESS) 2548 goto out; 2549 2550 grlp = snap->sn_grinfolistp; 2551 for (; grlp != NULL; grlp = grlp->grl_next) { 2552 retval = send_groupinfo(fd, grlp->grl_grinfop); 2553 if (retval != IPMP_SUCCESS) 2554 goto out; 2555 } 2556 2557 retval = ipmp_writetlv(fd, IPMP_ADDRCNT, sizeof (uint32_t), 2558 &snap->sn_naddr); 2559 if (retval != IPMP_SUCCESS) 2560 goto out; 2561 2562 adlp = snap->sn_adinfolistp; 2563 for (; adlp != NULL; adlp = adlp->adl_next) { 2564 retval = send_addrinfo(fd, adlp->adl_adinfop); 2565 if (retval != IPMP_SUCCESS) 2566 goto out; 2567 } 2568 out: 2569 ipmp_snap_free(snap); 2570 return (retval); 2571 2572 default: 2573 break; 2574 2575 } 2576 return (send_result(fd, IPMP_EPROTO, 0)); 2577 } 2578 2579 /* 2580 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2581 * Returns an IPMP error code. 
2582 */ 2583 static unsigned int 2584 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2585 { 2586 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2587 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; 2588 ipmp_groupinfo_xfer_t grxfer; 2589 unsigned int retval; 2590 2591 /* 2592 * We can't directly transfer an ipmp_groupinfo_t due to the embedded 2593 * pointers to ipmp_iflist_t and ipmp_addr_list_t. Copy the data over 2594 * to a temporary transfer structure that doesn't have these embedded 2595 * pointers. 2596 */ 2597 memset(&grxfer, 0, sizeof (grxfer)); 2598 2599 grxfer.grx_sig = grinfop->gr_sig; 2600 grxfer.grx_state = grinfop->gr_state; 2601 grxfer.grx_fdt = grinfop->gr_fdt; 2602 2603 memcpy(grxfer.grx_name, grinfop->gr_name, sizeof (grxfer.grx_name)); 2604 memcpy(grxfer.grx_ifname, grinfop->gr_ifname, 2605 sizeof (grxfer.grx_ifname)); 2606 memcpy(grxfer.grx_m4ifname, grinfop->gr_m4ifname, 2607 sizeof (grxfer.grx_m4ifname)); 2608 memcpy(grxfer.grx_m6ifname, grinfop->gr_m6ifname, 2609 sizeof (grxfer.grx_m6ifname)); 2610 memcpy(grxfer.grx_bcifname, grinfop->gr_bcifname, 2611 sizeof (grxfer.grx_bcifname)); 2612 2613 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (grxfer), &grxfer); 2614 if (retval != IPMP_SUCCESS) 2615 return (retval); 2616 2617 retval = ipmp_writetlv(fd, IPMP_IFLIST, 2618 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); 2619 if (retval != IPMP_SUCCESS) 2620 return (retval); 2621 2622 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2623 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); 2624 } 2625 2626 /* 2627 * Send the interface information pointed to by `ifinfop' on file descriptor 2628 * `fd'. Returns an IPMP error code. 
2629 */ 2630 static unsigned int 2631 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2632 { 2633 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; 2634 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; 2635 ipmp_ifinfo_xfer_t ifxfer; 2636 unsigned int retval; 2637 2638 /* 2639 * We can't directly tranfer an ipmp_ifinfo_t due to the embedded 2640 * ipmp_addrlist_t pointer in if_targinfo_t. Copy the data over to 2641 * a temporary transfer structure that doesn't have that embedded 2642 * pointer. 2643 */ 2644 memset(&ifxfer, 0, sizeof (ifxfer)); 2645 2646 ifxfer.ifx_state = ifinfop->if_state; 2647 ifxfer.ifx_type = ifinfop->if_type; 2648 ifxfer.ifx_linkstate = ifinfop->if_linkstate; 2649 ifxfer.ifx_probestate = ifinfop->if_probestate; 2650 ifxfer.ifx_flags = ifinfop->if_flags; 2651 ifxfer.ifx_targinfo4.itx_testaddr = ifinfop->if_targinfo4.it_testaddr; 2652 ifxfer.ifx_targinfo4.itx_targmode = ifinfop->if_targinfo4.it_targmode; 2653 ifxfer.ifx_targinfo6.itx_testaddr = ifinfop->if_targinfo6.it_testaddr; 2654 ifxfer.ifx_targinfo6.itx_targmode = ifinfop->if_targinfo6.it_targmode; 2655 2656 memcpy(ifxfer.ifx_name, ifinfop->if_name, sizeof (ifxfer.ifx_name)); 2657 memcpy(ifxfer.ifx_group, ifinfop->if_group, sizeof (ifxfer.ifx_group)); 2658 memcpy(ifxfer.ifx_targinfo4.itx_name, ifinfop->if_targinfo4.it_name, 2659 sizeof (ifxfer.ifx_targinfo4.itx_name)); 2660 memcpy(ifxfer.ifx_targinfo6.itx_name, ifinfop->if_targinfo6.it_name, 2661 sizeof (ifxfer.ifx_targinfo6.itx_name)); 2662 2663 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (ifxfer), &ifxfer); 2664 if (retval != IPMP_SUCCESS) 2665 return (retval); 2666 2667 retval = ipmp_writetlv(fd, IPMP_ADDRLIST, 2668 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); 2669 if (retval != IPMP_SUCCESS) 2670 return (retval); 2671 2672 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2673 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); 2674 } 2675 2676 /* 2677 * Send the address information pointed to by `adinfop' on 
file descriptor 2678 * `fd'. Returns an IPMP error code. 2679 */ 2680 static unsigned int 2681 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) 2682 { 2683 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); 2684 } 2685 2686 /* 2687 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2688 * Returns an IPMP error code. 2689 */ 2690 static unsigned int 2691 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2692 { 2693 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2694 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2695 } 2696 2697 /* 2698 * Initialize an mi_result_t structure using `error' and `syserror' and 2699 * send it on file descriptor `fd'. Returns an IPMP error code. 2700 */ 2701 static unsigned int 2702 send_result(int fd, unsigned int error, int syserror) 2703 { 2704 mi_result_t me; 2705 2706 me.me_mpathd_error = error; 2707 if (error == IPMP_FAILURE) 2708 me.me_sys_error = syserror; 2709 else 2710 me.me_sys_error = 0; 2711 2712 return (ipmp_write(fd, &me, sizeof (me))); 2713 } 2714 2715 /* 2716 * Daemonize the process. 2717 */ 2718 static boolean_t 2719 daemonize(void) 2720 { 2721 switch (fork()) { 2722 case -1: 2723 return (_B_FALSE); 2724 2725 case 0: 2726 /* 2727 * Lose our controlling terminal, and become both a session 2728 * leader and a process group leader. 2729 */ 2730 if (setsid() == -1) 2731 return (_B_FALSE); 2732 2733 /* 2734 * Under POSIX, a session leader can accidentally (through 2735 * open(2)) acquire a controlling terminal if it does not 2736 * have one. Just to be safe, fork() again so we are not a 2737 * session leader. 
2738 */ 2739 switch (fork()) { 2740 case -1: 2741 return (_B_FALSE); 2742 2743 case 0: 2744 (void) chdir("/"); 2745 (void) umask(022); 2746 (void) fdwalk(closefunc, NULL); 2747 break; 2748 2749 default: 2750 _exit(EXIT_SUCCESS); 2751 } 2752 break; 2753 2754 default: 2755 _exit(EXIT_SUCCESS); 2756 } 2757 2758 return (_B_TRUE); 2759 } 2760 2761 /* 2762 * The parent has created some fds before forking on purpose, keep them open. 2763 */ 2764 static int 2765 closefunc(void *not_used, int fd) 2766 { 2767 if (fd != lsock_v4 && fd != lsock_v6) 2768 (void) close(fd); 2769 return (0); 2770 } 2771 2772 /* LOGGER */ 2773 2774 #include <syslog.h> 2775 2776 /* 2777 * Logging routines. All routines log to syslog, unless the daemon is 2778 * running in the foreground, in which case the logging goes to stderr. 2779 * 2780 * The following routines are available: 2781 * 2782 * logdebug(): A printf-like function for outputting debug messages 2783 * (messages at LOG_DEBUG) that are only of use to developers. 2784 * 2785 * logtrace(): A printf-like function for outputting tracing messages 2786 * (messages at LOG_INFO) from the daemon. This is typically used 2787 * to log the receipt of interesting network-related conditions. 2788 * 2789 * logerr(): A printf-like function for outputting error messages 2790 * (messages at LOG_ERR) from the daemon. 2791 * 2792 * logperror*(): A set of functions used to output error messages 2793 * (messages at LOG_ERR); these automatically append strerror(errno) 2794 * and a newline to the message passed to them. 2795 * 2796 * NOTE: since the logging functions write to syslog, the messages passed 2797 * to them are not eligible for localization. Thus, gettext() must 2798 * *not* be used. 2799 */ 2800 2801 static int logging = 0; 2802 2803 static void 2804 initlog(void) 2805 { 2806 logging++; 2807 openlog("in.mpathd", LOG_PID, LOG_DAEMON); 2808 } 2809 2810 /* PRINTFLIKE2 */ 2811 void 2812 logmsg(int pri, const char *fmt, ...) 
2813 { 2814 va_list ap; 2815 2816 va_start(ap, fmt); 2817 2818 if (logging) 2819 vsyslog(pri, fmt, ap); 2820 else 2821 (void) vfprintf(stderr, fmt, ap); 2822 va_end(ap); 2823 } 2824 2825 /* PRINTFLIKE1 */ 2826 void 2827 logperror(const char *str) 2828 { 2829 if (logging) 2830 syslog(LOG_ERR, "%s: %m\n", str); 2831 else 2832 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 2833 } 2834 2835 void 2836 logperror_pii(struct phyint_instance *pii, const char *str) 2837 { 2838 if (logging) { 2839 syslog(LOG_ERR, "%s (%s %s): %m\n", 2840 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 2841 } else { 2842 (void) fprintf(stderr, "%s (%s %s): %s\n", 2843 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 2844 strerror(errno)); 2845 } 2846 } 2847 2848 void 2849 logperror_li(struct logint *li, const char *str) 2850 { 2851 struct phyint_instance *pii = li->li_phyint_inst; 2852 2853 if (logging) { 2854 syslog(LOG_ERR, "%s (%s %s): %m\n", 2855 str, AF_STR(pii->pii_af), li->li_name); 2856 } else { 2857 (void) fprintf(stderr, "%s (%s %s): %s\n", 2858 str, AF_STR(pii->pii_af), li->li_name, 2859 strerror(errno)); 2860 } 2861 } 2862 2863 void 2864 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 2865 { 2866 if (polled) 2867 (void) poll_remove(pii->pii_probe_sock); 2868 (void) close(pii->pii_probe_sock); 2869 pii->pii_probe_sock = -1; 2870 pii->pii_basetime_inited = 0; 2871 } 2872 2873 boolean_t 2874 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, 2875 struct sockaddr_storage *ssp) 2876 { 2877 addrlist_t *addrp; 2878 2879 if ((addrp = malloc(sizeof (addrlist_t))) == NULL) 2880 return (_B_FALSE); 2881 2882 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); 2883 addrp->al_flags = flags; 2884 addrp->al_addr = *ssp; 2885 addrp->al_next = *addrsp; 2886 *addrsp = addrp; 2887 return (_B_TRUE); 2888 } 2889 2890 void 2891 addrlist_free(addrlist_t **addrsp) 2892 { 2893 addrlist_t *addrp, *next_addrp; 2894 2895 for (addrp = *addrsp; addrp != NULL; 
addrp = next_addrp) { 2896 next_addrp = addrp->al_next; 2897 free(addrp); 2898 } 2899 *addrsp = NULL; 2900 } 2901 2902 /* 2903 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 2904 * tables defined by mib2.h. Pass the table information returned to the 2905 * supplied function. 2906 */ 2907 static int 2908 mibwalk(void (*proc)(mib_item_t *)) 2909 { 2910 mib_item_t *head_item = NULL; 2911 mib_item_t *last_item = NULL; 2912 mib_item_t *tmp; 2913 struct strbuf ctlbuf, databuf; 2914 int flags; 2915 int rval; 2916 uintptr_t buf[512 / sizeof (uintptr_t)]; 2917 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf; 2918 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf; 2919 struct T_error_ack *tea = (struct T_error_ack *)buf; 2920 struct opthdr *req, *optp; 2921 int status = -1; 2922 2923 if (mibfd == -1) { 2924 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) { 2925 logperror("mibwalk(): ip open"); 2926 return (status); 2927 } 2928 } 2929 2930 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2931 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2932 tor->OPT_length = sizeof (struct opthdr); 2933 tor->MGMT_flags = T_CURRENT; 2934 2935 /* 2936 * Note: we use the special level value below so that IP will return 2937 * us information concerning IRE_MARK_TESTHIDDEN routes. 2938 */ 2939 req = (struct opthdr *)&tor[1]; 2940 req->level = EXPER_IP_AND_ALL_IRES; 2941 req->name = 0; 2942 req->len = 0; 2943 2944 ctlbuf.buf = (char *)&buf; 2945 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 2946 2947 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) { 2948 logperror("mibwalk(): putmsg(ctl)"); 2949 return (status); 2950 } 2951 2952 /* 2953 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 2954 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 2955 * a control and data part. The control part contains a struct 2956 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 2957 * the level, name and length of the data in the data part. 
The 2958 * data part contains the actual table data. The last message 2959 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 2960 * single option with zero optlen. 2961 */ 2962 for (;;) { 2963 errno = flags = 0; 2964 ctlbuf.maxlen = sizeof (buf); 2965 rval = getmsg(mibfd, &ctlbuf, NULL, &flags); 2966 if (rval & MORECTL || rval < 0) { 2967 if (errno == EINTR) 2968 continue; 2969 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n", 2970 rval, errno); 2971 goto error; 2972 } 2973 if (ctlbuf.len < sizeof (t_scalar_t)) { 2974 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len); 2975 goto error; 2976 } 2977 2978 switch (toa->PRIM_type) { 2979 case T_ERROR_ACK: 2980 if (ctlbuf.len < sizeof (struct T_error_ack)) { 2981 logerr("mibwalk(): T_ERROR_ACK ctlbuf " 2982 "too short: %d\n", ctlbuf.len); 2983 goto error; 2984 } 2985 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n" 2986 " UNIX_err = 0x%lx\n", tea->TLI_error, 2987 t_strerror(tea->TLI_error), tea->UNIX_error); 2988 goto error; 2989 2990 case T_OPTMGMT_ACK: 2991 optp = (struct opthdr *)&toa[1]; 2992 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + 2993 sizeof (struct opthdr))) { 2994 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too " 2995 "short: %d\n", ctlbuf.len); 2996 goto error; 2997 } 2998 if (toa->MGMT_flags != T_SUCCESS) { 2999 logerr("mibwalk(): MGMT_flags != T_SUCCESS: " 3000 "0x%lx\n", toa->MGMT_flags); 3001 goto error; 3002 } 3003 break; 3004 3005 default: 3006 goto error; 3007 } 3008 /* The following assert also implies MGMT_flags == T_SUCCESS */ 3009 assert(toa->PRIM_type == T_OPTMGMT_ACK); 3010 3011 /* 3012 * We have reached the end of this T_OPTMGMT_ACK 3013 * message. If this is the last message i.e EOD, 3014 * break, else process the next T_OPTMGMT_ACK msg. 3015 */ 3016 if (rval == 0) { 3017 if (optp->len == 0 && optp->name == 0 && 3018 optp->level == 0) { 3019 /* This is the EOD message. 
*/ 3020 break; 3021 } 3022 /* Not EOD but no data to retrieve */ 3023 continue; 3024 } 3025 3026 /* 3027 * We should only be here if MOREDATA was set. 3028 * Allocate an empty mib_item_t and link into the list 3029 * of MIB items. 3030 */ 3031 if ((tmp = malloc(sizeof (*tmp))) == NULL) { 3032 logperror("mibwalk(): malloc() failed."); 3033 goto error; 3034 } 3035 if (last_item != NULL) 3036 last_item->mi_next = tmp; 3037 else 3038 head_item = tmp; 3039 last_item = tmp; 3040 last_item->mi_next = NULL; 3041 last_item->mi_opthdr = *optp; 3042 last_item->mi_valp = malloc(optp->len); 3043 if (last_item->mi_valp == NULL) { 3044 logperror("mibwalk(): malloc() failed."); 3045 goto error; 3046 } 3047 3048 databuf.maxlen = last_item->mi_opthdr.len; 3049 databuf.buf = (char *)last_item->mi_valp; 3050 databuf.len = 0; 3051 3052 /* Retrieve the actual MIB data */ 3053 for (;;) { 3054 flags = 0; 3055 if ((rval = getmsg(mibfd, NULL, &databuf, 3056 &flags)) != 0) { 3057 if (rval < 0 && errno == EINTR) 3058 continue; 3059 /* 3060 * We shouldn't get MOREDATA here so treat that 3061 * as an error. 3062 */ 3063 logperror("mibwalk(): getmsg(data)"); 3064 goto error; 3065 } 3066 break; 3067 } 3068 } 3069 status = 0; 3070 /* Pass the accumulated MIB data to the supplied function pointer */ 3071 (*proc)(head_item); 3072 error: 3073 while (head_item != NULL) { 3074 tmp = head_item; 3075 head_item = tmp->mi_next; 3076 free(tmp->mi_valp); 3077 free(tmp); 3078 } 3079 return (status); 3080 } 3081 3082 /* 3083 * Parse the supplied mib2 information to get the size of routing table 3084 * entries. This is needed when running in a branded zone where the 3085 * Solaris application environment and the Solaris kernel may not be the 3086 * the same release version. 
3087 */ 3088 static void 3089 mib_get_constants(mib_item_t *item) 3090 { 3091 mib2_ip_t *ipv4; 3092 mib2_ipv6IfStatsEntry_t *ipv6; 3093 3094 for (; item != NULL; item = item->mi_next) { 3095 if (item->mi_opthdr.name != 0) 3096 continue; 3097 if (item->mi_opthdr.level == MIB2_IP) { 3098 ipv4 = (mib2_ip_t *)item->mi_valp; 3099 ipRouteEntrySize = ipv4->ipRouteEntrySize; 3100 } else if (item->mi_opthdr.level == MIB2_IP6) { 3101 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp; 3102 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize; 3103 } 3104 } 3105 } 3106