1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include "mpd_defs.h" 27 #include "mpd_tables.h" 28 29 int debug = 0; /* Debug flag */ 30 static int pollfd_num = 0; /* Num. of poll descriptors */ 31 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 32 /* All times below in ms */ 33 int user_failure_detection_time; /* user specified failure detection */ 34 /* time (fdt) */ 35 int user_probe_interval; /* derived from user specified fdt */ 36 37 /* 38 * Structure to store mib2 information returned by the kernel. 39 * This is used to process routing table information. 40 */ 41 typedef struct mib_item_s { 42 struct mib_item_s *mi_next; 43 struct opthdr mi_opthdr; 44 void *mi_valp; 45 } mib_item_t; 46 47 static int rtsock_v4; /* AF_INET routing socket */ 48 static int rtsock_v6; /* AF_INET6 routing socket */ 49 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 50 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 51 static int lsock_v4; /* Listen socket to detect mpathd */ 52 static int lsock_v6; /* Listen socket to detect mpathd */ 53 static int mibfd = -1; /* fd to get mib info */ 54 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 55 56 static uint_t last_initifs_time; /* Time when initifs was last run */ 57 static char **argv0; /* Saved for re-exec on SIGHUP */ 58 boolean_t handle_link_notifications = _B_TRUE; 59 static int ipRouteEntrySize; /* Size of IPv4 route entry */ 60 static int ipv6RouteEntrySize; /* Size of IPv6 route entry */ 61 62 static void initlog(void); 63 static void run_timeouts(void); 64 static void initifs(void); 65 static void check_if_removed(struct phyint_instance *pii); 66 static void select_test_ifs(void); 67 static void update_router_list(mib_item_t *item); 68 static void mib_get_constants(mib_item_t *item); 69 static int mibwalk(void (*proc)(mib_item_t *)); 70 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 71 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 72 static void router_add_common(int af, char *ifname, 73 struct in6_addr nexthop); 74 static void init_router_targets(); 75 static void cleanup(void); 76 static int setup_listener(int af); 77 static void check_config(void); 78 static void check_testconfig(void); 79 static void check_addr_unique(struct phyint_instance *, 80 struct sockaddr_storage *); 81 static void init_host_targets(void); 82 static void dup_host_targets(struct phyint_instance *desired_pii); 83 static void loopback_cmd(int sock, int family); 84 static boolean_t daemonize(void); 85 static int closefunc(void *, int); 86 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 87 static unsigned int process_query(int fd, mi_query_t *miq); 88 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); 89 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 90 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 91 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 92 static unsigned int send_result(int fd, unsigned int error, int syserror); 93 94 addrlist_t *localaddrs; 95 96 /* 97 * Return the current time in milliseconds (from an arbitrary reference) 98 * truncated to fit into an int. Truncation is ok since we are interested 99 * only in differences and not the absolute values. 100 */ 101 uint_t 102 getcurrenttime(void) 103 { 104 uint_t cur_time; /* In ms */ 105 106 /* 107 * Use of a non-user-adjustable source of time is 108 * required. However millisecond precision is sufficient. 109 * divide by 10^6 110 */ 111 cur_time = (uint_t)(gethrtime() / 1000000LL); 112 return (cur_time); 113 } 114 115 uint64_t 116 getcurrentsec(void) 117 { 118 return (gethrtime() / NANOSEC); 119 } 120 121 /* 122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 123 */ 124 int 125 poll_add(int fd) 126 { 127 int i; 128 int new_num; 129 struct pollfd *newfds; 130 retry: 131 /* Check if already present */ 132 for (i = 0; i < pollfd_num; i++) { 133 if (pollfds[i].fd == fd) 134 return (0); 135 } 136 /* Check for empty spot already present */ 137 for (i = 0; i < pollfd_num; i++) { 138 if (pollfds[i].fd == -1) { 139 pollfds[i].fd = fd; 140 return (0); 141 } 142 } 143 144 /* Allocate space for 32 more fds and initialize to -1 */ 145 new_num = pollfd_num + 32; 146 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 147 if (newfds == NULL) { 148 logperror("poll_add: realloc"); 149 return (-1); 150 } 151 for (i = pollfd_num; i < new_num; i++) { 152 newfds[i].fd = -1; 153 newfds[i].events = POLLIN; 154 } 155 pollfd_num = new_num; 156 pollfds = newfds; 157 goto retry; 158 } 159 160 /* 161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 162 */ 163 int 164 poll_remove(int fd) 165 { 166 int i; 167 168 /* Check if already present */ 169 for (i = 0; i < pollfd_num; i++) { 170 if (pollfds[i].fd == fd) { 171 pollfds[i].fd = -1; 172 return (0); 173 } 174 } 175 return (-1); 176 } 177 178 /* 179 * Extract information about the phyint instance. If the phyint instance still 180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 181 * will use it to detect phyint instances that don't exist any longer and 182 * remove them, from our database of phyint instances. 183 * Return value: 184 * returns true if the phyint instance exists in the kernel, 185 * returns false otherwise 186 */ 187 static boolean_t 188 pii_process(int af, char *name, struct phyint_instance **pii_p) 189 { 190 int err; 191 struct phyint_instance *pii; 192 struct phyint_instance *pii_other; 193 194 if (debug & D_PHYINT) 195 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 196 197 pii = phyint_inst_lookup(af, name); 198 if (pii == NULL) { 199 /* 200 * Phyint instance does not exist in our tables, 201 * create new phyint instance 202 */ 203 pii = phyint_inst_init_from_k(af, name); 204 } else { 205 /* Phyint exists in our tables */ 206 err = phyint_inst_update_from_k(pii); 207 208 switch (err) { 209 case PI_IOCTL_ERROR: 210 /* Some ioctl error. don't change anything */ 211 pii->pii_in_use = 1; 212 break; 213 214 case PI_GROUP_CHANGED: 215 case PI_IFINDEX_CHANGED: 216 /* 217 * Interface index or group membership has changed. 218 * Delete the old state and recreate based on the new 219 * state (it may no longer be in a group). 220 */ 221 pii_other = phyint_inst_other(pii); 222 if (pii_other != NULL) 223 phyint_inst_delete(pii_other); 224 phyint_inst_delete(pii); 225 pii = phyint_inst_init_from_k(af, name); 226 break; 227 228 case PI_DELETED: 229 /* Phyint instance has disappeared from kernel */ 230 pii->pii_in_use = 0; 231 break; 232 233 case PI_OK: 234 /* Phyint instance exists and is fine */ 235 pii->pii_in_use = 1; 236 break; 237 238 default: 239 /* Unknown status */ 240 logerr("pii_process: Unknown status %d\n", err); 241 break; 242 } 243 } 244 245 *pii_p = pii; 246 if (pii != NULL) 247 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 248 else 249 return (_B_FALSE); 250 } 251 252 /* 253 * Scan all interfaces to detect changes as well as new and deleted interfaces 254 */ 255 static void 256 initifs() 257 { 258 int i, nlifr; 259 int af; 260 char *cp; 261 char *buf; 262 int sockfd; 263 uint64_t flags; 264 struct lifnum lifn; 265 struct lifconf lifc; 266 struct lifreq lifreq; 267 struct lifreq *lifr; 268 struct logint *li; 269 struct phyint_instance *pii; 270 struct phyint_instance *next_pii; 271 struct phyint_group *pg, *next_pg; 272 char pi_name[LIFNAMSIZ + 1]; 273 274 if (debug & D_PHYINT) 275 logdebug("initifs: Scanning interfaces\n"); 276 277 last_initifs_time = getcurrenttime(); 278 279 /* 280 * Free the existing local address list; we'll build a new list below. 281 */ 282 addrlist_free(&localaddrs); 283 284 /* 285 * Mark the interfaces so that we can find phyints and logints 286 * which have disappeared from the kernel. pii_process() and 287 * logint_init_from_k() will set {pii,li}_in_use when they find 288 * the interface in the kernel. Also, clear dupaddr bit on probe 289 * logint. check_addr_unique() will set the dupaddr bit on the 290 * probe logint, if the testaddress is not unique. 291 */ 292 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 293 pii->pii_in_use = 0; 294 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 295 li->li_in_use = 0; 296 if (pii->pii_probe_logint == li) 297 li->li_dupaddr = 0; 298 } 299 } 300 301 /* 302 * As above, mark groups so that we can detect IPMP interfaces which 303 * have been removed from the kernel. Also, delete the group address 304 * list since we'll iteratively recreate it below. 305 */ 306 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 307 pg->pg_in_use = _B_FALSE; 308 addrlist_free(&pg->pg_addrs); 309 } 310 311 lifn.lifn_family = AF_UNSPEC; 312 lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 313 again: 314 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 315 logperror("initifs: ioctl (get interface count)"); 316 return; 317 } 318 /* 319 * Pad the interface count to detect when additional interfaces have 320 * been configured between SIOCGLIFNUM and SIOCGLIFCONF. 321 */ 322 lifn.lifn_count += 4; 323 324 if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { 325 logperror("initifs: calloc"); 326 return; 327 } 328 329 lifc.lifc_family = AF_UNSPEC; 330 lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 331 lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); 332 lifc.lifc_buf = buf; 333 334 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 335 logperror("initifs: ioctl (get interface configuration)"); 336 free(buf); 337 return; 338 } 339 340 /* 341 * If every lifr_req slot is taken, then additional interfaces must 342 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. 343 * Recalculate to make sure we didn't miss any interfaces. 344 */ 345 nlifr = lifc.lifc_len / sizeof (struct lifreq); 346 if (nlifr >= lifn.lifn_count) { 347 free(buf); 348 goto again; 349 } 350 351 /* 352 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the 353 * global list of addresses, phyint groups, phyints, and logints. 354 */ 355 for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { 356 af = lifr->lifr_addr.ss_family; 357 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 358 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); 359 360 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 361 if (errno != ENXIO) 362 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 363 continue; 364 } 365 flags = lifreq.lifr_flags; 366 367 /* 368 * If the address is IFF_UP, add it to the local address list. 369 * (We ignore addresses that aren't IFF_UP since another node 370 * might legitimately have that address IFF_UP.) 371 */ 372 if (flags & IFF_UP) { 373 (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, 374 &lifr->lifr_addr); 375 } 376 377 /* 378 * If this address is on an IPMP meta-interface, update our 379 * phyint_group information (either by recording that group 380 * still exists or creating a new group), and track what 381 * group the address is part of. 382 */ 383 if (flags & IFF_IPMP) { 384 if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { 385 if (errno != ENXIO) 386 logperror("initifs: ioctl " 387 "(SIOCGLIFGROUPNAME)"); 388 continue; 389 } 390 391 pg = phyint_group_lookup(lifreq.lifr_groupname); 392 if (pg == NULL) { 393 pg = phyint_group_create(lifreq.lifr_groupname); 394 if (pg == NULL) { 395 logerr("initifs: cannot create group " 396 "%s\n", lifreq.lifr_groupname); 397 continue; 398 } 399 phyint_group_insert(pg); 400 } 401 pg->pg_in_use = _B_TRUE; 402 403 /* 404 * Add this to the group's list of data addresses. 405 */ 406 if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, 407 &lifr->lifr_addr)) { 408 logerr("initifs: insufficient memory to track " 409 "data address information for %s\n", 410 lifr->lifr_name); 411 } 412 continue; 413 } 414 415 /* 416 * This isn't an address on an IPMP meta-interface, so it's 417 * either on an underlying interface or not related to any 418 * group. Update our phyint and logint information (via 419 * pii_process() and logint_init_from_k()) -- but first, 420 * convert the logint name to a phyint name so we can call 421 * pii_process(). 422 */ 423 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 424 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 425 *cp = '\0'; 426 427 if (pii_process(af, pi_name, &pii)) { 428 /* The phyint is fine. So process the logint */ 429 logint_init_from_k(pii, lifr->lifr_name); 430 check_addr_unique(pii, &lifr->lifr_addr); 431 } 432 } 433 free(buf); 434 435 /* 436 * Scan for groups, phyints and logints that have disappeared from the 437 * kernel, and delete them. 438 */ 439 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 440 next_pii = pii->pii_next; 441 check_if_removed(pii); 442 } 443 444 for (pg = phyint_groups; pg != NULL; pg = next_pg) { 445 next_pg = pg->pg_next; 446 if (!pg->pg_in_use) { 447 phyint_group_delete(pg); 448 continue; 449 } 450 /* 451 * Refresh the group's state. This is necessary since the 452 * group's state is defined by the set of usable interfaces in 453 * the group, and an interface is considered unusable if all 454 * of its addresses are down. When an address goes down/up, 455 * the RTM_DELADDR/RTM_NEWADDR brings us through here. 456 */ 457 phyint_group_refresh_state(pg); 458 } 459 460 /* 461 * Select a test address for sending probes on each phyint instance 462 */ 463 select_test_ifs(); 464 465 /* 466 * Handle link up/down notifications. 467 */ 468 process_link_state_changes(); 469 } 470 471 /* 472 * Check that a given test address is unique across all of the interfaces in a 473 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 475 * Any issues will be reported by check_testconfig(). 476 */ 477 static void 478 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 479 { 480 struct phyint *pi; 481 struct phyint_group *pg; 482 struct in6_addr addr; 483 struct phyint_instance *pii; 484 struct sockaddr_in *sin; 485 486 if (ss->ss_family == AF_INET) { 487 sin = (struct sockaddr_in *)ss; 488 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 489 } else { 490 assert(ss->ss_family == AF_INET6); 491 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 492 } 493 494 /* 495 * For anonymous groups, every interface is assumed to be on its own 496 * link, so there is no chance of overlapping addresses. 497 */ 498 pg = ourpii->pii_phyint->pi_group; 499 if (pg == phyint_anongroup) 500 return; 501 502 /* 503 * Walk the list of phyint instances in the group and check for test 504 * addresses matching ours. Of course, we skip ourself. 505 */ 506 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 507 pii = PHYINT_INSTANCE(pi, ss->ss_family); 508 if (pii == NULL || pii == ourpii || 509 pii->pii_probe_logint == NULL) 510 continue; 511 512 /* 513 * If this test address is not unique, set the dupaddr bit. 514 */ 515 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 516 pii->pii_probe_logint->li_dupaddr = 1; 517 } 518 } 519 520 /* 521 * Stop probing an interface. Called when an interface is offlined. 522 * The probe socket is closed on each interface instance, and the 523 * interface state set to PI_OFFLINE. 524 */ 525 void 526 stop_probing(struct phyint *pi) 527 { 528 struct phyint_instance *pii; 529 530 pii = pi->pi_v4; 531 if (pii != NULL) { 532 if (pii->pii_probe_sock != -1) 533 close_probe_socket(pii, _B_TRUE); 534 pii->pii_probe_logint = NULL; 535 } 536 537 pii = pi->pi_v6; 538 if (pii != NULL) { 539 if (pii->pii_probe_sock != -1) 540 close_probe_socket(pii, _B_TRUE); 541 pii->pii_probe_logint = NULL; 542 } 543 544 phyint_chstate(pi, PI_OFFLINE); 545 } 546 547 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 548 549 /* 550 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 551 * IFF_UP must also be set so that the associated address can be used as a 552 * source address. Further, we must be able to exchange packets with local 553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 555 */ 556 static int 557 rate_testflags(uint64_t flags) 558 { 559 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 560 return (BAD_TESTFLAGS); 561 562 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 563 return (BAD_TESTFLAGS); 564 565 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 566 return (BEST_TESTFLAGS); 567 568 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 569 return (BEST_TESTFLAGS); 570 571 return (OK_TESTFLAGS); 572 } 573 574 /* 575 * Attempt to select a test address for each phyint instance. 576 * Call phyint_inst_sockinit() to complete the initializations. 577 */ 578 static void 579 select_test_ifs(void) 580 { 581 struct phyint *pi; 582 struct phyint_instance *pii; 583 struct phyint_instance *next_pii; 584 struct logint *li; 585 struct logint *probe_logint; 586 boolean_t target_scan_reqd = _B_FALSE; 587 int rating; 588 589 if (debug & D_PHYINT) 590 logdebug("select_test_ifs\n"); 591 592 /* 593 * For each phyint instance, do the test address selection 594 */ 595 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 596 next_pii = pii->pii_next; 597 probe_logint = NULL; 598 599 /* 600 * An interface that is offline should not be probed. 601 * IFF_OFFLINE interfaces should always be PI_OFFLINE 602 * unless some other entity has set the offline flag. 603 */ 604 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 605 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 606 logerr("shouldn't be probing offline" 607 " interface %s (state is: %u)." 608 " Stopping probes.\n", 609 pii->pii_phyint->pi_name, 610 pii->pii_phyint->pi_state); 611 stop_probing(pii->pii_phyint); 612 } 613 continue; 614 } else { 615 /* 616 * If something cleared IFF_OFFLINE (e.g., by accident 617 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is 618 * inherently racy), the phyint may still be offline. 619 * Just ignore it. 620 */ 621 if (pii->pii_phyint->pi_state == PI_OFFLINE) 622 continue; 623 } 624 625 li = pii->pii_probe_logint; 626 if (li != NULL) { 627 /* 628 * We've already got a test address; only proceed 629 * if it's suboptimal. 630 */ 631 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 632 continue; 633 } 634 635 /* 636 * Walk the logints of this phyint instance, and select 637 * the best available test address 638 */ 639 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 640 /* 641 * Skip 0.0.0.0 addresses, as those are never 642 * actually usable. 643 */ 644 if (pii->pii_af == AF_INET && 645 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 646 continue; 647 648 /* 649 * Skip any IPv6 logints that are not link-local, 650 * since we should always have a link-local address 651 * anyway and in6_data() expects link-local replies. 652 */ 653 if (pii->pii_af == AF_INET6 && 654 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 655 continue; 656 657 /* 658 * Rate the testflags. If we've found an optimal 659 * match, then break out; otherwise, record the most 660 * recent OK one. 661 */ 662 rating = rate_testflags(li->li_flags); 663 if (rating == BAD_TESTFLAGS) 664 continue; 665 666 probe_logint = li; 667 if (rating == BEST_TESTFLAGS) 668 break; 669 } 670 671 /* 672 * If the probe logint has changed, ditch the old one. 673 */ 674 if (pii->pii_probe_logint != NULL && 675 pii->pii_probe_logint != probe_logint) { 676 if (pii->pii_probe_sock != -1) 677 close_probe_socket(pii, _B_TRUE); 678 pii->pii_probe_logint = NULL; 679 } 680 681 if (probe_logint == NULL) { 682 /* 683 * We don't have a test address; zero out the probe 684 * stats array since it is no longer relevant. 685 * Optimize by checking if it is already zeroed out. 686 */ 687 int pr_ndx; 688 689 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 690 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 691 clear_pii_probe_stats(pii); 692 reset_crtt_all(pii->pii_phyint); 693 } 694 continue; 695 } else if (probe_logint == pii->pii_probe_logint) { 696 /* 697 * If we didn't find any new test addr, go to the 698 * next phyint. 699 */ 700 continue; 701 } 702 703 /* 704 * The phyint is either being assigned a new testaddr 705 * or is being assigned a testaddr for the 1st time. 706 * Need to initialize the phyint socket 707 */ 708 pii->pii_probe_logint = probe_logint; 709 if (!phyint_inst_sockinit(pii)) { 710 if (debug & D_PHYINT) { 711 logdebug("select_test_ifs: " 712 "phyint_sockinit failed\n"); 713 } 714 phyint_inst_delete(pii); 715 continue; 716 } 717 718 /* 719 * This phyint instance is now enabled for probes; this 720 * impacts our state machine in two ways: 721 * 722 * 1. If we're probe *capable* as well (i.e., we have 723 * probe targets) and the interface is in PI_NOTARGETS, 724 * then transition to PI_RUNNING. 725 * 726 * 2. If we're not probe capable, and the other phyint 727 * instance is also not probe capable, and we were in 728 * PI_RUNNING, then transition to PI_NOTARGETS. 729 * 730 * Also see the state diagram in mpd_probe.c. 731 */ 732 if (PROBE_CAPABLE(pii)) { 733 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 734 phyint_chstate(pii->pii_phyint, PI_RUNNING); 735 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 736 if (pii->pii_phyint->pi_state == PI_RUNNING) 737 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 738 } 739 740 /* 741 * If no targets are currently known for this phyint 742 * we need to call init_router_targets. Since 743 * init_router_targets() initializes the list of targets 744 * for all phyints it is done below the loop. 745 */ 746 if (pii->pii_targets == NULL) 747 target_scan_reqd = _B_TRUE; 748 749 /* 750 * Start the probe timer for this instance. 751 */ 752 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 753 start_timer(pii); 754 pii->pii_basetime_inited = 1; 755 } 756 } 757 758 /* 759 * Scan the interface list for any interfaces that are PI_FAILED or 760 * PI_NOTARGETS but no longer enabled to send probes, and call 761 * phyint_check_for_repair() to see if the link state indicates that 762 * the interface should be repaired. Also see the state diagram in 763 * mpd_probe.c. 764 */ 765 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 766 if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && 767 (pi->pi_state == PI_FAILED || 768 pi->pi_state == PI_NOTARGETS)) { 769 phyint_check_for_repair(pi); 770 } 771 } 772 773 check_testconfig(); 774 775 /* 776 * Try to populate the target list. init_router_targets populates 777 * the target list from the routing table. If our target list is 778 * still empty, init_host_targets adds host targets based on the 779 * host target list of other phyints in the group. 780 */ 781 if (target_scan_reqd) { 782 init_router_targets(); 783 init_host_targets(); 784 } 785 } 786 787 /* 788 * Check test address configuration, and log notices/errors if appropriate. 789 * Note that this function only logs pre-existing conditions (e.g., that 790 * probe-based failure detection is disabled). 791 */ 792 static void 793 check_testconfig(void) 794 { 795 struct phyint *pi; 796 struct logint *li; 797 char abuf[INET6_ADDRSTRLEN]; 798 int pri; 799 800 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 801 if (pi->pi_flags & IFF_OFFLINE) 802 continue; 803 804 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) { 805 if (pi->pi_taddrmsg_printed || 806 pi->pi_duptaddrmsg_printed) { 807 if (pi->pi_duptaddrmsg_printed) 808 pri = LOG_ERR; 809 else 810 pri = LOG_INFO; 811 logmsg(pri, "Test address now configured on " 812 "interface %s; enabling probe-based " 813 "failure detection on it\n", pi->pi_name); 814 pi->pi_taddrmsg_printed = 0; 815 pi->pi_duptaddrmsg_printed = 0; 816 } 817 continue; 818 } 819 820 li = NULL; 821 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL && 822 pi->pi_v4->pii_probe_logint->li_dupaddr) 823 li = pi->pi_v4->pii_probe_logint; 824 825 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL && 826 pi->pi_v6->pii_probe_logint->li_dupaddr) 827 li = pi->pi_v6->pii_probe_logint; 828 829 if (li != NULL && li->li_dupaddr) { 830 if (pi->pi_duptaddrmsg_printed) 831 continue; 832 logerr("Test address %s is not unique in group; " 833 "disabling probe-based failure detection on %s\n", 834 pr_addr(li->li_phyint_inst->pii_af, 835 li->li_addr, abuf, sizeof (abuf)), pi->pi_name); 836 pi->pi_duptaddrmsg_printed = 1; 837 continue; 838 } 839 840 if (getcurrentsec() < pi->pi_taddrthresh) 841 continue; 842 843 if (!pi->pi_taddrmsg_printed) { 844 logtrace("No test address configured on interface %s; " 845 "disabling probe-based failure detection on it\n", 846 pi->pi_name); 847 pi->pi_taddrmsg_printed = 1; 848 } 849 } 850 } 851 852 /* 853 * Check phyint group configuration, to detect any inconsistencies, 854 * and log an error message. This is called from runtimeouts every 855 * 20 secs. But the error message is displayed once. If the 856 * consistency is resolved by the admin, a recovery message is displayed 857 * once. 858 */ 859 static void 860 check_config(void) 861 { 862 struct phyint_group *pg; 863 struct phyint *pi; 864 boolean_t v4_in_group; 865 boolean_t v6_in_group; 866 867 /* 868 * All phyints of a group must be homogeneous to ensure that they can 869 * take over for one another. If any phyint in a group has IPv4 870 * plumbed, check that all phyints have IPv4 plumbed. Do a similar 871 * check for IPv6. 872 */ 873 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 874 if (pg == phyint_anongroup) 875 continue; 876 877 v4_in_group = _B_FALSE; 878 v6_in_group = _B_FALSE; 879 /* 880 * 1st pass. Determine if at least 1 phyint in the group 881 * has IPv4 plumbed and if so set v4_in_group to true. 882 * Repeat similarly for IPv6. 883 */ 884 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 885 if (pi->pi_v4 != NULL) 886 v4_in_group = _B_TRUE; 887 if (pi->pi_v6 != NULL) 888 v6_in_group = _B_TRUE; 889 } 890 891 /* 892 * 2nd pass. If v4_in_group is true, check that phyint 893 * has IPv4 plumbed. Repeat similarly for IPv6. Print 894 * out a message the 1st time only. 895 */ 896 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 897 if (pi->pi_flags & IFF_OFFLINE) 898 continue; 899 900 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 901 if (!pi->pi_cfgmsg_printed) { 902 logerr("IP interface %s in group %s is" 903 " not plumbed for IPv4, affecting" 904 " IPv4 connectivity\n", 905 pi->pi_name, 906 pi->pi_group->pg_name); 907 pi->pi_cfgmsg_printed = 1; 908 } 909 } else if (v6_in_group == _B_TRUE && 910 pi->pi_v6 == NULL) { 911 if (!pi->pi_cfgmsg_printed) { 912 logerr("IP interface %s in group %s is" 913 " not plumbed for IPv6, affecting" 914 " IPv6 connectivity\n", 915 pi->pi_name, 916 pi->pi_group->pg_name); 917 pi->pi_cfgmsg_printed = 1; 918 } 919 } else { 920 /* 921 * The phyint matches the group configuration, 922 * if we have reached this point. If it was 923 * improperly configured earlier, log an 924 * error recovery message 925 */ 926 if (pi->pi_cfgmsg_printed) { 927 logerr("IP interface %s is now" 928 " consistent with group %s " 929 " and connectivity is restored\n", 930 pi->pi_name, pi->pi_group->pg_name); 931 pi->pi_cfgmsg_printed = 0; 932 } 933 } 934 935 } 936 } 937 } 938 939 /* 940 * Timer mechanism using relative time (in milliseconds) from the 941 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 942 * will fire after TIMER_INFINITY milliseconds. 943 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 944 * time values. Hence 2 consecutive timer events cannot be spaced farther 945 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 946 * that can be passed for the delay parameter of timer_schedule() 947 */ 948 static uint_t timer_next; /* Currently scheduled timeout */ 949 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 950 951 static void 952 timer_init(void) 953 { 954 timer_next = getcurrenttime() + TIMER_INFINITY; 955 /* 956 * The call to run_timeouts() will get the timer started 957 * Since there are no phyints at this point, the timer will 958 * be set for IF_SCAN_INTERVAL ms. 959 */ 960 run_timeouts(); 961 } 962 963 /* 964 * Make sure the next SIGALRM occurs delay milliseconds from the current 965 * time if not earlier. We are interested only in time differences. 966 */ 967 void 968 timer_schedule(uint_t delay) 969 { 970 uint_t now; 971 struct itimerval itimerval; 972 973 if (debug & D_TIMER) 974 logdebug("timer_schedule(%u)\n", delay); 975 976 assert(delay <= TIMER_INFINITY); 977 978 now = getcurrenttime(); 979 if (delay == 0) { 980 /* Minimum allowed delay */ 981 delay = 1; 982 } 983 /* Will this timer occur before the currently scheduled SIGALRM? */ 984 if (timer_active && TIME_GE(now + delay, timer_next)) { 985 if (debug & D_TIMER) { 986 logdebug("timer_schedule(%u) - no action: " 987 "now %u next %u\n", delay, now, timer_next); 988 } 989 return; 990 } 991 timer_next = now + delay; 992 993 itimerval.it_value.tv_sec = delay / 1000; 994 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 995 itimerval.it_interval.tv_sec = 0; 996 itimerval.it_interval.tv_usec = 0; 997 if (debug & D_TIMER) { 998 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 999 delay, itimerval.it_value.tv_sec, 1000 itimerval.it_value.tv_usec); 1001 } 1002 timer_active = _B_TRUE; 1003 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1004 logperror("timer_schedule: setitimer"); 1005 exit(2); 1006 } 1007 } 1008 1009 static void 1010 timer_cancel(void) 1011 { 1012 struct itimerval itimerval; 1013 1014 if (debug & D_TIMER) 1015 logdebug("timer_cancel()\n"); 1016 1017 bzero(&itimerval, sizeof (itimerval)); 1018 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) 1019 logperror("timer_cancel: setitimer"); 1020 } 1021 1022 /* 1023 * Timer has fired. Determine when the next timer event will occur by asking 1024 * all the timer routines. Should not be called from a timer routine. 1025 */ 1026 static void 1027 run_timeouts(void) 1028 { 1029 uint_t next; 1030 uint_t next_event_time; 1031 struct phyint_instance *pii; 1032 struct phyint_instance *next_pii; 1033 static boolean_t timeout_running; 1034 1035 /* assert that recursive timeouts don't happen. */ 1036 assert(!timeout_running); 1037 1038 timeout_running = _B_TRUE; 1039 1040 if (debug & D_TIMER) 1041 logdebug("run_timeouts()\n"); 1042 1043 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1044 initifs(); 1045 check_config(); 1046 } 1047 1048 next = TIMER_INFINITY; 1049 1050 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1051 next_pii = pii->pii_next; 1052 next_event_time = phyint_inst_timer(pii); 1053 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1054 next = next_event_time; 1055 1056 if (debug & D_TIMER) { 1057 logdebug("run_timeouts(%s %s): next scheduled for" 1058 " this phyint inst %u, next scheduled global" 1059 " %u ms\n", 1060 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1061 next_event_time, next); 1062 } 1063 } 1064 1065 /* 1066 * Make sure initifs() is called at least once every 1067 * IF_SCAN_INTERVAL, to make sure that we are in sync 1068 * with the kernel, in case we have missed any routing 1069 * socket messages. 1070 */ 1071 if (next > IF_SCAN_INTERVAL) 1072 next = IF_SCAN_INTERVAL; 1073 1074 if (debug & D_TIMER) 1075 logdebug("run_timeouts: %u ms\n", next); 1076 1077 timer_schedule(next); 1078 timeout_running = _B_FALSE; 1079 } 1080 1081 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1082 static int eventpipe_write = -1; 1083 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ 1084 1085 /* 1086 * Ensure that signals are processed synchronously with the rest of 1087 * the code by just writing a one character signal number on the pipe. 1088 * The poll loop will pick this up and process the signal event. 1089 */ 1090 static void 1091 sig_handler(int signo) 1092 { 1093 uchar_t buf = (uchar_t)signo; 1094 1095 /* 1096 * Don't write to pipe if cleanup has already begun. cleanup() 1097 * might have closed the pipe already 1098 */ 1099 if (cleanup_started) 1100 return; 1101 1102 if (eventpipe_write == -1) { 1103 logerr("sig_handler: no pipe found\n"); 1104 return; 1105 } 1106 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1107 logperror("sig_handler: write"); 1108 } 1109 1110 extern struct probes_missed probes_missed; 1111 1112 /* 1113 * Pick up a signal "byte" from the pipe and process it. 1114 */ 1115 static void 1116 in_signal(int fd) 1117 { 1118 uchar_t buf; 1119 uint64_t sent, acked, lost, unacked, unknown; 1120 struct phyint_instance *pii; 1121 int pr_ndx; 1122 1123 switch (read(fd, &buf, sizeof (buf))) { 1124 case -1: 1125 logperror("in_signal: read"); 1126 exit(1); 1127 /* NOTREACHED */ 1128 case 1: 1129 break; 1130 case 0: 1131 logerr("in_signal: read end of file\n"); 1132 exit(1); 1133 /* NOTREACHED */ 1134 default: 1135 logerr("in_signal: read > 1\n"); 1136 exit(1); 1137 } 1138 1139 if (debug & D_TIMER) 1140 logdebug("in_signal() got %d\n", buf); 1141 1142 switch (buf) { 1143 case SIGALRM: 1144 if (debug & D_TIMER) { 1145 uint_t now = getcurrenttime(); 1146 1147 logdebug("in_signal(SIGALRM) delta %u\n", 1148 now - timer_next); 1149 } 1150 timer_active = _B_FALSE; 1151 run_timeouts(); 1152 break; 1153 case SIGUSR1: 1154 logdebug("Printing configuration:\n"); 1155 /* Print out the internal tables */ 1156 phyint_inst_print_all(); 1157 1158 /* 1159 * Print out the accumulated statistics about missed 1160 * probes (happens due to scheduling delay). 1161 */ 1162 logerr("Missed sending total of %d probes spread over" 1163 " %d occurrences\n", probes_missed.pm_nprobes, 1164 probes_missed.pm_ntimes); 1165 1166 /* 1167 * Print out the accumulated statistics about probes 1168 * that were sent. 1169 */ 1170 for (pii = phyint_instances; pii != NULL; 1171 pii = pii->pii_next) { 1172 unacked = 0; 1173 acked = pii->pii_cum_stats.acked; 1174 lost = pii->pii_cum_stats.lost; 1175 sent = pii->pii_cum_stats.sent; 1176 unknown = pii->pii_cum_stats.unknown; 1177 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1178 switch (pii->pii_probes[pr_ndx].pr_status) { 1179 case PR_ACKED: 1180 acked++; 1181 break; 1182 case PR_LOST: 1183 lost++; 1184 break; 1185 case PR_UNACKED: 1186 unacked++; 1187 break; 1188 } 1189 } 1190 logerr("\nProbe stats on (%s %s)\n" 1191 "Number of probes sent %lld\n" 1192 "Number of probe acks received %lld\n" 1193 "Number of probes/acks lost %lld\n" 1194 "Number of valid unacknowledged probes %lld\n" 1195 "Number of ambiguous probe acks received %lld\n", 1196 AF_STR(pii->pii_af), pii->pii_name, 1197 sent, acked, lost, unacked, unknown); 1198 } 1199 break; 1200 case SIGHUP: 1201 logerr("SIGHUP: restart and reread config file\n"); 1202 /* 1203 * Cancel the interval timer. Needed since setitimer() uses 1204 * alarm() and the time left is inherited across exec(), and 1205 * thus the SIGALRM may be delivered before a handler has been 1206 * setup, causing in.mpathd to erroneously exit. 1207 */ 1208 timer_cancel(); 1209 cleanup(); 1210 (void) execv(argv0[0], argv0); 1211 _exit(0177); 1212 /* NOTREACHED */ 1213 case SIGINT: 1214 case SIGTERM: 1215 case SIGQUIT: 1216 cleanup(); 1217 exit(0); 1218 /* NOTREACHED */ 1219 default: 1220 logerr("in_signal: unknown signal: %d\n", buf); 1221 } 1222 } 1223 1224 static void 1225 cleanup(void) 1226 { 1227 struct phyint_instance *pii; 1228 struct phyint_instance *next_pii; 1229 1230 /* 1231 * Make sure that we don't write to eventpipe in 1232 * sig_handler() if any signal notably SIGALRM, 1233 * occurs after we close the eventpipe descriptor below 1234 */ 1235 cleanup_started = _B_TRUE; 1236 1237 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1238 next_pii = pii->pii_next; 1239 phyint_inst_delete(pii); 1240 } 1241 1242 (void) close(ifsock_v4); 1243 (void) close(ifsock_v6); 1244 (void) close(rtsock_v4); 1245 (void) close(rtsock_v6); 1246 (void) close(lsock_v4); 1247 (void) close(lsock_v6); 1248 (void) close(0); 1249 (void) close(1); 1250 (void) close(2); 1251 (void) close(mibfd); 1252 (void) close(eventpipe_read); 1253 (void) close(eventpipe_write); 1254 } 1255 1256 /* 1257 * Create pipe for signal delivery and set up signal handlers. 1258 */ 1259 static void 1260 setup_eventpipe(void) 1261 { 1262 int fds[2]; 1263 struct sigaction act; 1264 1265 if ((pipe(fds)) < 0) { 1266 logperror("setup_eventpipe: pipe"); 1267 exit(1); 1268 } 1269 eventpipe_read = fds[0]; 1270 eventpipe_write = fds[1]; 1271 if (poll_add(eventpipe_read) == -1) { 1272 exit(1); 1273 } 1274 1275 act.sa_handler = sig_handler; 1276 act.sa_flags = SA_RESTART; 1277 (void) sigaction(SIGALRM, &act, NULL); 1278 1279 (void) sigset(SIGHUP, sig_handler); 1280 (void) sigset(SIGUSR1, sig_handler); 1281 (void) sigset(SIGTERM, sig_handler); 1282 (void) sigset(SIGINT, sig_handler); 1283 (void) sigset(SIGQUIT, sig_handler); 1284 } 1285 1286 /* 1287 * Create a routing socket for receiving RTM_IFINFO messages. 1288 */ 1289 static int 1290 setup_rtsock(int af) 1291 { 1292 int s; 1293 int flags; 1294 int aware = RTAW_UNDER_IPMP; 1295 1296 s = socket(PF_ROUTE, SOCK_RAW, af); 1297 if (s == -1) { 1298 logperror("setup_rtsock: socket PF_ROUTE"); 1299 exit(1); 1300 } 1301 1302 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { 1303 logperror("setup_rtsock: setsockopt RT_AWARE"); 1304 (void) close(s); 1305 exit(1); 1306 } 1307 1308 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1309 logperror("setup_rtsock: fcntl F_GETFL"); 1310 (void) close(s); 1311 exit(1); 1312 } 1313 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1314 logperror("setup_rtsock: fcntl F_SETFL"); 1315 (void) close(s); 1316 exit(1); 1317 } 1318 if (poll_add(s) == -1) { 1319 (void) close(s); 1320 exit(1); 1321 } 1322 return (s); 1323 } 1324 1325 /* 1326 * Process an RTM_IFINFO message received on a routing socket. 1327 * The return value indicates whether a full interface scan is required. 1328 * Link up/down notifications are reflected in the IFF_RUNNING flag. 1329 * If just the state of the IFF_RUNNING interface flag has changed, a 1330 * a full interface scan isn't required. 1331 */ 1332 static boolean_t 1333 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1334 { 1335 struct sockaddr_dl *sdl; 1336 struct phyint *pi; 1337 uint64_t old_flags; 1338 struct phyint_instance *pii; 1339 1340 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1341 1342 /* 1343 * Although the sockaddr_dl structure is directly after the 1344 * if_msghdr_t structure. At the time of writing, the size of the 1345 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1346 * to the presence of a timeval structure, which contains longs, 1347 * in the if_data structure. Anyway, we know where the message ends, 1348 * so we work backwards to get the start of the sockaddr_dl structure. 1349 */ 1350 /*LINTED*/ 1351 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1352 sizeof (struct sockaddr_dl)); 1353 1354 assert(sdl->sdl_family == AF_LINK); 1355 1356 /* 1357 * The interface name is in sdl_data. 1358 * RTM_IFINFO messages are only generated for logical interface 1359 * zero, so there is no colon and logical interface number to 1360 * strip from the name. The name is not null terminated, but 1361 * there should be enough space in sdl_data to add the null. 1362 */ 1363 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1364 if (debug & D_LINKNOTE) 1365 logdebug("process_rtm_ifinfo: phyint name too long\n"); 1366 return (_B_TRUE); 1367 } 1368 sdl->sdl_data[sdl->sdl_nlen] = 0; 1369 1370 pi = phyint_lookup(sdl->sdl_data); 1371 if (pi == NULL) { 1372 if (debug & D_LINKNOTE) 1373 logdebug("process_rtm_ifinfo: phyint lookup failed" 1374 " for %s\n", sdl->sdl_data); 1375 return (_B_TRUE); 1376 } 1377 1378 /* 1379 * We want to try and avoid doing a full interface scan for 1380 * link state notifications from the datalink layer, as indicated 1381 * by the state of the IFF_RUNNING flag. If just the 1382 * IFF_RUNNING flag has changed state, the link state changes 1383 * are processed without a full scan. 1384 * If there is both an IPv4 and IPv6 instance associated with 1385 * the physical interface, we will get an RTM_IFINFO message 1386 * for each instance. If we just maintained a single copy of 1387 * the physical interface flags, it would appear that no flags 1388 * had changed when the second message is processed, leading us 1389 * to believe that the message wasn't generated by a flags change, 1390 * and that a full interface scan is required. 1391 * To get around this problem, two additional copies of the flags 1392 * are kept, one copy for each instance. These are only used in 1393 * this routine. At any one time, all three copies of the flags 1394 * should be identical except for the IFF_RUNNING flag. The 1395 * copy of the flags in the "phyint" structure is always up to 1396 * date. 1397 */ 1398 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1399 if (pii == NULL) { 1400 if (debug & D_LINKNOTE) 1401 logdebug("process_rtm_ifinfo: no instance of address " 1402 "family %s for %s\n", AF_STR(type), pi->pi_name); 1403 return (_B_TRUE); 1404 } 1405 1406 old_flags = pii->pii_flags; 1407 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1408 pi->pi_flags = pii->pii_flags; 1409 1410 if (debug & D_LINKNOTE) { 1411 logdebug("process_rtm_ifinfo: %s address family: %s, " 1412 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1413 AF_STR(type), old_flags, pi->pi_flags); 1414 } 1415 1416 /* 1417 * If IFF_STANDBY has changed, indicate that the interface has changed 1418 * types and refresh IFF_INACTIVE if need be. 1419 */ 1420 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) { 1421 phyint_changed(pi); 1422 if (pii->pii_flags & IFF_STANDBY) 1423 phyint_standby_refresh_inactive(pi); 1424 } 1425 1426 /* Has just the IFF_RUNNING flag changed state ? */ 1427 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1428 struct phyint_instance *pii_other; 1429 /* 1430 * It wasn't just a link state change. Update 1431 * the other instance's copy of the flags. 1432 */ 1433 pii_other = phyint_inst_other(pii); 1434 if (pii_other != NULL) 1435 pii_other->pii_flags = pii->pii_flags; 1436 return (_B_TRUE); 1437 } 1438 1439 return (_B_FALSE); 1440 } 1441 1442 /* 1443 * Retrieve as many routing socket messages as possible, and try to 1444 * empty the routing sockets. Initiate full scan of targets or interfaces 1445 * as needed. 1446 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1447 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1448 */ 1449 static void 1450 process_rtsock(int rtsock_v4, int rtsock_v6) 1451 { 1452 int nbytes; 1453 int64_t msg[2048 / 8]; 1454 struct rt_msghdr *rtm; 1455 boolean_t need_if_scan = _B_FALSE; 1456 boolean_t need_rt_scan = _B_FALSE; 1457 boolean_t rtm_ifinfo_seen = _B_FALSE; 1458 int type; 1459 1460 /* Read as many messages as possible and try to empty the sockets */ 1461 for (type = AF_INET; ; type = AF_INET6) { 1462 for (;;) { 1463 nbytes = read((type == AF_INET) ? rtsock_v4 : 1464 rtsock_v6, msg, sizeof (msg)); 1465 if (nbytes <= 0) { 1466 /* No more messages */ 1467 break; 1468 } 1469 rtm = (struct rt_msghdr *)msg; 1470 if (rtm->rtm_version != RTM_VERSION) { 1471 logerr("process_rtsock: version %d " 1472 "not understood\n", rtm->rtm_version); 1473 break; 1474 } 1475 1476 if (debug & D_PHYINT) { 1477 logdebug("process_rtsock: message %d\n", 1478 rtm->rtm_type); 1479 } 1480 1481 switch (rtm->rtm_type) { 1482 case RTM_NEWADDR: 1483 case RTM_DELADDR: 1484 /* 1485 * Some logical interface has changed, 1486 * have to scan everything to determine 1487 * what actually changed. 1488 */ 1489 need_if_scan = _B_TRUE; 1490 break; 1491 1492 case RTM_IFINFO: 1493 rtm_ifinfo_seen = _B_TRUE; 1494 need_if_scan |= process_rtm_ifinfo( 1495 (if_msghdr_t *)rtm, type); 1496 break; 1497 1498 case RTM_ADD: 1499 case RTM_DELETE: 1500 case RTM_CHANGE: 1501 case RTM_OLDADD: 1502 case RTM_OLDDEL: 1503 need_rt_scan = _B_TRUE; 1504 break; 1505 1506 default: 1507 /* Not interesting */ 1508 break; 1509 } 1510 } 1511 if (type == AF_INET6) 1512 break; 1513 } 1514 1515 if (need_if_scan) { 1516 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1517 logdebug("process_rtsock: synchronizing with kernel\n"); 1518 initifs(); 1519 } else if (rtm_ifinfo_seen) { 1520 if (debug & D_LINKNOTE) 1521 logdebug("process_rtsock: " 1522 "link up/down notification(s) seen\n"); 1523 process_link_state_changes(); 1524 } 1525 1526 if (need_rt_scan) 1527 init_router_targets(); 1528 } 1529 1530 /* 1531 * Look if the phyint instance or one of its logints have been removed from 1532 * the kernel and take appropriate action. 1533 * Uses {pii,li}_in_use. 1534 */ 1535 static void 1536 check_if_removed(struct phyint_instance *pii) 1537 { 1538 struct logint *li; 1539 struct logint *next_li; 1540 1541 /* Detect phyints that have been removed from the kernel. */ 1542 if (!pii->pii_in_use) { 1543 logtrace("%s %s has been removed from kernel\n", 1544 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1545 phyint_inst_delete(pii); 1546 } else { 1547 /* Detect logints that have been removed. */ 1548 for (li = pii->pii_logint; li != NULL; li = next_li) { 1549 next_li = li->li_next; 1550 if (!li->li_in_use) { 1551 logint_delete(li); 1552 } 1553 } 1554 } 1555 } 1556 1557 /* 1558 * Parse the supplied mib2 information to extract the routing information 1559 * table. Process the routing table to get the list of known onlink routers 1560 * and update our database. These onlink routers will serve as probe 1561 * targets. 1562 */ 1563 static void 1564 update_router_list(mib_item_t *item) 1565 { 1566 for (; item != NULL; item = item->mi_next) { 1567 if (item->mi_opthdr.name == 0) 1568 continue; 1569 if (item->mi_opthdr.level == MIB2_IP && 1570 item->mi_opthdr.name == MIB2_IP_ROUTE) { 1571 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp, 1572 item->mi_opthdr.len); 1573 } else if (item->mi_opthdr.level == MIB2_IP6 && 1574 item->mi_opthdr.name == MIB2_IP6_ROUTE) { 1575 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp, 1576 item->mi_opthdr.len); 1577 } 1578 } 1579 } 1580 1581 1582 /* 1583 * Convert octet `octp' to a phyint name and store in `ifname' 1584 */ 1585 static void 1586 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) 1587 { 1588 char *cp; 1589 size_t len = MIN(octp->o_length, ifsize - 1); 1590 1591 (void) strncpy(ifname, octp->o_bytes, len); 1592 ifname[len] = '\0'; 1593 1594 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) 1595 *cp = '\0'; 1596 } 1597 1598 /* 1599 * Examine the IPv4 routing table `buf' for possible targets. For each 1600 * possible target, if it's on the same subnet an interface route, pass 1601 * it to router_add_common() for further consideration. 1602 */ 1603 static void 1604 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1605 { 1606 char ifname[LIFNAMSIZ]; 1607 mib2_ipRouteEntry_t *rp, *rp1, *endp; 1608 struct in_addr nexthop_v4; 1609 struct in6_addr nexthop; 1610 1611 if (debug & D_TARGET) 1612 logdebug("ire_process_v4(len %d)\n", len); 1613 1614 if (len == 0) 1615 return; 1616 1617 assert((len % ipRouteEntrySize) == 0); 1618 endp = buf + (len / ipRouteEntrySize); 1619 1620 /* 1621 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1622 * cross-reference them with the interface routes to determine if 1623 * they're possible probe targets. 1624 */ 1625 for (rp = buf; rp < endp; rp++) { 1626 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1627 continue; 1628 1629 /* Get the nexthop address. */ 1630 nexthop_v4.s_addr = rp->ipRouteNextHop; 1631 1632 /* 1633 * Rescan the routing table looking for interface routes that 1634 * are on the same subnet, and try to add them. If they're 1635 * not relevant (e.g., the interface route isn't part of an 1636 * IPMP group, router_add_common() will discard). 1637 */ 1638 for (rp1 = buf; rp1 < endp; rp1++) { 1639 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || 1640 rp1->ipRouteIfIndex.o_length == 0) 1641 continue; 1642 1643 if ((rp1->ipRouteDest & rp1->ipRouteMask) != 1644 (nexthop_v4.s_addr & rp1->ipRouteMask)) 1645 continue; 1646 1647 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); 1648 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1649 router_add_common(AF_INET, ifname, nexthop); 1650 } 1651 } 1652 } 1653 1654 void 1655 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1656 { 1657 struct phyint_instance *pii; 1658 struct phyint *pi; 1659 1660 if (debug & D_TARGET) 1661 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1662 1663 /* 1664 * Retrieve the phyint instance; bail if it's not known to us yet. 1665 */ 1666 pii = phyint_inst_lookup(af, ifname); 1667 if (pii == NULL) 1668 return; 1669 1670 /* 1671 * Don't use our own addresses as targets. 1672 */ 1673 if (own_address(nexthop)) 1674 return; 1675 1676 /* 1677 * If the phyint is part a named group, then add the address to all 1678 * members of the group; note that this is suboptimal in the IPv4 case 1679 * as it has already been added to all matching interfaces in 1680 * ire_process_v4(). Otherwise, add the address only to the phyint 1681 * itself, since other phyints in the anongroup may not be on the same 1682 * subnet. 1683 */ 1684 pi = pii->pii_phyint; 1685 if (pi->pi_group == phyint_anongroup) { 1686 target_add(pii, nexthop, _B_TRUE); 1687 } else { 1688 pi = pi->pi_group->pg_phyint; 1689 for (; pi != NULL; pi = pi->pi_pgnext) 1690 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1691 } 1692 } 1693 1694 /* 1695 * Examine the IPv6 routing table `buf' for possible link-local targets, and 1696 * pass any contenders to router_add_common() for further consideration. 1697 */ 1698 static void 1699 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1700 { 1701 struct lifreq lifr; 1702 char ifname[LIFNAMSIZ]; 1703 char grname[LIFGRNAMSIZ]; 1704 mib2_ipv6RouteEntry_t *rp, *rp1, *endp; 1705 struct in6_addr nexthop_v6; 1706 1707 if (debug & D_TARGET) 1708 logdebug("ire_process_v6(len %d)\n", len); 1709 1710 if (len == 0) 1711 return; 1712 1713 assert((len % ipv6RouteEntrySize) == 0); 1714 endp = buf + (len / ipv6RouteEntrySize); 1715 1716 /* 1717 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1718 * cross-reference them with the interface routes to determine if 1719 * they're possible probe targets. 1720 */ 1721 for (rp = buf; rp < endp; rp++) { 1722 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || 1723 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) 1724 continue; 1725 1726 /* Get the nexthop address. */ 1727 nexthop_v6 = rp->ipv6RouteNextHop; 1728 1729 /* 1730 * The interface name should always exist for link-locals; 1731 * we use it to map this entry to an IPMP group name. 1732 */ 1733 if (rp->ipv6RouteIfIndex.o_length == 0) 1734 continue; 1735 1736 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); 1737 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || 1738 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { 1739 continue; 1740 } 1741 1742 /* 1743 * Rescan the list of routes for interface routes, and add the 1744 * above target to any interfaces in the same IPMP group. 1745 */ 1746 for (rp1 = buf; rp1 < endp; rp1++) { 1747 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || 1748 rp1->ipv6RouteIfIndex.o_length == 0) { 1749 continue; 1750 } 1751 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); 1752 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); 1753 1754 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && 1755 strcmp(lifr.lifr_groupname, grname) == 0) { 1756 router_add_common(AF_INET6, ifname, nexthop_v6); 1757 } 1758 } 1759 } 1760 } 1761 1762 /* 1763 * Build a list of target routers, by scanning the routing tables. 1764 * It is assumed that interface routes exist, to reach the routers. 1765 */ 1766 static void 1767 init_router_targets(void) 1768 { 1769 struct target *tg; 1770 struct target *next_tg; 1771 struct phyint_instance *pii; 1772 struct phyint *pi; 1773 1774 if (force_mcast) 1775 return; 1776 1777 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1778 pi = pii->pii_phyint; 1779 /* 1780 * Set tg_in_use to false only for router targets. 1781 */ 1782 if (!pii->pii_targets_are_routers) 1783 continue; 1784 1785 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1786 tg->tg_in_use = 0; 1787 } 1788 1789 if (mibwalk(update_router_list) == -1) 1790 exit(1); 1791 1792 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1793 pi = pii->pii_phyint; 1794 if (!pii->pii_targets_are_routers) 1795 continue; 1796 1797 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1798 next_tg = tg->tg_next; 1799 /* 1800 * If the group has failed, it's likely the route was 1801 * removed by an application affected by that failure. 1802 * In that case, we keep the target so that we can 1803 * reliably repair, at which point we'll refresh the 1804 * target list again. 1805 */ 1806 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) 1807 target_delete(tg); 1808 } 1809 } 1810 } 1811 1812 /* 1813 * Attempt to assign host targets to any interfaces that do not currently 1814 * have probe targets by sharing targets with other interfaces in the group. 1815 */ 1816 static void 1817 init_host_targets(void) 1818 { 1819 struct phyint_instance *pii; 1820 struct phyint_group *pg; 1821 1822 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1823 pg = pii->pii_phyint->pi_group; 1824 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1825 dup_host_targets(pii); 1826 } 1827 } 1828 1829 /* 1830 * Duplicate host targets from other phyints of the group to 1831 * the phyint instance 'desired_pii'. 1832 */ 1833 static void 1834 dup_host_targets(struct phyint_instance *desired_pii) 1835 { 1836 int af; 1837 struct phyint *pi; 1838 struct phyint_instance *pii; 1839 struct target *tg; 1840 1841 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 1842 1843 af = desired_pii->pii_af; 1844 1845 /* 1846 * For every phyint in the same group as desired_pii, check if 1847 * it has any host targets. If so add them to desired_pii. 1848 */ 1849 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 1850 pii = PHYINT_INSTANCE(pi, af); 1851 /* 1852 * We know that we don't have targets on this phyint instance 1853 * since we have been called. But we still check for 1854 * pii_targets_are_routers because another phyint instance 1855 * could have router targets, since IFF_NOFAILOVER addresses 1856 * on different phyint instances may belong to different 1857 * subnets. 1858 */ 1859 if ((pii == NULL) || (pii == desired_pii) || 1860 pii->pii_targets_are_routers) 1861 continue; 1862 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1863 target_create(desired_pii, tg->tg_address, _B_FALSE); 1864 } 1865 } 1866 } 1867 1868 static void 1869 usage(char *cmd) 1870 { 1871 (void) fprintf(stderr, "usage: %s\n", cmd); 1872 } 1873 1874 1875 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 1876 1877 /* Get an option from the /etc/default/mpathd file */ 1878 static char * 1879 getdefault(char *name) 1880 { 1881 char namebuf[BUFSIZ]; 1882 char *value = NULL; 1883 1884 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 1885 char *cp; 1886 int flags; 1887 1888 /* 1889 * ignore case 1890 */ 1891 flags = defcntl(DC_GETFLAGS, 0); 1892 TURNOFF(flags, DC_CASE); 1893 (void) defcntl(DC_SETFLAGS, flags); 1894 1895 /* Add "=" to the name */ 1896 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 1897 (void) strncat(namebuf, "=", 2); 1898 1899 if ((cp = defread(namebuf)) != NULL) 1900 value = strdup(cp); 1901 1902 /* close */ 1903 (void) defopen((char *)NULL); 1904 } 1905 return (value); 1906 } 1907 1908 1909 /* 1910 * Command line options below 1911 */ 1912 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 1913 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ 1914 static boolean_t adopt = _B_FALSE; 1915 static boolean_t foreground = _B_FALSE; 1916 1917 int 1918 main(int argc, char *argv[]) 1919 { 1920 int i; 1921 int c; 1922 struct phyint *pi; 1923 struct phyint_instance *pii; 1924 char *value; 1925 1926 argv0 = argv; /* Saved for re-exec on SIGHUP */ 1927 srandom(gethostid()); /* Initialize the random number generator */ 1928 1929 /* 1930 * NOTE: The messages output by in.mpathd are not suitable for 1931 * translation, so we do not call textdomain(). 1932 */ 1933 (void) setlocale(LC_ALL, ""); 1934 1935 /* 1936 * Get the user specified value of 'failure detection time' 1937 * from /etc/default/mpathd 1938 */ 1939 value = getdefault("FAILURE_DETECTION_TIME"); 1940 if (value != NULL) { 1941 user_failure_detection_time = 1942 (int)strtol((char *)value, NULL, 0); 1943 1944 if (user_failure_detection_time <= 0) { 1945 user_failure_detection_time = FAILURE_DETECTION_TIME; 1946 logerr("Invalid failure detection time %s, assuming " 1947 "default of %d ms\n", value, 1948 user_failure_detection_time); 1949 1950 } else if (user_failure_detection_time < 1951 MIN_FAILURE_DETECTION_TIME) { 1952 user_failure_detection_time = 1953 MIN_FAILURE_DETECTION_TIME; 1954 logerr("Too small failure detection time of %s, " 1955 "assuming minimum of %d ms\n", value, 1956 user_failure_detection_time); 1957 } 1958 free(value); 1959 } else { 1960 /* User has not specified the parameter, Use default value */ 1961 user_failure_detection_time = FAILURE_DETECTION_TIME; 1962 } 1963 1964 /* 1965 * This gives the frequency at which probes will be sent. 1966 * When fdt ms elapses, we should be able to determine 1967 * whether 5 consecutive probes have failed or not. 1968 * 1 probe will be sent in every user_probe_interval ms, 1969 * randomly anytime in the (0.5 - 1.0) 2nd half of every 1970 * user_probe_interval. Thus when we send out probe 'n' we 1971 * can be sure that probe 'n - 2' is lost, if we have not 1972 * got the ack. (since the probe interval is > crtt). But 1973 * probe 'n - 1' may be a valid unacked probe, since the 1974 * time between 2 successive probes could be as small as 1975 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 1976 */ 1977 user_probe_interval = user_failure_detection_time / 1978 (NUM_PROBE_FAILS + 2); 1979 1980 /* 1981 * Get the user specified value of failback_enabled from 1982 * /etc/default/mpathd 1983 */ 1984 value = getdefault("FAILBACK"); 1985 if (value != NULL) { 1986 if (strcasecmp(value, "yes") == 0) 1987 failback_enabled = _B_TRUE; 1988 else if (strcasecmp(value, "no") == 0) 1989 failback_enabled = _B_FALSE; 1990 else 1991 logerr("Invalid value for FAILBACK %s\n", value); 1992 free(value); 1993 } else { 1994 failback_enabled = _B_TRUE; 1995 } 1996 1997 /* 1998 * Get the user specified value of track_all_phyints from 1999 * /etc/default/mpathd. The sense is reversed in 2000 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 2001 */ 2002 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2003 if (value != NULL) { 2004 if (strcasecmp(value, "yes") == 0) 2005 track_all_phyints = _B_FALSE; 2006 else if (strcasecmp(value, "no") == 0) 2007 track_all_phyints = _B_TRUE; 2008 else 2009 logerr("Invalid value for " 2010 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2011 free(value); 2012 } else { 2013 track_all_phyints = _B_FALSE; 2014 } 2015 2016 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2017 switch (c) { 2018 case 'a': 2019 adopt = _B_TRUE; 2020 break; 2021 case 'm': 2022 force_mcast = _B_TRUE; 2023 break; 2024 case 'd': 2025 debug = D_ALL; 2026 foreground = _B_TRUE; 2027 break; 2028 case 'D': 2029 i = (int)strtol(optarg, NULL, 0); 2030 if (i == 0) { 2031 (void) fprintf(stderr, "Bad debug flags: %s\n", 2032 optarg); 2033 exit(1); 2034 } 2035 debug |= i; 2036 foreground = _B_TRUE; 2037 break; 2038 case 'l': 2039 /* 2040 * Turn off link state notification handling. 2041 * Undocumented command line flag, for debugging 2042 * purposes. 2043 */ 2044 handle_link_notifications = _B_FALSE; 2045 break; 2046 default: 2047 usage(argv[0]); 2048 exit(1); 2049 } 2050 } 2051 2052 /* 2053 * The sockets for the loopback command interface should be listening 2054 * before we fork and exit in daemonize(). This way, whoever started us 2055 * can use the loopback interface as soon as they get a zero exit 2056 * status. 2057 */ 2058 lsock_v4 = setup_listener(AF_INET); 2059 lsock_v6 = setup_listener(AF_INET6); 2060 2061 if (lsock_v4 < 0 && lsock_v6 < 0) { 2062 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2063 exit(1); 2064 } 2065 2066 if (!foreground) { 2067 if (!daemonize()) { 2068 logerr("cannot daemonize\n"); 2069 exit(EXIT_FAILURE); 2070 } 2071 initlog(); 2072 } 2073 2074 /* 2075 * Initializations: 2076 * 1. Create ifsock* sockets. These are used for performing SIOC* 2077 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2078 * 2. Initialize a pipe for handling/recording signal events. 2079 * 3. Create the routing sockets, used for listening 2080 * to routing / interface changes. 2081 * 4. phyint_init() - Initialize physical interface state 2082 * (in mpd_tables.c). Must be done before creating interfaces, 2083 * which timer_init() does indirectly. 2084 * 5. Query kernel for route entry sizes (v4 and v6). 2085 * 6. timer_init() - Initialize timer related stuff 2086 * 7. initifs() - Initialize our database of all known interfaces 2087 * 8. init_router_targets() - Initialize our database of all known 2088 * router targets. 2089 */ 2090 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2091 if (ifsock_v4 < 0) { 2092 logperror("main: IPv4 socket open"); 2093 exit(1); 2094 } 2095 2096 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2097 if (ifsock_v6 < 0) { 2098 logperror("main: IPv6 socket open"); 2099 exit(1); 2100 } 2101 2102 setup_eventpipe(); 2103 2104 rtsock_v4 = setup_rtsock(AF_INET); 2105 rtsock_v6 = setup_rtsock(AF_INET6); 2106 2107 if (phyint_init() == -1) { 2108 logerr("cannot initialize physical interface structures"); 2109 exit(1); 2110 } 2111 2112 if (mibwalk(mib_get_constants) == -1) 2113 exit(1); 2114 2115 timer_init(); 2116 2117 initifs(); 2118 2119 /* 2120 * If we're operating in "adopt" mode and no interfaces need to be 2121 * tracked, shut down (ifconfig(1M) will restart us on demand if 2122 * interfaces are subsequently put into multipathing groups). 2123 */ 2124 if (adopt && phyint_instances == NULL) 2125 exit(0); 2126 2127 /* 2128 * Main body. Keep listening for activity on any of the sockets 2129 * that we are monitoring and take appropriate action as necessary. 2130 * signals are also handled synchronously. 2131 */ 2132 for (;;) { 2133 if (poll(pollfds, pollfd_num, -1) < 0) { 2134 if (errno == EINTR) 2135 continue; 2136 logperror("main: poll"); 2137 exit(1); 2138 } 2139 for (i = 0; i < pollfd_num; i++) { 2140 if ((pollfds[i].fd == -1) || 2141 !(pollfds[i].revents & POLLIN)) 2142 continue; 2143 if (pollfds[i].fd == eventpipe_read) { 2144 in_signal(eventpipe_read); 2145 break; 2146 } 2147 if (pollfds[i].fd == rtsock_v4 || 2148 pollfds[i].fd == rtsock_v6) { 2149 process_rtsock(rtsock_v4, rtsock_v6); 2150 break; 2151 } 2152 2153 for (pii = phyint_instances; pii != NULL; 2154 pii = pii->pii_next) { 2155 if (pollfds[i].fd == pii->pii_probe_sock) { 2156 if (pii->pii_af == AF_INET) 2157 in_data(pii); 2158 else 2159 in6_data(pii); 2160 break; 2161 } 2162 } 2163 2164 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2165 if (pi->pi_notes != 0 && 2166 pollfds[i].fd == dlpi_fd(pi->pi_dh)) { 2167 (void) dlpi_recv(pi->pi_dh, NULL, NULL, 2168 NULL, NULL, 0, NULL); 2169 break; 2170 } 2171 } 2172 2173 if (pollfds[i].fd == lsock_v4) 2174 loopback_cmd(lsock_v4, AF_INET); 2175 else if (pollfds[i].fd == lsock_v6) 2176 loopback_cmd(lsock_v6, AF_INET6); 2177 } 2178 } 2179 /* NOTREACHED */ 2180 return (EXIT_SUCCESS); 2181 } 2182 2183 static int 2184 setup_listener(int af) 2185 { 2186 int sock; 2187 int on; 2188 int len; 2189 int ret; 2190 struct sockaddr_storage laddr; 2191 struct sockaddr_in *sin; 2192 struct sockaddr_in6 *sin6; 2193 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2194 2195 assert(af == AF_INET || af == AF_INET6); 2196 2197 sock = socket(af, SOCK_STREAM, 0); 2198 if (sock < 0) { 2199 logperror("setup_listener: socket"); 2200 exit(1); 2201 } 2202 2203 on = 1; 2204 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2205 sizeof (on)) < 0) { 2206 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2207 exit(1); 2208 } 2209 2210 bzero(&laddr, sizeof (laddr)); 2211 laddr.ss_family = af; 2212 2213 if (af == AF_INET) { 2214 sin = (struct sockaddr_in *)&laddr; 2215 sin->sin_port = htons(MPATHD_PORT); 2216 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2217 len = sizeof (struct sockaddr_in); 2218 } else { 2219 sin6 = (struct sockaddr_in6 *)&laddr; 2220 sin6->sin6_port = htons(MPATHD_PORT); 2221 sin6->sin6_addr = loopback_addr; 2222 len = sizeof (struct sockaddr_in6); 2223 } 2224 2225 ret = bind(sock, (struct sockaddr *)&laddr, len); 2226 if (ret < 0) { 2227 if (errno == EADDRINUSE) { 2228 /* 2229 * Another instance of mpathd may be already active. 2230 */ 2231 logerr("main: is another instance of in.mpathd " 2232 "already active?\n"); 2233 exit(1); 2234 } else { 2235 (void) close(sock); 2236 return (-1); 2237 } 2238 } 2239 if (listen(sock, 30) < 0) { 2240 logperror("main: listen"); 2241 exit(1); 2242 } 2243 if (poll_add(sock) == -1) { 2244 (void) close(sock); 2245 exit(1); 2246 } 2247 2248 return (sock); 2249 } 2250 2251 /* 2252 * Table of commands and their expected size; used by loopback_cmd(). 2253 */ 2254 static struct { 2255 const char *name; 2256 unsigned int size; 2257 } commands[] = { 2258 { "MI_PING", sizeof (uint32_t) }, 2259 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2260 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2261 { "MI_QUERY", sizeof (mi_query_t) } 2262 }; 2263 2264 /* 2265 * Commands received over the loopback interface come here (via libipmp). 2266 */ 2267 static void 2268 loopback_cmd(int sock, int family) 2269 { 2270 int newfd; 2271 ssize_t len; 2272 boolean_t is_priv = _B_FALSE; 2273 struct sockaddr_storage peer; 2274 struct sockaddr_in *peer_sin; 2275 struct sockaddr_in6 *peer_sin6; 2276 socklen_t peerlen; 2277 union mi_commands mpi; 2278 char abuf[INET6_ADDRSTRLEN]; 2279 uint_t cmd; 2280 int retval; 2281 2282 peerlen = sizeof (peer); 2283 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2284 if (newfd < 0) { 2285 logperror("loopback_cmd: accept"); 2286 return; 2287 } 2288 2289 switch (family) { 2290 case AF_INET: 2291 /* 2292 * Validate the address and port to make sure that 2293 * non privileged processes don't connect and start 2294 * talking to us. 2295 */ 2296 if (peerlen != sizeof (struct sockaddr_in)) { 2297 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2298 (void) close(newfd); 2299 return; 2300 } 2301 peer_sin = (struct sockaddr_in *)&peer; 2302 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; 2303 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2304 abuf, sizeof (abuf)); 2305 2306 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { 2307 logerr("Attempt to connect from addr %s port %d\n", 2308 abuf, ntohs(peer_sin->sin_port)); 2309 (void) close(newfd); 2310 return; 2311 } 2312 break; 2313 2314 case AF_INET6: 2315 if (peerlen != sizeof (struct sockaddr_in6)) { 2316 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2317 (void) close(newfd); 2318 return; 2319 } 2320 /* 2321 * Validate the address and port to make sure that 2322 * non privileged processes don't connect and start 2323 * talking to us. 2324 */ 2325 peer_sin6 = (struct sockaddr_in6 *)&peer; 2326 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; 2327 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2328 sizeof (abuf)); 2329 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { 2330 logerr("Attempt to connect from addr %s port %d\n", 2331 abuf, ntohs(peer_sin6->sin6_port)); 2332 (void) close(newfd); 2333 return; 2334 } 2335 break; 2336 2337 default: 2338 logdebug("loopback_cmd: family %d\n", family); 2339 (void) close(newfd); 2340 return; 2341 } 2342 2343 /* 2344 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2345 * all supported commands 2346 */ 2347 len = read(newfd, &mpi, sizeof (mpi)); 2348 2349 /* 2350 * In theory, we can receive any sized message for a stream socket, 2351 * but we don't expect that to happen for a small message over a 2352 * loopback connection. 2353 */ 2354 if (len < sizeof (uint32_t)) { 2355 logerr("loopback_cmd: bad command format or read returns " 2356 "partial data %d\n", len); 2357 (void) close(newfd); 2358 return; 2359 } 2360 2361 cmd = mpi.mi_command; 2362 if (cmd >= MI_NCMD) { 2363 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2364 (void) close(newfd); 2365 return; 2366 } 2367 2368 /* 2369 * Only MI_PING and MI_QUERY can come from unprivileged sources. 2370 */ 2371 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { 2372 logerr("Unprivileged request from %s for privileged " 2373 "command %s\n", abuf, commands[cmd].name); 2374 (void) close(newfd); 2375 return; 2376 } 2377 2378 if (len < commands[cmd].size) { 2379 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2380 commands[cmd].name, commands[cmd].size, len); 2381 (void) close(newfd); 2382 return; 2383 } 2384 2385 retval = process_cmd(newfd, &mpi); 2386 if (retval != IPMP_SUCCESS) { 2387 logerr("failed processing %s: %s\n", commands[cmd].name, 2388 ipmp_errmsg(retval)); 2389 } 2390 (void) close(newfd); 2391 } 2392 2393 /* 2394 * Process the commands received via libipmp. 2395 */ 2396 static unsigned int 2397 process_cmd(int newfd, union mi_commands *mpi) 2398 { 2399 struct phyint *pi; 2400 struct mi_offline *mio; 2401 struct mi_undo_offline *miu; 2402 unsigned int retval; 2403 2404 switch (mpi->mi_command) { 2405 case MI_PING: 2406 return (send_result(newfd, IPMP_SUCCESS, 0)); 2407 2408 case MI_OFFLINE: 2409 mio = &mpi->mi_ocmd; 2410 2411 pi = phyint_lookup(mio->mio_ifname); 2412 if (pi == NULL) 2413 return (send_result(newfd, IPMP_EUNKIF, 0)); 2414 2415 retval = phyint_offline(pi, mio->mio_min_redundancy); 2416 if (retval == IPMP_FAILURE) 2417 return (send_result(newfd, IPMP_FAILURE, errno)); 2418 2419 return (send_result(newfd, retval, 0)); 2420 2421 case MI_UNDO_OFFLINE: 2422 miu = &mpi->mi_ucmd; 2423 2424 pi = phyint_lookup(miu->miu_ifname); 2425 if (pi == NULL) 2426 return (send_result(newfd, IPMP_EUNKIF, 0)); 2427 2428 retval = phyint_undo_offline(pi); 2429 if (retval == IPMP_FAILURE) 2430 return (send_result(newfd, IPMP_FAILURE, errno)); 2431 2432 return (send_result(newfd, retval, 0)); 2433 2434 case MI_QUERY: 2435 return (process_query(newfd, &mpi->mi_qcmd)); 2436 2437 default: 2438 break; 2439 } 2440 2441 return (send_result(newfd, IPMP_EPROTO, 0)); 2442 } 2443 2444 /* 2445 * Process the query request pointed to by `miq' and send a reply on file 2446 * descriptor `fd'. Returns an IPMP error code. 2447 */ 2448 static unsigned int 2449 process_query(int fd, mi_query_t *miq) 2450 { 2451 ipmp_addrinfo_t *adinfop; 2452 ipmp_addrinfolist_t *adlp; 2453 ipmp_groupinfo_t *grinfop; 2454 ipmp_groupinfolist_t *grlp; 2455 ipmp_grouplist_t *grlistp; 2456 ipmp_ifinfo_t *ifinfop; 2457 ipmp_ifinfolist_t *iflp; 2458 ipmp_snap_t *snap; 2459 unsigned int retval; 2460 2461 switch (miq->miq_inforeq) { 2462 case IPMP_ADDRINFO: 2463 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, 2464 &adinfop); 2465 if (retval != IPMP_SUCCESS) 2466 return (send_result(fd, retval, errno)); 2467 2468 retval = send_result(fd, IPMP_SUCCESS, 0); 2469 if (retval == IPMP_SUCCESS) 2470 retval = send_addrinfo(fd, adinfop); 2471 2472 ipmp_freeaddrinfo(adinfop); 2473 return (retval); 2474 2475 case IPMP_GROUPLIST: 2476 retval = getgrouplist(&grlistp); 2477 if (retval != IPMP_SUCCESS) 2478 return (send_result(fd, retval, errno)); 2479 2480 retval = send_result(fd, IPMP_SUCCESS, 0); 2481 if (retval == IPMP_SUCCESS) 2482 retval = send_grouplist(fd, grlistp); 2483 2484 ipmp_freegrouplist(grlistp); 2485 return (retval); 2486 2487 case IPMP_GROUPINFO: 2488 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2489 retval = getgroupinfo(miq->miq_grname, &grinfop); 2490 if (retval != IPMP_SUCCESS) 2491 return (send_result(fd, retval, errno)); 2492 2493 retval = send_result(fd, IPMP_SUCCESS, 0); 2494 if (retval == IPMP_SUCCESS) 2495 retval = send_groupinfo(fd, grinfop); 2496 2497 ipmp_freegroupinfo(grinfop); 2498 return (retval); 2499 2500 case IPMP_IFINFO: 2501 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2502 retval = getifinfo(miq->miq_ifname, &ifinfop); 2503 if (retval != IPMP_SUCCESS) 2504 return (send_result(fd, retval, errno)); 2505 2506 retval = send_result(fd, IPMP_SUCCESS, 0); 2507 if (retval == IPMP_SUCCESS) 2508 retval = send_ifinfo(fd, ifinfop); 2509 2510 ipmp_freeifinfo(ifinfop); 2511 return (retval); 2512 2513 case IPMP_SNAP: 2514 /* 2515 * Before taking the snapshot, sync with the kernel. 2516 */ 2517 initifs(); 2518 2519 retval = getsnap(&snap); 2520 if (retval != IPMP_SUCCESS) 2521 return (send_result(fd, retval, errno)); 2522 2523 retval = send_result(fd, IPMP_SUCCESS, 0); 2524 if (retval != IPMP_SUCCESS) 2525 goto out; 2526 2527 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2528 if (retval != IPMP_SUCCESS) 2529 goto out; 2530 2531 retval = send_grouplist(fd, snap->sn_grlistp); 2532 if (retval != IPMP_SUCCESS) 2533 goto out; 2534 2535 iflp = snap->sn_ifinfolistp; 2536 for (; iflp != NULL; iflp = iflp->ifl_next) { 2537 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2538 if (retval != IPMP_SUCCESS) 2539 goto out; 2540 } 2541 2542 grlp = snap->sn_grinfolistp; 2543 for (; grlp != NULL; grlp = grlp->grl_next) { 2544 retval = send_groupinfo(fd, grlp->grl_grinfop); 2545 if (retval != IPMP_SUCCESS) 2546 goto out; 2547 } 2548 2549 adlp = snap->sn_adinfolistp; 2550 for (; adlp != NULL; adlp = adlp->adl_next) { 2551 retval = send_addrinfo(fd, adlp->adl_adinfop); 2552 if (retval != IPMP_SUCCESS) 2553 goto out; 2554 } 2555 out: 2556 ipmp_snap_free(snap); 2557 return (retval); 2558 2559 default: 2560 break; 2561 2562 } 2563 return (send_result(fd, IPMP_EPROTO, 0)); 2564 } 2565 2566 /* 2567 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2568 * Returns an IPMP error code. 2569 */ 2570 static unsigned int 2571 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2572 { 2573 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2574 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; 2575 unsigned int retval; 2576 2577 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2578 if (retval != IPMP_SUCCESS) 2579 return (retval); 2580 2581 retval = ipmp_writetlv(fd, IPMP_IFLIST, 2582 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); 2583 if (retval != IPMP_SUCCESS) 2584 return (retval); 2585 2586 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2587 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); 2588 } 2589 2590 /* 2591 * Send the interface information pointed to by `ifinfop' on file descriptor 2592 * `fd'. Returns an IPMP error code. 2593 */ 2594 static unsigned int 2595 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2596 { 2597 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; 2598 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; 2599 unsigned int retval; 2600 2601 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); 2602 if (retval != IPMP_SUCCESS) 2603 return (retval); 2604 2605 retval = ipmp_writetlv(fd, IPMP_ADDRLIST, 2606 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); 2607 if (retval != IPMP_SUCCESS) 2608 return (retval); 2609 2610 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2611 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); 2612 } 2613 2614 /* 2615 * Send the address information pointed to by `adinfop' on file descriptor 2616 * `fd'. Returns an IPMP error code. 2617 */ 2618 static unsigned int 2619 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) 2620 { 2621 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); 2622 } 2623 2624 /* 2625 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2626 * Returns an IPMP error code. 2627 */ 2628 static unsigned int 2629 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2630 { 2631 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2632 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2633 } 2634 2635 /* 2636 * Initialize an mi_result_t structure using `error' and `syserror' and 2637 * send it on file descriptor `fd'. Returns an IPMP error code. 2638 */ 2639 static unsigned int 2640 send_result(int fd, unsigned int error, int syserror) 2641 { 2642 mi_result_t me; 2643 2644 me.me_mpathd_error = error; 2645 if (error == IPMP_FAILURE) 2646 me.me_sys_error = syserror; 2647 else 2648 me.me_sys_error = 0; 2649 2650 return (ipmp_write(fd, &me, sizeof (me))); 2651 } 2652 2653 /* 2654 * Daemonize the process. 2655 */ 2656 static boolean_t 2657 daemonize(void) 2658 { 2659 switch (fork()) { 2660 case -1: 2661 return (_B_FALSE); 2662 2663 case 0: 2664 /* 2665 * Lose our controlling terminal, and become both a session 2666 * leader and a process group leader. 2667 */ 2668 if (setsid() == -1) 2669 return (_B_FALSE); 2670 2671 /* 2672 * Under POSIX, a session leader can accidentally (through 2673 * open(2)) acquire a controlling terminal if it does not 2674 * have one. Just to be safe, fork() again so we are not a 2675 * session leader. 2676 */ 2677 switch (fork()) { 2678 case -1: 2679 return (_B_FALSE); 2680 2681 case 0: 2682 (void) chdir("/"); 2683 (void) umask(022); 2684 (void) fdwalk(closefunc, NULL); 2685 break; 2686 2687 default: 2688 _exit(EXIT_SUCCESS); 2689 } 2690 break; 2691 2692 default: 2693 _exit(EXIT_SUCCESS); 2694 } 2695 2696 return (_B_TRUE); 2697 } 2698 2699 /* 2700 * The parent has created some fds before forking on purpose, keep them open. 2701 */ 2702 static int 2703 closefunc(void *not_used, int fd) 2704 /* ARGSUSED */ 2705 { 2706 if (fd != lsock_v4 && fd != lsock_v6) 2707 (void) close(fd); 2708 return (0); 2709 } 2710 2711 /* LOGGER */ 2712 2713 #include <syslog.h> 2714 2715 /* 2716 * Logging routines. All routines log to syslog, unless the daemon is 2717 * running in the foreground, in which case the logging goes to stderr. 2718 * 2719 * The following routines are available: 2720 * 2721 * logdebug(): A printf-like function for outputting debug messages 2722 * (messages at LOG_DEBUG) that are only of use to developers. 2723 * 2724 * logtrace(): A printf-like function for outputting tracing messages 2725 * (messages at LOG_INFO) from the daemon. This is typically used 2726 * to log the receipt of interesting network-related conditions. 2727 * 2728 * logerr(): A printf-like function for outputting error messages 2729 * (messages at LOG_ERR) from the daemon. 2730 * 2731 * logperror*(): A set of functions used to output error messages 2732 * (messages at LOG_ERR); these automatically append strerror(errno) 2733 * and a newline to the message passed to them. 2734 * 2735 * NOTE: since the logging functions write to syslog, the messages passed 2736 * to them are not eligible for localization. Thus, gettext() must 2737 * *not* be used. 2738 */ 2739 2740 static int logging = 0; 2741 2742 static void 2743 initlog(void) 2744 { 2745 logging++; 2746 openlog("in.mpathd", LOG_PID, LOG_DAEMON); 2747 } 2748 2749 /* PRINTFLIKE2 */ 2750 void 2751 logmsg(int pri, const char *fmt, ...) 2752 { 2753 va_list ap; 2754 2755 va_start(ap, fmt); 2756 2757 if (logging) 2758 vsyslog(pri, fmt, ap); 2759 else 2760 (void) vfprintf(stderr, fmt, ap); 2761 va_end(ap); 2762 } 2763 2764 /* PRINTFLIKE1 */ 2765 void 2766 logperror(const char *str) 2767 { 2768 if (logging) 2769 syslog(LOG_ERR, "%s: %m\n", str); 2770 else 2771 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 2772 } 2773 2774 void 2775 logperror_pii(struct phyint_instance *pii, const char *str) 2776 { 2777 if (logging) { 2778 syslog(LOG_ERR, "%s (%s %s): %m\n", 2779 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 2780 } else { 2781 (void) fprintf(stderr, "%s (%s %s): %s\n", 2782 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 2783 strerror(errno)); 2784 } 2785 } 2786 2787 void 2788 logperror_li(struct logint *li, const char *str) 2789 { 2790 struct phyint_instance *pii = li->li_phyint_inst; 2791 2792 if (logging) { 2793 syslog(LOG_ERR, "%s (%s %s): %m\n", 2794 str, AF_STR(pii->pii_af), li->li_name); 2795 } else { 2796 (void) fprintf(stderr, "%s (%s %s): %s\n", 2797 str, AF_STR(pii->pii_af), li->li_name, 2798 strerror(errno)); 2799 } 2800 } 2801 2802 void 2803 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 2804 { 2805 if (polled) 2806 (void) poll_remove(pii->pii_probe_sock); 2807 (void) close(pii->pii_probe_sock); 2808 pii->pii_probe_sock = -1; 2809 pii->pii_basetime_inited = 0; 2810 } 2811 2812 boolean_t 2813 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, 2814 struct sockaddr_storage *ssp) 2815 { 2816 addrlist_t *addrp; 2817 2818 if ((addrp = malloc(sizeof (addrlist_t))) == NULL) 2819 return (_B_FALSE); 2820 2821 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); 2822 addrp->al_flags = flags; 2823 addrp->al_addr = *ssp; 2824 addrp->al_next = *addrsp; 2825 *addrsp = addrp; 2826 return (_B_TRUE); 2827 } 2828 2829 void 2830 addrlist_free(addrlist_t **addrsp) 2831 { 2832 addrlist_t *addrp, *next_addrp; 2833 2834 for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { 2835 next_addrp = addrp->al_next; 2836 free(addrp); 2837 } 2838 *addrsp = NULL; 2839 } 2840 2841 /* 2842 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 2843 * tables defined by mib2.h. Pass the table information returned to the 2844 * supplied function. 2845 */ 2846 static int 2847 mibwalk(void (*proc)(mib_item_t *)) 2848 { 2849 mib_item_t *head_item = NULL; 2850 mib_item_t *last_item = NULL; 2851 mib_item_t *tmp; 2852 struct strbuf ctlbuf, databuf; 2853 int flags; 2854 int rval; 2855 uintptr_t buf[512 / sizeof (uintptr_t)]; 2856 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf; 2857 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf; 2858 struct T_error_ack *tea = (struct T_error_ack *)buf; 2859 struct opthdr *req, *optp; 2860 int status = -1; 2861 2862 if (mibfd == -1) { 2863 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) { 2864 logperror("mibwalk(): ip open"); 2865 return (status); 2866 } 2867 } 2868 2869 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2870 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2871 tor->OPT_length = sizeof (struct opthdr); 2872 tor->MGMT_flags = T_CURRENT; 2873 2874 /* 2875 * Note: we use the special level value below so that IP will return 2876 * us information concerning IRE_MARK_TESTHIDDEN routes. 2877 */ 2878 req = (struct opthdr *)&tor[1]; 2879 req->level = EXPER_IP_AND_ALL_IRES; 2880 req->name = 0; 2881 req->len = 0; 2882 2883 ctlbuf.buf = (char *)&buf; 2884 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 2885 2886 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) { 2887 logperror("mibwalk(): putmsg(ctl)"); 2888 return (status); 2889 } 2890 2891 /* 2892 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 2893 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 2894 * a control and data part. The control part contains a struct 2895 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 2896 * the level, name and length of the data in the data part. The 2897 * data part contains the actual table data. The last message 2898 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 2899 * single option with zero optlen. 2900 */ 2901 for (;;) { 2902 errno = flags = 0; 2903 ctlbuf.maxlen = sizeof (buf); 2904 rval = getmsg(mibfd, &ctlbuf, NULL, &flags); 2905 if (rval & MORECTL || rval < 0) { 2906 if (errno == EINTR) 2907 continue; 2908 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n", 2909 rval, errno); 2910 goto error; 2911 } 2912 if (ctlbuf.len < sizeof (t_scalar_t)) { 2913 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len); 2914 goto error; 2915 } 2916 2917 switch (toa->PRIM_type) { 2918 case T_ERROR_ACK: 2919 if (ctlbuf.len < sizeof (struct T_error_ack)) { 2920 logerr("mibwalk(): T_ERROR_ACK ctlbuf " 2921 "too short: %d\n", ctlbuf.len); 2922 goto error; 2923 } 2924 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n" 2925 " UNIX_err = 0x%lx\n", tea->TLI_error, 2926 t_strerror(tea->TLI_error), tea->UNIX_error); 2927 goto error; 2928 2929 case T_OPTMGMT_ACK: 2930 optp = (struct opthdr *)&toa[1]; 2931 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + 2932 sizeof (struct opthdr))) { 2933 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too " 2934 "short: %d\n", ctlbuf.len); 2935 goto error; 2936 } 2937 if (toa->MGMT_flags != T_SUCCESS) { 2938 logerr("mibwalk(): MGMT_flags != T_SUCCESS: " 2939 "0x%lx\n", toa->MGMT_flags); 2940 goto error; 2941 } 2942 break; 2943 2944 default: 2945 goto error; 2946 } 2947 /* The following assert also implies MGMT_flags == T_SUCCESS */ 2948 assert(toa->PRIM_type == T_OPTMGMT_ACK); 2949 2950 /* 2951 * We have reached the end of this T_OPTMGMT_ACK 2952 * message. If this is the last message i.e EOD, 2953 * break, else process the next T_OPTMGMT_ACK msg. 2954 */ 2955 if (rval == 0) { 2956 if (optp->len == 0 && optp->name == 0 && 2957 optp->level == 0) { 2958 /* This is the EOD message. */ 2959 break; 2960 } 2961 /* Not EOD but no data to retrieve */ 2962 continue; 2963 } 2964 2965 /* 2966 * We should only be here if MOREDATA was set. 2967 * Allocate an empty mib_item_t and link into the list 2968 * of MIB items. 2969 */ 2970 if ((tmp = malloc(sizeof (*tmp))) == NULL) { 2971 logperror("mibwalk(): malloc() failed."); 2972 goto error; 2973 } 2974 if (last_item != NULL) 2975 last_item->mi_next = tmp; 2976 else 2977 head_item = tmp; 2978 last_item = tmp; 2979 last_item->mi_next = NULL; 2980 last_item->mi_opthdr = *optp; 2981 last_item->mi_valp = malloc(optp->len); 2982 if (last_item->mi_valp == NULL) { 2983 logperror("mibwalk(): malloc() failed."); 2984 goto error; 2985 } 2986 2987 databuf.maxlen = last_item->mi_opthdr.len; 2988 databuf.buf = (char *)last_item->mi_valp; 2989 databuf.len = 0; 2990 2991 /* Retrieve the actual MIB data */ 2992 for (;;) { 2993 flags = 0; 2994 if ((rval = getmsg(mibfd, NULL, &databuf, 2995 &flags)) != 0) { 2996 if (rval < 0 && errno == EINTR) 2997 continue; 2998 /* 2999 * We shouldn't get MOREDATA here so treat that 3000 * as an error. 3001 */ 3002 logperror("mibwalk(): getmsg(data)"); 3003 goto error; 3004 } 3005 break; 3006 } 3007 } 3008 status = 0; 3009 /* Pass the accumulated MIB data to the supplied function pointer */ 3010 (*proc)(head_item); 3011 error: 3012 while (head_item != NULL) { 3013 tmp = head_item; 3014 head_item = tmp->mi_next; 3015 free(tmp->mi_valp); 3016 free(tmp); 3017 } 3018 return (status); 3019 } 3020 3021 /* 3022 * Parse the supplied mib2 information to get the size of routing table 3023 * entries. This is needed when running in a branded zone where the 3024 * Solaris application environment and the Solaris kernel may not be the 3025 * the same release version. 3026 */ 3027 static void 3028 mib_get_constants(mib_item_t *item) 3029 { 3030 mib2_ip_t *ipv4; 3031 mib2_ipv6IfStatsEntry_t *ipv6; 3032 3033 for (; item != NULL; item = item->mi_next) { 3034 if (item->mi_opthdr.name != 0) 3035 continue; 3036 if (item->mi_opthdr.level == MIB2_IP) { 3037 ipv4 = (mib2_ip_t *)item->mi_valp; 3038 ipRouteEntrySize = ipv4->ipRouteEntrySize; 3039 } else if (item->mi_opthdr.level == MIB2_IP6) { 3040 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp; 3041 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize; 3042 } 3043 } 3044 } 3045