1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 #include "mpd_defs.h" 27 #include "mpd_tables.h" 28 29 int debug = 0; /* Debug flag */ 30 static int pollfd_num = 0; /* Num. of poll descriptors */ 31 static struct pollfd *pollfds = NULL; /* Array of poll descriptors */ 32 /* All times below in ms */ 33 int user_failure_detection_time; /* user specified failure detection */ 34 /* time (fdt) */ 35 int user_probe_interval; /* derived from user specified fdt */ 36 37 /* 38 * Structure to store mib2 information returned by the kernel. 39 * This is used to process routing table information. 40 */ 41 typedef struct mib_item_s { 42 struct mib_item_s *mi_next; 43 struct opthdr mi_opthdr; 44 void *mi_valp; 45 } mib_item_t; 46 47 static int rtsock_v4; /* AF_INET routing socket */ 48 static int rtsock_v6; /* AF_INET6 routing socket */ 49 int ifsock_v4 = -1; /* IPv4 socket for ioctls */ 50 int ifsock_v6 = -1; /* IPv6 socket for ioctls */ 51 static int lsock_v4; /* Listen socket to detect mpathd */ 52 static int lsock_v6; /* Listen socket to detect mpathd */ 53 static int mibfd = -1; /* fd to get mib info */ 54 static boolean_t force_mcast = _B_FALSE; /* Only for test purposes */ 55 56 static uint_t last_initifs_time; /* Time when initifs was last run */ 57 static char **argv0; /* Saved for re-exec on SIGHUP */ 58 boolean_t handle_link_notifications = _B_TRUE; 59 static int ipRouteEntrySize; /* Size of IPv4 route entry */ 60 static int ipv6RouteEntrySize; /* Size of IPv6 route entry */ 61 62 static void initlog(void); 63 static void run_timeouts(void); 64 static void initifs(void); 65 static void check_if_removed(struct phyint_instance *pii); 66 static void select_test_ifs(void); 67 static void update_router_list(mib_item_t *item); 68 static void mib_get_constants(mib_item_t *item); 69 static int mibwalk(void (*proc)(mib_item_t *)); 70 static void ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len); 71 static void ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len); 72 static void router_add_common(int af, char *ifname, 73 struct in6_addr nexthop); 74 static void init_router_targets(); 75 static void cleanup(void); 76 static int setup_listener(int af); 77 static void check_config(void); 78 static void check_testconfig(void); 79 static void check_addr_unique(struct phyint_instance *, 80 struct sockaddr_storage *); 81 static void init_host_targets(void); 82 static void dup_host_targets(struct phyint_instance *desired_pii); 83 static void loopback_cmd(int sock, int family); 84 static boolean_t daemonize(void); 85 static int closefunc(void *, int); 86 static unsigned int process_cmd(int newfd, union mi_commands *mpi); 87 static unsigned int process_query(int fd, mi_query_t *miq); 88 static unsigned int send_addrinfo(int fd, ipmp_addrinfo_t *adinfop); 89 static unsigned int send_groupinfo(int fd, ipmp_groupinfo_t *grinfop); 90 static unsigned int send_grouplist(int fd, ipmp_grouplist_t *grlistp); 91 static unsigned int send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop); 92 static unsigned int send_result(int fd, unsigned int error, int syserror); 93 94 addrlist_t *localaddrs; 95 96 /* 97 * Return the current time in milliseconds (from an arbitrary reference) 98 * truncated to fit into an int. Truncation is ok since we are interested 99 * only in differences and not the absolute values. 100 */ 101 uint_t 102 getcurrenttime(void) 103 { 104 uint_t cur_time; /* In ms */ 105 106 /* 107 * Use of a non-user-adjustable source of time is 108 * required. However millisecond precision is sufficient. 109 * divide by 10^6 110 */ 111 cur_time = (uint_t)(gethrtime() / 1000000LL); 112 return (cur_time); 113 } 114 115 uint64_t 116 getcurrentsec(void) 117 { 118 return (gethrtime() / NANOSEC); 119 } 120 121 /* 122 * Add fd to the set being polled. Returns 0 if ok; -1 if failed. 123 */ 124 int 125 poll_add(int fd) 126 { 127 int i; 128 int new_num; 129 struct pollfd *newfds; 130 retry: 131 /* Check if already present */ 132 for (i = 0; i < pollfd_num; i++) { 133 if (pollfds[i].fd == fd) 134 return (0); 135 } 136 /* Check for empty spot already present */ 137 for (i = 0; i < pollfd_num; i++) { 138 if (pollfds[i].fd == -1) { 139 pollfds[i].fd = fd; 140 return (0); 141 } 142 } 143 144 /* Allocate space for 32 more fds and initialize to -1 */ 145 new_num = pollfd_num + 32; 146 newfds = realloc(pollfds, new_num * sizeof (struct pollfd)); 147 if (newfds == NULL) { 148 logperror("poll_add: realloc"); 149 return (-1); 150 } 151 for (i = pollfd_num; i < new_num; i++) { 152 newfds[i].fd = -1; 153 newfds[i].events = POLLIN; 154 } 155 pollfd_num = new_num; 156 pollfds = newfds; 157 goto retry; 158 } 159 160 /* 161 * Remove fd from the set being polled. Returns 0 if ok; -1 if failed. 162 */ 163 int 164 poll_remove(int fd) 165 { 166 int i; 167 168 /* Check if already present */ 169 for (i = 0; i < pollfd_num; i++) { 170 if (pollfds[i].fd == fd) { 171 pollfds[i].fd = -1; 172 return (0); 173 } 174 } 175 return (-1); 176 } 177 178 /* 179 * Extract information about the phyint instance. If the phyint instance still 180 * exists in the kernel then set pii_in_use, else clear it. check_if_removed() 181 * will use it to detect phyint instances that don't exist any longer and 182 * remove them, from our database of phyint instances. 183 * Return value: 184 * returns true if the phyint instance exists in the kernel, 185 * returns false otherwise 186 */ 187 static boolean_t 188 pii_process(int af, char *name, struct phyint_instance **pii_p) 189 { 190 int err; 191 struct phyint_instance *pii; 192 struct phyint_instance *pii_other; 193 194 if (debug & D_PHYINT) 195 logdebug("pii_process(%s %s)\n", AF_STR(af), name); 196 197 pii = phyint_inst_lookup(af, name); 198 if (pii == NULL) { 199 /* 200 * Phyint instance does not exist in our tables, 201 * create new phyint instance 202 */ 203 pii = phyint_inst_init_from_k(af, name); 204 } else { 205 /* Phyint exists in our tables */ 206 err = phyint_inst_update_from_k(pii); 207 208 switch (err) { 209 case PI_IOCTL_ERROR: 210 /* Some ioctl error. don't change anything */ 211 pii->pii_in_use = 1; 212 break; 213 214 case PI_GROUP_CHANGED: 215 case PI_IFINDEX_CHANGED: 216 /* 217 * Interface index or group membership has changed. 218 * Delete the old state and recreate based on the new 219 * state (it may no longer be in a group). 220 */ 221 pii_other = phyint_inst_other(pii); 222 if (pii_other != NULL) 223 phyint_inst_delete(pii_other); 224 phyint_inst_delete(pii); 225 pii = phyint_inst_init_from_k(af, name); 226 break; 227 228 case PI_DELETED: 229 /* Phyint instance has disappeared from kernel */ 230 pii->pii_in_use = 0; 231 break; 232 233 case PI_OK: 234 /* Phyint instance exists and is fine */ 235 pii->pii_in_use = 1; 236 break; 237 238 default: 239 /* Unknown status */ 240 logerr("pii_process: Unknown status %d\n", err); 241 break; 242 } 243 } 244 245 *pii_p = pii; 246 if (pii != NULL) 247 return (pii->pii_in_use ? _B_TRUE : _B_FALSE); 248 else 249 return (_B_FALSE); 250 } 251 252 /* 253 * Scan all interfaces to detect changes as well as new and deleted interfaces 254 */ 255 static void 256 initifs() 257 { 258 int i, nlifr; 259 int af; 260 char *cp; 261 char *buf; 262 int sockfd; 263 uint64_t flags; 264 struct lifnum lifn; 265 struct lifconf lifc; 266 struct lifreq lifreq; 267 struct lifreq *lifr; 268 struct logint *li; 269 struct phyint_instance *pii; 270 struct phyint_instance *next_pii; 271 struct phyint_group *pg, *next_pg; 272 char pi_name[LIFNAMSIZ + 1]; 273 274 if (debug & D_PHYINT) 275 logdebug("initifs: Scanning interfaces\n"); 276 277 last_initifs_time = getcurrenttime(); 278 279 /* 280 * Free the existing local address list; we'll build a new list below. 281 */ 282 addrlist_free(&localaddrs); 283 284 /* 285 * Mark the interfaces so that we can find phyints and logints 286 * which have disappeared from the kernel. pii_process() and 287 * logint_init_from_k() will set {pii,li}_in_use when they find 288 * the interface in the kernel. Also, clear dupaddr bit on probe 289 * logint. check_addr_unique() will set the dupaddr bit on the 290 * probe logint, if the testaddress is not unique. 291 */ 292 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 293 pii->pii_in_use = 0; 294 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 295 li->li_in_use = 0; 296 if (pii->pii_probe_logint == li) 297 li->li_dupaddr = 0; 298 } 299 } 300 301 /* 302 * As above, mark groups so that we can detect IPMP interfaces which 303 * have been removed from the kernel. Also, delete the group address 304 * list since we'll iteratively recreate it below. 305 */ 306 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 307 pg->pg_in_use = _B_FALSE; 308 addrlist_free(&pg->pg_addrs); 309 } 310 311 lifn.lifn_family = AF_UNSPEC; 312 lifn.lifn_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 313 again: 314 if (ioctl(ifsock_v4, SIOCGLIFNUM, (char *)&lifn) < 0) { 315 logperror("initifs: ioctl (get interface count)"); 316 return; 317 } 318 /* 319 * Pad the interface count to detect when additional interfaces have 320 * been configured between SIOCGLIFNUM and SIOCGLIFCONF. 321 */ 322 lifn.lifn_count += 4; 323 324 if ((buf = calloc(lifn.lifn_count, sizeof (struct lifreq))) == NULL) { 325 logperror("initifs: calloc"); 326 return; 327 } 328 329 lifc.lifc_family = AF_UNSPEC; 330 lifc.lifc_flags = LIFC_ALLZONES | LIFC_UNDER_IPMP; 331 lifc.lifc_len = lifn.lifn_count * sizeof (struct lifreq); 332 lifc.lifc_buf = buf; 333 334 if (ioctl(ifsock_v4, SIOCGLIFCONF, (char *)&lifc) < 0) { 335 logperror("initifs: ioctl (get interface configuration)"); 336 free(buf); 337 return; 338 } 339 340 /* 341 * If every lifr_req slot is taken, then additional interfaces must 342 * have been plumbed between the SIOCGLIFNUM and the SIOCGLIFCONF. 343 * Recalculate to make sure we didn't miss any interfaces. 344 */ 345 nlifr = lifc.lifc_len / sizeof (struct lifreq); 346 if (nlifr >= lifn.lifn_count) { 347 free(buf); 348 goto again; 349 } 350 351 /* 352 * Walk through the lifreqs returned by SIOGGLIFCONF, and refresh the 353 * global list of addresses, phyint groups, phyints, and logints. 354 */ 355 for (lifr = lifc.lifc_req, i = 0; i < nlifr; i++, lifr++) { 356 af = lifr->lifr_addr.ss_family; 357 sockfd = (af == AF_INET) ? ifsock_v4 : ifsock_v6; 358 (void) strlcpy(lifreq.lifr_name, lifr->lifr_name, LIFNAMSIZ); 359 360 if (ioctl(sockfd, SIOCGLIFFLAGS, &lifreq) == -1) { 361 if (errno != ENXIO) 362 logperror("initifs: ioctl (SIOCGLIFFLAGS)"); 363 continue; 364 } 365 flags = lifreq.lifr_flags; 366 367 /* 368 * If the address is IFF_UP, add it to the local address list. 369 * (We ignore addresses that aren't IFF_UP since another node 370 * might legitimately have that address IFF_UP.) 371 */ 372 if (flags & IFF_UP) { 373 (void) addrlist_add(&localaddrs, lifr->lifr_name, flags, 374 &lifr->lifr_addr); 375 } 376 377 /* 378 * If this address is on an IPMP meta-interface, update our 379 * phyint_group information (either by recording that group 380 * still exists or creating a new group), and track what 381 * group the address is part of. 382 */ 383 if (flags & IFF_IPMP) { 384 if (ioctl(sockfd, SIOCGLIFGROUPNAME, &lifreq) == -1) { 385 if (errno != ENXIO) 386 logperror("initifs: ioctl " 387 "(SIOCGLIFGROUPNAME)"); 388 continue; 389 } 390 391 pg = phyint_group_lookup(lifreq.lifr_groupname); 392 if (pg == NULL) { 393 pg = phyint_group_create(lifreq.lifr_groupname); 394 if (pg == NULL) { 395 logerr("initifs: cannot create group " 396 "%s\n", lifreq.lifr_groupname); 397 continue; 398 } 399 phyint_group_insert(pg); 400 } 401 pg->pg_in_use = _B_TRUE; 402 403 /* 404 * Add this to the group's list of data addresses. 405 */ 406 if (!addrlist_add(&pg->pg_addrs, lifr->lifr_name, flags, 407 &lifr->lifr_addr)) { 408 logerr("initifs: insufficient memory to track " 409 "data address information for %s\n", 410 lifr->lifr_name); 411 } 412 continue; 413 } 414 415 /* 416 * This isn't an address on an IPMP meta-interface, so it's 417 * either on an underlying interface or not related to any 418 * group. Update our phyint and logint information (via 419 * pii_process() and logint_init_from_k()) -- but first, 420 * convert the logint name to a phyint name so we can call 421 * pii_process(). 422 */ 423 (void) strlcpy(pi_name, lifr->lifr_name, sizeof (pi_name)); 424 if ((cp = strchr(pi_name, IF_SEPARATOR)) != NULL) 425 *cp = '\0'; 426 427 if (pii_process(af, pi_name, &pii)) { 428 /* The phyint is fine. So process the logint */ 429 logint_init_from_k(pii, lifr->lifr_name); 430 check_addr_unique(pii, &lifr->lifr_addr); 431 } 432 } 433 free(buf); 434 435 /* 436 * Scan for groups, phyints and logints that have disappeared from the 437 * kernel, and delete them. 438 */ 439 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 440 next_pii = pii->pii_next; 441 check_if_removed(pii); 442 } 443 444 for (pg = phyint_groups; pg != NULL; pg = next_pg) { 445 next_pg = pg->pg_next; 446 if (!pg->pg_in_use) { 447 phyint_group_delete(pg); 448 continue; 449 } 450 /* 451 * Refresh the group's state. This is necessary since the 452 * group's state is defined by the set of usable interfaces in 453 * the group, and an interface is considered unusable if all 454 * of its addresses are down. When an address goes down/up, 455 * the RTM_DELADDR/RTM_NEWADDR brings us through here. 456 */ 457 phyint_group_refresh_state(pg); 458 } 459 460 /* 461 * Select a test address for sending probes on each phyint instance 462 */ 463 select_test_ifs(); 464 465 /* 466 * Handle link up/down notifications. 467 */ 468 process_link_state_changes(); 469 } 470 471 /* 472 * Check that a given test address is unique across all of the interfaces in a 473 * group. (e.g., IPv6 link-locals may not be inherently unique, and binding 474 * to such an (IFF_NOFAILOVER) address can produce unexpected results.) 475 * Any issues will be reported by check_testconfig(). 476 */ 477 static void 478 check_addr_unique(struct phyint_instance *ourpii, struct sockaddr_storage *ss) 479 { 480 struct phyint *pi; 481 struct phyint_group *pg; 482 struct in6_addr addr; 483 struct phyint_instance *pii; 484 struct sockaddr_in *sin; 485 486 if (ss->ss_family == AF_INET) { 487 sin = (struct sockaddr_in *)ss; 488 IN6_INADDR_TO_V4MAPPED(&sin->sin_addr, &addr); 489 } else { 490 assert(ss->ss_family == AF_INET6); 491 addr = ((struct sockaddr_in6 *)ss)->sin6_addr; 492 } 493 494 /* 495 * For anonymous groups, every interface is assumed to be on its own 496 * link, so there is no chance of overlapping addresses. 497 */ 498 pg = ourpii->pii_phyint->pi_group; 499 if (pg == phyint_anongroup) 500 return; 501 502 /* 503 * Walk the list of phyint instances in the group and check for test 504 * addresses matching ours. Of course, we skip ourself. 505 */ 506 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 507 pii = PHYINT_INSTANCE(pi, ss->ss_family); 508 if (pii == NULL || pii == ourpii || 509 pii->pii_probe_logint == NULL) 510 continue; 511 512 /* 513 * If this test address is not unique, set the dupaddr bit. 514 */ 515 if (IN6_ARE_ADDR_EQUAL(&addr, &pii->pii_probe_logint->li_addr)) 516 pii->pii_probe_logint->li_dupaddr = 1; 517 } 518 } 519 520 /* 521 * Stop probing an interface. Called when an interface is offlined. 522 * The probe socket is closed on each interface instance, and the 523 * interface state set to PI_OFFLINE. 524 */ 525 void 526 stop_probing(struct phyint *pi) 527 { 528 struct phyint_instance *pii; 529 530 pii = pi->pi_v4; 531 if (pii != NULL) { 532 if (pii->pii_probe_sock != -1) 533 close_probe_socket(pii, _B_TRUE); 534 pii->pii_probe_logint = NULL; 535 } 536 537 pii = pi->pi_v6; 538 if (pii != NULL) { 539 if (pii->pii_probe_sock != -1) 540 close_probe_socket(pii, _B_TRUE); 541 pii->pii_probe_logint = NULL; 542 } 543 544 phyint_chstate(pi, PI_OFFLINE); 545 } 546 547 enum { BAD_TESTFLAGS, OK_TESTFLAGS, BEST_TESTFLAGS }; 548 549 /* 550 * Rate the provided test flags. By definition, IFF_NOFAILOVER must be set. 551 * IFF_UP must also be set so that the associated address can be used as a 552 * source address. Further, we must be able to exchange packets with local 553 * destinations, so IFF_NOXMIT and IFF_NOLOCAL must be clear. For historical 554 * reasons, we have a proclivity for IFF_DEPRECATED IPv4 test addresses. 555 */ 556 static int 557 rate_testflags(uint64_t flags) 558 { 559 if ((flags & (IFF_NOFAILOVER | IFF_UP)) != (IFF_NOFAILOVER | IFF_UP)) 560 return (BAD_TESTFLAGS); 561 562 if ((flags & (IFF_NOXMIT | IFF_NOLOCAL)) != 0) 563 return (BAD_TESTFLAGS); 564 565 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_DEPRECATED) 566 return (BEST_TESTFLAGS); 567 568 if ((flags & (IFF_IPV6 | IFF_DEPRECATED)) == IFF_IPV6) 569 return (BEST_TESTFLAGS); 570 571 return (OK_TESTFLAGS); 572 } 573 574 /* 575 * Attempt to select a test address for each phyint instance. 576 * Call phyint_inst_sockinit() to complete the initializations. 577 */ 578 static void 579 select_test_ifs(void) 580 { 581 struct phyint *pi; 582 struct phyint_instance *pii; 583 struct phyint_instance *next_pii; 584 struct logint *li; 585 struct logint *probe_logint; 586 boolean_t target_scan_reqd = _B_FALSE; 587 int rating; 588 589 if (debug & D_PHYINT) 590 logdebug("select_test_ifs\n"); 591 592 /* 593 * For each phyint instance, do the test address selection 594 */ 595 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 596 next_pii = pii->pii_next; 597 probe_logint = NULL; 598 599 /* 600 * An interface that is offline should not be probed. 601 * IFF_OFFLINE interfaces should always be PI_OFFLINE 602 * unless some other entity has set the offline flag. 603 */ 604 if (pii->pii_phyint->pi_flags & IFF_OFFLINE) { 605 if (pii->pii_phyint->pi_state != PI_OFFLINE) { 606 logerr("shouldn't be probing offline" 607 " interface %s (state is: %u)." 608 " Stopping probes.\n", 609 pii->pii_phyint->pi_name, 610 pii->pii_phyint->pi_state); 611 stop_probing(pii->pii_phyint); 612 } 613 continue; 614 } else { 615 /* 616 * If something cleared IFF_OFFLINE (e.g., by accident 617 * because the SIOCGLIFFLAGS/SIOCSLIFFLAGS sequence is 618 * inherently racy), the phyint may still be offline. 619 * Just ignore it. 620 */ 621 if (pii->pii_phyint->pi_state == PI_OFFLINE) 622 continue; 623 } 624 625 li = pii->pii_probe_logint; 626 if (li != NULL) { 627 /* 628 * We've already got a test address; only proceed 629 * if it's suboptimal. 630 */ 631 if (rate_testflags(li->li_flags) == BEST_TESTFLAGS) 632 continue; 633 } 634 635 /* 636 * Walk the logints of this phyint instance, and select 637 * the best available test address 638 */ 639 for (li = pii->pii_logint; li != NULL; li = li->li_next) { 640 /* 641 * Skip 0.0.0.0 addresses, as those are never 642 * actually usable. 643 */ 644 if (pii->pii_af == AF_INET && 645 IN6_IS_ADDR_V4MAPPED_ANY(&li->li_addr)) 646 continue; 647 648 /* 649 * Skip any IPv6 logints that are not link-local, 650 * since we should always have a link-local address 651 * anyway and in6_data() expects link-local replies. 652 */ 653 if (pii->pii_af == AF_INET6 && 654 !IN6_IS_ADDR_LINKLOCAL(&li->li_addr)) 655 continue; 656 657 /* 658 * Rate the testflags. If we've found an optimal 659 * match, then break out; otherwise, record the most 660 * recent OK one. 661 */ 662 rating = rate_testflags(li->li_flags); 663 if (rating == BAD_TESTFLAGS) 664 continue; 665 666 probe_logint = li; 667 if (rating == BEST_TESTFLAGS) 668 break; 669 } 670 671 /* 672 * If the probe logint has changed, ditch the old one. 673 */ 674 if (pii->pii_probe_logint != NULL && 675 pii->pii_probe_logint != probe_logint) { 676 if (pii->pii_probe_sock != -1) 677 close_probe_socket(pii, _B_TRUE); 678 pii->pii_probe_logint = NULL; 679 } 680 681 if (probe_logint == NULL) { 682 /* 683 * We don't have a test address; zero out the probe 684 * stats array since it is no longer relevant. 685 * Optimize by checking if it is already zeroed out. 686 */ 687 int pr_ndx; 688 689 pr_ndx = PROBE_INDEX_PREV(pii->pii_probe_next); 690 if (pii->pii_probes[pr_ndx].pr_status != PR_UNUSED) { 691 clear_pii_probe_stats(pii); 692 reset_crtt_all(pii->pii_phyint); 693 } 694 continue; 695 } else if (probe_logint == pii->pii_probe_logint) { 696 /* 697 * If we didn't find any new test addr, go to the 698 * next phyint. 699 */ 700 continue; 701 } 702 703 /* 704 * The phyint is either being assigned a new testaddr 705 * or is being assigned a testaddr for the 1st time. 706 * Need to initialize the phyint socket 707 */ 708 pii->pii_probe_logint = probe_logint; 709 if (!phyint_inst_sockinit(pii)) { 710 if (debug & D_PHYINT) { 711 logdebug("select_test_ifs: " 712 "phyint_sockinit failed\n"); 713 } 714 phyint_inst_delete(pii); 715 continue; 716 } 717 718 /* 719 * This phyint instance is now enabled for probes; this 720 * impacts our state machine in two ways: 721 * 722 * 1. If we're probe *capable* as well (i.e., we have 723 * probe targets) and the interface is in PI_NOTARGETS, 724 * then transition to PI_RUNNING. 725 * 726 * 2. If we're not probe capable, and the other phyint 727 * instance is also not probe capable, and we were in 728 * PI_RUNNING, then transition to PI_NOTARGETS. 729 * 730 * Also see the state diagram in mpd_probe.c. 731 */ 732 if (PROBE_CAPABLE(pii)) { 733 if (pii->pii_phyint->pi_state == PI_NOTARGETS) 734 phyint_chstate(pii->pii_phyint, PI_RUNNING); 735 } else if (!PROBE_CAPABLE(phyint_inst_other(pii))) { 736 if (pii->pii_phyint->pi_state == PI_RUNNING) 737 phyint_chstate(pii->pii_phyint, PI_NOTARGETS); 738 } 739 740 /* 741 * If no targets are currently known for this phyint 742 * we need to call init_router_targets. Since 743 * init_router_targets() initializes the list of targets 744 * for all phyints it is done below the loop. 745 */ 746 if (pii->pii_targets == NULL) 747 target_scan_reqd = _B_TRUE; 748 749 /* 750 * Start the probe timer for this instance. 751 */ 752 if (!pii->pii_basetime_inited && PROBE_ENABLED(pii)) { 753 start_timer(pii); 754 pii->pii_basetime_inited = 1; 755 } 756 } 757 758 /* 759 * Scan the interface list for any interfaces that are PI_FAILED or 760 * PI_NOTARGETS but no longer enabled to send probes, and call 761 * phyint_check_for_repair() to see if the link state indicates that 762 * the interface should be repaired. Also see the state diagram in 763 * mpd_probe.c. 764 */ 765 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 766 if ((!PROBE_ENABLED(pi->pi_v4) && !PROBE_ENABLED(pi->pi_v6)) && 767 (pi->pi_state == PI_FAILED || 768 pi->pi_state == PI_NOTARGETS)) { 769 phyint_check_for_repair(pi); 770 } 771 } 772 773 check_testconfig(); 774 775 /* 776 * Try to populate the target list. init_router_targets populates 777 * the target list from the routing table. If our target list is 778 * still empty, init_host_targets adds host targets based on the 779 * host target list of other phyints in the group. 780 */ 781 if (target_scan_reqd) { 782 init_router_targets(); 783 init_host_targets(); 784 } 785 } 786 787 /* 788 * Check test address configuration, and log notices/errors if appropriate. 789 * Note that this function only logs pre-existing conditions (e.g., that 790 * probe-based failure detection is disabled). 791 */ 792 static void 793 check_testconfig(void) 794 { 795 struct phyint *pi; 796 struct logint *li; 797 char abuf[INET6_ADDRSTRLEN]; 798 int pri; 799 800 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 801 if (pi->pi_flags & IFF_OFFLINE) 802 continue; 803 804 if (PROBE_ENABLED(pi->pi_v4) || PROBE_ENABLED(pi->pi_v6)) { 805 if (pi->pi_taddrmsg_printed || 806 pi->pi_duptaddrmsg_printed) { 807 if (pi->pi_duptaddrmsg_printed) 808 pri = LOG_ERR; 809 else 810 pri = LOG_INFO; 811 logmsg(pri, "Test address now configured on " 812 "interface %s; enabling probe-based " 813 "failure detection on it\n", pi->pi_name); 814 pi->pi_taddrmsg_printed = 0; 815 pi->pi_duptaddrmsg_printed = 0; 816 } 817 continue; 818 } 819 820 li = NULL; 821 if (pi->pi_v4 != NULL && pi->pi_v4->pii_probe_logint != NULL && 822 pi->pi_v4->pii_probe_logint->li_dupaddr) 823 li = pi->pi_v4->pii_probe_logint; 824 825 if (pi->pi_v6 != NULL && pi->pi_v6->pii_probe_logint != NULL && 826 pi->pi_v6->pii_probe_logint->li_dupaddr) 827 li = pi->pi_v6->pii_probe_logint; 828 829 if (li != NULL && li->li_dupaddr) { 830 if (pi->pi_duptaddrmsg_printed) 831 continue; 832 logerr("Test address %s is not unique in group; " 833 "disabling probe-based failure detection on %s\n", 834 pr_addr(li->li_phyint_inst->pii_af, 835 li->li_addr, abuf, sizeof (abuf)), pi->pi_name); 836 pi->pi_duptaddrmsg_printed = 1; 837 continue; 838 } 839 840 if (getcurrentsec() < pi->pi_taddrthresh) 841 continue; 842 843 if (!pi->pi_taddrmsg_printed) { 844 logtrace("No test address configured on interface %s; " 845 "disabling probe-based failure detection on it\n", 846 pi->pi_name); 847 pi->pi_taddrmsg_printed = 1; 848 } 849 } 850 } 851 852 /* 853 * Check phyint group configuration, to detect any inconsistencies, 854 * and log an error message. This is called from runtimeouts every 855 * 20 secs. But the error message is displayed once. If the 856 * consistency is resolved by the admin, a recovery message is displayed 857 * once. 858 */ 859 static void 860 check_config(void) 861 { 862 struct phyint_group *pg; 863 struct phyint *pi; 864 boolean_t v4_in_group; 865 boolean_t v6_in_group; 866 867 /* 868 * All phyints of a group must be homogeneous to ensure that they can 869 * take over for one another. If any phyint in a group has IPv4 870 * plumbed, check that all phyints have IPv4 plumbed. Do a similar 871 * check for IPv6. 872 */ 873 for (pg = phyint_groups; pg != NULL; pg = pg->pg_next) { 874 if (pg == phyint_anongroup) 875 continue; 876 877 v4_in_group = _B_FALSE; 878 v6_in_group = _B_FALSE; 879 /* 880 * 1st pass. Determine if at least 1 phyint in the group 881 * has IPv4 plumbed and if so set v4_in_group to true. 882 * Repeat similarly for IPv6. 883 */ 884 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 885 if (pi->pi_v4 != NULL) 886 v4_in_group = _B_TRUE; 887 if (pi->pi_v6 != NULL) 888 v6_in_group = _B_TRUE; 889 } 890 891 /* 892 * 2nd pass. If v4_in_group is true, check that phyint 893 * has IPv4 plumbed. Repeat similarly for IPv6. Print 894 * out a message the 1st time only. 895 */ 896 for (pi = pg->pg_phyint; pi != NULL; pi = pi->pi_pgnext) { 897 if (pi->pi_flags & IFF_OFFLINE) 898 continue; 899 900 if (v4_in_group == _B_TRUE && pi->pi_v4 == NULL) { 901 if (!pi->pi_cfgmsg_printed) { 902 logerr("IP interface %s in group %s is" 903 " not plumbed for IPv4, affecting" 904 " IPv4 connectivity\n", 905 pi->pi_name, 906 pi->pi_group->pg_name); 907 pi->pi_cfgmsg_printed = 1; 908 } 909 } else if (v6_in_group == _B_TRUE && 910 pi->pi_v6 == NULL) { 911 if (!pi->pi_cfgmsg_printed) { 912 logerr("IP interface %s in group %s is" 913 " not plumbed for IPv6, affecting" 914 " IPv6 connectivity\n", 915 pi->pi_name, 916 pi->pi_group->pg_name); 917 pi->pi_cfgmsg_printed = 1; 918 } 919 } else { 920 /* 921 * The phyint matches the group configuration, 922 * if we have reached this point. If it was 923 * improperly configured earlier, log an 924 * error recovery message 925 */ 926 if (pi->pi_cfgmsg_printed) { 927 logerr("IP interface %s is now" 928 " consistent with group %s " 929 " and connectivity is restored\n", 930 pi->pi_name, pi->pi_group->pg_name); 931 pi->pi_cfgmsg_printed = 0; 932 } 933 } 934 935 } 936 } 937 } 938 939 /* 940 * Timer mechanism using relative time (in milliseconds) from the 941 * previous timer event. Timers exceeding TIMER_INFINITY milliseconds 942 * will fire after TIMER_INFINITY milliseconds. 943 * Unsigned arithmetic note: We assume a 32-bit circular sequence space for 944 * time values. Hence 2 consecutive timer events cannot be spaced farther 945 * than 0x7fffffff. We call this TIMER_INFINITY, and it is the maximum value 946 * that can be passed for the delay parameter of timer_schedule() 947 */ 948 static uint_t timer_next; /* Currently scheduled timeout */ 949 static boolean_t timer_active = _B_FALSE; /* SIGALRM has not yet occurred */ 950 951 static void 952 timer_init(void) 953 { 954 timer_next = getcurrenttime() + TIMER_INFINITY; 955 /* 956 * The call to run_timeouts() will get the timer started 957 * Since there are no phyints at this point, the timer will 958 * be set for IF_SCAN_INTERVAL ms. 959 */ 960 run_timeouts(); 961 } 962 963 /* 964 * Make sure the next SIGALRM occurs delay milliseconds from the current 965 * time if not earlier. We are interested only in time differences. 966 */ 967 void 968 timer_schedule(uint_t delay) 969 { 970 uint_t now; 971 struct itimerval itimerval; 972 973 if (debug & D_TIMER) 974 logdebug("timer_schedule(%u)\n", delay); 975 976 assert(delay <= TIMER_INFINITY); 977 978 now = getcurrenttime(); 979 if (delay == 0) { 980 /* Minimum allowed delay */ 981 delay = 1; 982 } 983 /* Will this timer occur before the currently scheduled SIGALRM? */ 984 if (timer_active && TIME_GE(now + delay, timer_next)) { 985 if (debug & D_TIMER) { 986 logdebug("timer_schedule(%u) - no action: " 987 "now %u next %u\n", delay, now, timer_next); 988 } 989 return; 990 } 991 timer_next = now + delay; 992 993 itimerval.it_value.tv_sec = delay / 1000; 994 itimerval.it_value.tv_usec = (delay % 1000) * 1000; 995 itimerval.it_interval.tv_sec = 0; 996 itimerval.it_interval.tv_usec = 0; 997 if (debug & D_TIMER) { 998 logdebug("timer_schedule(%u): sec %ld usec %ld\n", 999 delay, itimerval.it_value.tv_sec, 1000 itimerval.it_value.tv_usec); 1001 } 1002 timer_active = _B_TRUE; 1003 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) { 1004 logperror("timer_schedule: setitimer"); 1005 exit(2); 1006 } 1007 } 1008 1009 static void 1010 timer_cancel(void) 1011 { 1012 struct itimerval itimerval; 1013 1014 if (debug & D_TIMER) 1015 logdebug("timer_cancel()\n"); 1016 1017 bzero(&itimerval, sizeof (itimerval)); 1018 if (setitimer(ITIMER_REAL, &itimerval, NULL) < 0) 1019 logperror("timer_cancel: setitimer"); 1020 } 1021 1022 /* 1023 * Timer has fired. Determine when the next timer event will occur by asking 1024 * all the timer routines. Should not be called from a timer routine. 1025 */ 1026 static void 1027 run_timeouts(void) 1028 { 1029 uint_t next; 1030 uint_t next_event_time; 1031 struct phyint_instance *pii; 1032 struct phyint_instance *next_pii; 1033 static boolean_t timeout_running; 1034 1035 /* assert that recursive timeouts don't happen. */ 1036 assert(!timeout_running); 1037 1038 timeout_running = _B_TRUE; 1039 1040 if (debug & D_TIMER) 1041 logdebug("run_timeouts()\n"); 1042 1043 if ((getcurrenttime() - last_initifs_time) > IF_SCAN_INTERVAL) { 1044 initifs(); 1045 check_config(); 1046 } 1047 1048 next = TIMER_INFINITY; 1049 1050 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1051 next_pii = pii->pii_next; 1052 next_event_time = phyint_inst_timer(pii); 1053 if (next_event_time != TIMER_INFINITY && next_event_time < next) 1054 next = next_event_time; 1055 1056 if (debug & D_TIMER) { 1057 logdebug("run_timeouts(%s %s): next scheduled for" 1058 " this phyint inst %u, next scheduled global" 1059 " %u ms\n", 1060 AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 1061 next_event_time, next); 1062 } 1063 } 1064 1065 /* 1066 * Make sure initifs() is called at least once every 1067 * IF_SCAN_INTERVAL, to make sure that we are in sync 1068 * with the kernel, in case we have missed any routing 1069 * socket messages. 1070 */ 1071 if (next > IF_SCAN_INTERVAL) 1072 next = IF_SCAN_INTERVAL; 1073 1074 if (debug & D_TIMER) 1075 logdebug("run_timeouts: %u ms\n", next); 1076 1077 timer_schedule(next); 1078 timeout_running = _B_FALSE; 1079 } 1080 1081 static int eventpipe_read = -1; /* Used for synchronous signal delivery */ 1082 static int eventpipe_write = -1; 1083 boolean_t cleanup_started = _B_FALSE; /* true if we're going away */ 1084 1085 /* 1086 * Ensure that signals are processed synchronously with the rest of 1087 * the code by just writing a one character signal number on the pipe. 1088 * The poll loop will pick this up and process the signal event. 1089 */ 1090 static void 1091 sig_handler(int signo) 1092 { 1093 uchar_t buf = (uchar_t)signo; 1094 1095 /* 1096 * Don't write to pipe if cleanup has already begun. cleanup() 1097 * might have closed the pipe already 1098 */ 1099 if (cleanup_started) 1100 return; 1101 1102 if (eventpipe_write == -1) { 1103 logerr("sig_handler: no pipe found\n"); 1104 return; 1105 } 1106 if (write(eventpipe_write, &buf, sizeof (buf)) < 0) 1107 logperror("sig_handler: write"); 1108 } 1109 1110 extern struct probes_missed probes_missed; 1111 1112 /* 1113 * Pick up a signal "byte" from the pipe and process it. 1114 */ 1115 static void 1116 in_signal(int fd) 1117 { 1118 uchar_t buf; 1119 uint64_t sent, acked, lost, unacked, unknown; 1120 struct phyint_instance *pii; 1121 int pr_ndx; 1122 1123 switch (read(fd, &buf, sizeof (buf))) { 1124 case -1: 1125 logperror("in_signal: read"); 1126 exit(1); 1127 /* NOTREACHED */ 1128 case 1: 1129 break; 1130 case 0: 1131 logerr("in_signal: read end of file\n"); 1132 exit(1); 1133 /* NOTREACHED */ 1134 default: 1135 logerr("in_signal: read > 1\n"); 1136 exit(1); 1137 } 1138 1139 if (debug & D_TIMER) 1140 logdebug("in_signal() got %d\n", buf); 1141 1142 switch (buf) { 1143 case SIGALRM: 1144 if (debug & D_TIMER) { 1145 uint_t now = getcurrenttime(); 1146 1147 logdebug("in_signal(SIGALRM) delta %u\n", 1148 now - timer_next); 1149 } 1150 timer_active = _B_FALSE; 1151 run_timeouts(); 1152 break; 1153 case SIGUSR1: 1154 logdebug("Printing configuration:\n"); 1155 /* Print out the internal tables */ 1156 phyint_inst_print_all(); 1157 1158 /* 1159 * Print out the accumulated statistics about missed 1160 * probes (happens due to scheduling delay). 1161 */ 1162 logerr("Missed sending total of %d probes spread over" 1163 " %d occurrences\n", probes_missed.pm_nprobes, 1164 probes_missed.pm_ntimes); 1165 1166 /* 1167 * Print out the accumulated statistics about probes 1168 * that were sent. 1169 */ 1170 for (pii = phyint_instances; pii != NULL; 1171 pii = pii->pii_next) { 1172 unacked = 0; 1173 acked = pii->pii_cum_stats.acked; 1174 lost = pii->pii_cum_stats.lost; 1175 sent = pii->pii_cum_stats.sent; 1176 unknown = pii->pii_cum_stats.unknown; 1177 for (pr_ndx = 0; pr_ndx < PROBE_STATS_COUNT; pr_ndx++) { 1178 switch (pii->pii_probes[pr_ndx].pr_status) { 1179 case PR_ACKED: 1180 acked++; 1181 break; 1182 case PR_LOST: 1183 lost++; 1184 break; 1185 case PR_UNACKED: 1186 unacked++; 1187 break; 1188 } 1189 } 1190 logerr("\nProbe stats on (%s %s)\n" 1191 "Number of probes sent %lld\n" 1192 "Number of probe acks received %lld\n" 1193 "Number of probes/acks lost %lld\n" 1194 "Number of valid unacknowledged probes %lld\n" 1195 "Number of ambiguous probe acks received %lld\n", 1196 AF_STR(pii->pii_af), pii->pii_name, 1197 sent, acked, lost, unacked, unknown); 1198 } 1199 break; 1200 case SIGHUP: 1201 logerr("SIGHUP: restart and reread config file\n"); 1202 /* 1203 * Cancel the interval timer. Needed since setitimer() uses 1204 * alarm() and the time left is inherited across exec(), and 1205 * thus the SIGALRM may be delivered before a handler has been 1206 * setup, causing in.mpathd to erroneously exit. 1207 */ 1208 timer_cancel(); 1209 cleanup(); 1210 (void) execv(argv0[0], argv0); 1211 _exit(0177); 1212 /* NOTREACHED */ 1213 case SIGINT: 1214 case SIGTERM: 1215 case SIGQUIT: 1216 cleanup(); 1217 exit(0); 1218 /* NOTREACHED */ 1219 default: 1220 logerr("in_signal: unknown signal: %d\n", buf); 1221 } 1222 } 1223 1224 static void 1225 cleanup(void) 1226 { 1227 struct phyint_instance *pii; 1228 struct phyint_instance *next_pii; 1229 1230 /* 1231 * Make sure that we don't write to eventpipe in 1232 * sig_handler() if any signal notably SIGALRM, 1233 * occurs after we close the eventpipe descriptor below 1234 */ 1235 cleanup_started = _B_TRUE; 1236 1237 for (pii = phyint_instances; pii != NULL; pii = next_pii) { 1238 next_pii = pii->pii_next; 1239 phyint_inst_delete(pii); 1240 } 1241 1242 (void) close(ifsock_v4); 1243 (void) close(ifsock_v6); 1244 (void) close(rtsock_v4); 1245 (void) close(rtsock_v6); 1246 (void) close(lsock_v4); 1247 (void) close(lsock_v6); 1248 (void) close(0); 1249 (void) close(1); 1250 (void) close(2); 1251 (void) close(mibfd); 1252 (void) close(eventpipe_read); 1253 (void) close(eventpipe_write); 1254 } 1255 1256 /* 1257 * Create pipe for signal delivery and set up signal handlers. 1258 */ 1259 static void 1260 setup_eventpipe(void) 1261 { 1262 int fds[2]; 1263 struct sigaction act; 1264 1265 if ((pipe(fds)) < 0) { 1266 logperror("setup_eventpipe: pipe"); 1267 exit(1); 1268 } 1269 eventpipe_read = fds[0]; 1270 eventpipe_write = fds[1]; 1271 if (poll_add(eventpipe_read) == -1) { 1272 exit(1); 1273 } 1274 1275 act.sa_handler = sig_handler; 1276 act.sa_flags = SA_RESTART; 1277 (void) sigaction(SIGALRM, &act, NULL); 1278 1279 (void) sigset(SIGHUP, sig_handler); 1280 (void) sigset(SIGUSR1, sig_handler); 1281 (void) sigset(SIGTERM, sig_handler); 1282 (void) sigset(SIGINT, sig_handler); 1283 (void) sigset(SIGQUIT, sig_handler); 1284 } 1285 1286 /* 1287 * Create a routing socket for receiving RTM_IFINFO messages. 1288 */ 1289 static int 1290 setup_rtsock(int af) 1291 { 1292 int s; 1293 int flags; 1294 int aware = RTAW_UNDER_IPMP; 1295 1296 s = socket(PF_ROUTE, SOCK_RAW, af); 1297 if (s == -1) { 1298 logperror("setup_rtsock: socket PF_ROUTE"); 1299 exit(1); 1300 } 1301 1302 if (setsockopt(s, SOL_ROUTE, RT_AWARE, &aware, sizeof (aware)) == -1) { 1303 logperror("setup_rtsock: setsockopt RT_AWARE"); 1304 (void) close(s); 1305 exit(1); 1306 } 1307 1308 if ((flags = fcntl(s, F_GETFL, 0)) < 0) { 1309 logperror("setup_rtsock: fcntl F_GETFL"); 1310 (void) close(s); 1311 exit(1); 1312 } 1313 if ((fcntl(s, F_SETFL, flags | O_NONBLOCK)) < 0) { 1314 logperror("setup_rtsock: fcntl F_SETFL"); 1315 (void) close(s); 1316 exit(1); 1317 } 1318 if (poll_add(s) == -1) { 1319 (void) close(s); 1320 exit(1); 1321 } 1322 return (s); 1323 } 1324 1325 /* 1326 * Process an RTM_IFINFO message received on a routing socket. 1327 * The return value indicates whether a full interface scan is required. 1328 * Link up/down notifications are reflected in the IFF_RUNNING flag. 1329 * If just the state of the IFF_RUNNING interface flag has changed, a 1330 * a full interface scan isn't required. 1331 */ 1332 static boolean_t 1333 process_rtm_ifinfo(if_msghdr_t *ifm, int type) 1334 { 1335 struct sockaddr_dl *sdl; 1336 struct phyint *pi; 1337 uint64_t old_flags; 1338 struct phyint_instance *pii; 1339 1340 assert(ifm->ifm_type == RTM_IFINFO && ifm->ifm_addrs == RTA_IFP); 1341 1342 /* 1343 * Although the sockaddr_dl structure is directly after the 1344 * if_msghdr_t structure. At the time of writing, the size of the 1345 * if_msghdr_t structure is different on 32 and 64 bit kernels, due 1346 * to the presence of a timeval structure, which contains longs, 1347 * in the if_data structure. Anyway, we know where the message ends, 1348 * so we work backwards to get the start of the sockaddr_dl structure. 1349 */ 1350 /*LINTED*/ 1351 sdl = (struct sockaddr_dl *)((char *)ifm + ifm->ifm_msglen - 1352 sizeof (struct sockaddr_dl)); 1353 1354 assert(sdl->sdl_family == AF_LINK); 1355 1356 /* 1357 * The interface name is in sdl_data. 1358 * RTM_IFINFO messages are only generated for logical interface 1359 * zero, so there is no colon and logical interface number to 1360 * strip from the name. The name is not null terminated, but 1361 * there should be enough space in sdl_data to add the null. 1362 */ 1363 if (sdl->sdl_nlen >= sizeof (sdl->sdl_data)) { 1364 if (debug & D_LINKNOTE) 1365 logdebug("process_rtm_ifinfo: phyint name too long\n"); 1366 return (_B_TRUE); 1367 } 1368 sdl->sdl_data[sdl->sdl_nlen] = 0; 1369 1370 pi = phyint_lookup(sdl->sdl_data); 1371 if (pi == NULL) { 1372 if (debug & D_LINKNOTE) 1373 logdebug("process_rtm_ifinfo: phyint lookup failed" 1374 " for %s\n", sdl->sdl_data); 1375 return (_B_TRUE); 1376 } 1377 1378 /* 1379 * We want to try and avoid doing a full interface scan for 1380 * link state notifications from the datalink layer, as indicated 1381 * by the state of the IFF_RUNNING flag. If just the 1382 * IFF_RUNNING flag has changed state, the link state changes 1383 * are processed without a full scan. 1384 * If there is both an IPv4 and IPv6 instance associated with 1385 * the physical interface, we will get an RTM_IFINFO message 1386 * for each instance. If we just maintained a single copy of 1387 * the physical interface flags, it would appear that no flags 1388 * had changed when the second message is processed, leading us 1389 * to believe that the message wasn't generated by a flags change, 1390 * and that a full interface scan is required. 1391 * To get around this problem, two additional copies of the flags 1392 * are kept, one copy for each instance. These are only used in 1393 * this routine. At any one time, all three copies of the flags 1394 * should be identical except for the IFF_RUNNING flag. The 1395 * copy of the flags in the "phyint" structure is always up to 1396 * date. 1397 */ 1398 pii = (type == AF_INET) ? pi->pi_v4 : pi->pi_v6; 1399 if (pii == NULL) { 1400 if (debug & D_LINKNOTE) 1401 logdebug("process_rtm_ifinfo: no instance of address " 1402 "family %s for %s\n", AF_STR(type), pi->pi_name); 1403 return (_B_TRUE); 1404 } 1405 1406 old_flags = pii->pii_flags; 1407 pii->pii_flags = PHYINT_FLAGS(ifm->ifm_flags); 1408 pi->pi_flags = pii->pii_flags; 1409 1410 if (debug & D_LINKNOTE) { 1411 logdebug("process_rtm_ifinfo: %s address family: %s, " 1412 "old flags: %llx, new flags: %llx\n", pi->pi_name, 1413 AF_STR(type), old_flags, pi->pi_flags); 1414 } 1415 1416 /* 1417 * If IFF_STANDBY has changed, indicate that the interface has changed 1418 * types. 1419 */ 1420 if ((old_flags ^ pii->pii_flags) & IFF_STANDBY) 1421 phyint_changed(pi); 1422 1423 /* Has just the IFF_RUNNING flag changed state ? */ 1424 if ((old_flags ^ pii->pii_flags) != IFF_RUNNING) { 1425 struct phyint_instance *pii_other; 1426 /* 1427 * It wasn't just a link state change. Update 1428 * the other instance's copy of the flags. 1429 */ 1430 pii_other = phyint_inst_other(pii); 1431 if (pii_other != NULL) 1432 pii_other->pii_flags = pii->pii_flags; 1433 return (_B_TRUE); 1434 } 1435 1436 return (_B_FALSE); 1437 } 1438 1439 /* 1440 * Retrieve as many routing socket messages as possible, and try to 1441 * empty the routing sockets. Initiate full scan of targets or interfaces 1442 * as needed. 1443 * We listen on separate IPv4 an IPv6 sockets so that we can accurately 1444 * detect changes in certain flags (see "process_rtm_ifinfo()" above). 1445 */ 1446 static void 1447 process_rtsock(int rtsock_v4, int rtsock_v6) 1448 { 1449 int nbytes; 1450 int64_t msg[2048 / 8]; 1451 struct rt_msghdr *rtm; 1452 boolean_t need_if_scan = _B_FALSE; 1453 boolean_t need_rt_scan = _B_FALSE; 1454 boolean_t rtm_ifinfo_seen = _B_FALSE; 1455 int type; 1456 1457 /* Read as many messages as possible and try to empty the sockets */ 1458 for (type = AF_INET; ; type = AF_INET6) { 1459 for (;;) { 1460 nbytes = read((type == AF_INET) ? rtsock_v4 : 1461 rtsock_v6, msg, sizeof (msg)); 1462 if (nbytes <= 0) { 1463 /* No more messages */ 1464 break; 1465 } 1466 rtm = (struct rt_msghdr *)msg; 1467 if (rtm->rtm_version != RTM_VERSION) { 1468 logerr("process_rtsock: version %d " 1469 "not understood\n", rtm->rtm_version); 1470 break; 1471 } 1472 1473 if (debug & D_PHYINT) { 1474 logdebug("process_rtsock: message %d\n", 1475 rtm->rtm_type); 1476 } 1477 1478 switch (rtm->rtm_type) { 1479 case RTM_NEWADDR: 1480 case RTM_DELADDR: 1481 /* 1482 * Some logical interface has changed, 1483 * have to scan everything to determine 1484 * what actually changed. 1485 */ 1486 need_if_scan = _B_TRUE; 1487 break; 1488 1489 case RTM_IFINFO: 1490 rtm_ifinfo_seen = _B_TRUE; 1491 need_if_scan |= process_rtm_ifinfo( 1492 (if_msghdr_t *)rtm, type); 1493 break; 1494 1495 case RTM_ADD: 1496 case RTM_DELETE: 1497 case RTM_CHANGE: 1498 case RTM_OLDADD: 1499 case RTM_OLDDEL: 1500 need_rt_scan = _B_TRUE; 1501 break; 1502 1503 default: 1504 /* Not interesting */ 1505 break; 1506 } 1507 } 1508 if (type == AF_INET6) 1509 break; 1510 } 1511 1512 if (need_if_scan) { 1513 if (debug & D_LINKNOTE && rtm_ifinfo_seen) 1514 logdebug("process_rtsock: synchronizing with kernel\n"); 1515 initifs(); 1516 } else if (rtm_ifinfo_seen) { 1517 if (debug & D_LINKNOTE) 1518 logdebug("process_rtsock: " 1519 "link up/down notification(s) seen\n"); 1520 process_link_state_changes(); 1521 } 1522 1523 if (need_rt_scan) 1524 init_router_targets(); 1525 } 1526 1527 /* 1528 * Look if the phyint instance or one of its logints have been removed from 1529 * the kernel and take appropriate action. 1530 * Uses {pii,li}_in_use. 1531 */ 1532 static void 1533 check_if_removed(struct phyint_instance *pii) 1534 { 1535 struct logint *li; 1536 struct logint *next_li; 1537 1538 /* Detect phyints that have been removed from the kernel. */ 1539 if (!pii->pii_in_use) { 1540 logtrace("%s %s has been removed from kernel\n", 1541 AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 1542 phyint_inst_delete(pii); 1543 } else { 1544 /* Detect logints that have been removed. */ 1545 for (li = pii->pii_logint; li != NULL; li = next_li) { 1546 next_li = li->li_next; 1547 if (!li->li_in_use) { 1548 logint_delete(li); 1549 } 1550 } 1551 } 1552 } 1553 1554 /* 1555 * Parse the supplied mib2 information to extract the routing information 1556 * table. Process the routing table to get the list of known onlink routers 1557 * and update our database. These onlink routers will serve as probe 1558 * targets. 1559 */ 1560 static void 1561 update_router_list(mib_item_t *item) 1562 { 1563 for (; item != NULL; item = item->mi_next) { 1564 if (item->mi_opthdr.name == 0) 1565 continue; 1566 if (item->mi_opthdr.level == MIB2_IP && 1567 item->mi_opthdr.name == MIB2_IP_ROUTE) { 1568 ire_process_v4((mib2_ipRouteEntry_t *)item->mi_valp, 1569 item->mi_opthdr.len); 1570 } else if (item->mi_opthdr.level == MIB2_IP6 && 1571 item->mi_opthdr.name == MIB2_IP6_ROUTE) { 1572 ire_process_v6((mib2_ipv6RouteEntry_t *)item->mi_valp, 1573 item->mi_opthdr.len); 1574 } 1575 } 1576 } 1577 1578 1579 /* 1580 * Convert octet `octp' to a phyint name and store in `ifname' 1581 */ 1582 static void 1583 oct2ifname(const Octet_t *octp, char *ifname, size_t ifsize) 1584 { 1585 char *cp; 1586 size_t len = MIN(octp->o_length, ifsize - 1); 1587 1588 (void) strncpy(ifname, octp->o_bytes, len); 1589 ifname[len] = '\0'; 1590 1591 if ((cp = strchr(ifname, IF_SEPARATOR)) != NULL) 1592 *cp = '\0'; 1593 } 1594 1595 /* 1596 * Examine the IPv4 routing table `buf' for possible targets. For each 1597 * possible target, if it's on the same subnet an interface route, pass 1598 * it to router_add_common() for further consideration. 1599 */ 1600 static void 1601 ire_process_v4(mib2_ipRouteEntry_t *buf, size_t len) 1602 { 1603 char ifname[LIFNAMSIZ]; 1604 mib2_ipRouteEntry_t *rp, *rp1, *endp; 1605 struct in_addr nexthop_v4; 1606 struct in6_addr nexthop; 1607 1608 if (debug & D_TARGET) 1609 logdebug("ire_process_v4(len %d)\n", len); 1610 1611 if (len == 0) 1612 return; 1613 1614 assert((len % ipRouteEntrySize) == 0); 1615 endp = buf + (len / ipRouteEntrySize); 1616 1617 /* 1618 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1619 * cross-reference them with the interface routes to determine if 1620 * they're possible probe targets. 1621 */ 1622 for (rp = buf; rp < endp; rp++) { 1623 if (!(rp->ipRouteInfo.re_ire_type & IRE_OFFSUBNET)) 1624 continue; 1625 1626 /* Get the nexthop address. */ 1627 nexthop_v4.s_addr = rp->ipRouteNextHop; 1628 1629 /* 1630 * Rescan the routing table looking for interface routes that 1631 * are on the same subnet, and try to add them. If they're 1632 * not relevant (e.g., the interface route isn't part of an 1633 * IPMP group, router_add_common() will discard). 1634 */ 1635 for (rp1 = buf; rp1 < endp; rp1++) { 1636 if (!(rp1->ipRouteInfo.re_ire_type & IRE_INTERFACE) || 1637 rp1->ipRouteIfIndex.o_length == 0) 1638 continue; 1639 1640 if ((rp1->ipRouteDest & rp1->ipRouteMask) != 1641 (nexthop_v4.s_addr & rp1->ipRouteMask)) 1642 continue; 1643 1644 oct2ifname(&rp1->ipRouteIfIndex, ifname, LIFNAMSIZ); 1645 IN6_INADDR_TO_V4MAPPED(&nexthop_v4, &nexthop); 1646 router_add_common(AF_INET, ifname, nexthop); 1647 } 1648 } 1649 } 1650 1651 void 1652 router_add_common(int af, char *ifname, struct in6_addr nexthop) 1653 { 1654 struct phyint_instance *pii; 1655 struct phyint *pi; 1656 1657 if (debug & D_TARGET) 1658 logdebug("router_add_common(%s %s)\n", AF_STR(af), ifname); 1659 1660 /* 1661 * Retrieve the phyint instance; bail if it's not known to us yet. 1662 */ 1663 pii = phyint_inst_lookup(af, ifname); 1664 if (pii == NULL) 1665 return; 1666 1667 /* 1668 * Don't use our own addresses as targets. 1669 */ 1670 if (own_address(nexthop)) 1671 return; 1672 1673 /* 1674 * If the phyint is part a named group, then add the address to all 1675 * members of the group; note that this is suboptimal in the IPv4 case 1676 * as it has already been added to all matching interfaces in 1677 * ire_process_v4(). Otherwise, add the address only to the phyint 1678 * itself, since other phyints in the anongroup may not be on the same 1679 * subnet. 1680 */ 1681 pi = pii->pii_phyint; 1682 if (pi->pi_group == phyint_anongroup) { 1683 target_add(pii, nexthop, _B_TRUE); 1684 } else { 1685 pi = pi->pi_group->pg_phyint; 1686 for (; pi != NULL; pi = pi->pi_pgnext) 1687 target_add(PHYINT_INSTANCE(pi, af), nexthop, _B_TRUE); 1688 } 1689 } 1690 1691 /* 1692 * Examine the IPv6 routing table `buf' for possible link-local targets, and 1693 * pass any contenders to router_add_common() for further consideration. 1694 */ 1695 static void 1696 ire_process_v6(mib2_ipv6RouteEntry_t *buf, size_t len) 1697 { 1698 struct lifreq lifr; 1699 char ifname[LIFNAMSIZ]; 1700 char grname[LIFGRNAMSIZ]; 1701 mib2_ipv6RouteEntry_t *rp, *rp1, *endp; 1702 struct in6_addr nexthop_v6; 1703 1704 if (debug & D_TARGET) 1705 logdebug("ire_process_v6(len %d)\n", len); 1706 1707 if (len == 0) 1708 return; 1709 1710 assert((len % ipv6RouteEntrySize) == 0); 1711 endp = buf + (len / ipv6RouteEntrySize); 1712 1713 /* 1714 * Scan the routing table entries for any IRE_OFFSUBNET entries, and 1715 * cross-reference them with the interface routes to determine if 1716 * they're possible probe targets. 1717 */ 1718 for (rp = buf; rp < endp; rp++) { 1719 if (!(rp->ipv6RouteInfo.re_ire_type & IRE_OFFSUBNET) || 1720 !IN6_IS_ADDR_LINKLOCAL(&rp->ipv6RouteNextHop)) 1721 continue; 1722 1723 /* Get the nexthop address. */ 1724 nexthop_v6 = rp->ipv6RouteNextHop; 1725 1726 /* 1727 * The interface name should always exist for link-locals; 1728 * we use it to map this entry to an IPMP group name. 1729 */ 1730 if (rp->ipv6RouteIfIndex.o_length == 0) 1731 continue; 1732 1733 oct2ifname(&rp->ipv6RouteIfIndex, lifr.lifr_name, LIFNAMSIZ); 1734 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) == -1 || 1735 strlcpy(grname, lifr.lifr_groupname, LIFGRNAMSIZ) == 0) { 1736 continue; 1737 } 1738 1739 /* 1740 * Rescan the list of routes for interface routes, and add the 1741 * above target to any interfaces in the same IPMP group. 1742 */ 1743 for (rp1 = buf; rp1 < endp; rp1++) { 1744 if (!(rp1->ipv6RouteInfo.re_ire_type & IRE_INTERFACE) || 1745 rp1->ipv6RouteIfIndex.o_length == 0) { 1746 continue; 1747 } 1748 oct2ifname(&rp1->ipv6RouteIfIndex, ifname, LIFNAMSIZ); 1749 (void) strlcpy(lifr.lifr_name, ifname, LIFNAMSIZ); 1750 1751 if (ioctl(ifsock_v6, SIOCGLIFGROUPNAME, &lifr) != -1 && 1752 strcmp(lifr.lifr_groupname, grname) == 0) { 1753 router_add_common(AF_INET6, ifname, nexthop_v6); 1754 } 1755 } 1756 } 1757 } 1758 1759 /* 1760 * Build a list of target routers, by scanning the routing tables. 1761 * It is assumed that interface routes exist, to reach the routers. 1762 */ 1763 static void 1764 init_router_targets(void) 1765 { 1766 struct target *tg; 1767 struct target *next_tg; 1768 struct phyint_instance *pii; 1769 struct phyint *pi; 1770 1771 if (force_mcast) 1772 return; 1773 1774 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1775 pi = pii->pii_phyint; 1776 /* 1777 * Set tg_in_use to false only for router targets. 1778 */ 1779 if (!pii->pii_targets_are_routers) 1780 continue; 1781 1782 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) 1783 tg->tg_in_use = 0; 1784 } 1785 1786 if (mibwalk(update_router_list) == -1) 1787 exit(1); 1788 1789 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1790 pi = pii->pii_phyint; 1791 if (!pii->pii_targets_are_routers) 1792 continue; 1793 1794 for (tg = pii->pii_targets; tg != NULL; tg = next_tg) { 1795 next_tg = tg->tg_next; 1796 /* 1797 * If the group has failed, it's likely the route was 1798 * removed by an application affected by that failure. 1799 * In that case, we keep the target so that we can 1800 * reliably repair, at which point we'll refresh the 1801 * target list again. 1802 */ 1803 if (!tg->tg_in_use && !GROUP_FAILED(pi->pi_group)) 1804 target_delete(tg); 1805 } 1806 } 1807 } 1808 1809 /* 1810 * Attempt to assign host targets to any interfaces that do not currently 1811 * have probe targets by sharing targets with other interfaces in the group. 1812 */ 1813 static void 1814 init_host_targets(void) 1815 { 1816 struct phyint_instance *pii; 1817 struct phyint_group *pg; 1818 1819 for (pii = phyint_instances; pii != NULL; pii = pii->pii_next) { 1820 pg = pii->pii_phyint->pi_group; 1821 if (pg != phyint_anongroup && pii->pii_targets == NULL) 1822 dup_host_targets(pii); 1823 } 1824 } 1825 1826 /* 1827 * Duplicate host targets from other phyints of the group to 1828 * the phyint instance 'desired_pii'. 1829 */ 1830 static void 1831 dup_host_targets(struct phyint_instance *desired_pii) 1832 { 1833 int af; 1834 struct phyint *pi; 1835 struct phyint_instance *pii; 1836 struct target *tg; 1837 1838 assert(desired_pii->pii_phyint->pi_group != phyint_anongroup); 1839 1840 af = desired_pii->pii_af; 1841 1842 /* 1843 * For every phyint in the same group as desired_pii, check if 1844 * it has any host targets. If so add them to desired_pii. 1845 */ 1846 for (pi = desired_pii->pii_phyint; pi != NULL; pi = pi->pi_pgnext) { 1847 pii = PHYINT_INSTANCE(pi, af); 1848 /* 1849 * We know that we don't have targets on this phyint instance 1850 * since we have been called. But we still check for 1851 * pii_targets_are_routers because another phyint instance 1852 * could have router targets, since IFF_NOFAILOVER addresses 1853 * on different phyint instances may belong to different 1854 * subnets. 1855 */ 1856 if ((pii == NULL) || (pii == desired_pii) || 1857 pii->pii_targets_are_routers) 1858 continue; 1859 for (tg = pii->pii_targets; tg != NULL; tg = tg->tg_next) { 1860 target_create(desired_pii, tg->tg_address, _B_FALSE); 1861 } 1862 } 1863 } 1864 1865 static void 1866 usage(char *cmd) 1867 { 1868 (void) fprintf(stderr, "usage: %s\n", cmd); 1869 } 1870 1871 1872 #define MPATHD_DEFAULT_FILE "/etc/default/mpathd" 1873 1874 /* Get an option from the /etc/default/mpathd file */ 1875 static char * 1876 getdefault(char *name) 1877 { 1878 char namebuf[BUFSIZ]; 1879 char *value = NULL; 1880 1881 if (defopen(MPATHD_DEFAULT_FILE) == 0) { 1882 char *cp; 1883 int flags; 1884 1885 /* 1886 * ignore case 1887 */ 1888 flags = defcntl(DC_GETFLAGS, 0); 1889 TURNOFF(flags, DC_CASE); 1890 (void) defcntl(DC_SETFLAGS, flags); 1891 1892 /* Add "=" to the name */ 1893 (void) strncpy(namebuf, name, sizeof (namebuf) - 2); 1894 (void) strncat(namebuf, "=", 2); 1895 1896 if ((cp = defread(namebuf)) != NULL) 1897 value = strdup(cp); 1898 1899 /* close */ 1900 (void) defopen((char *)NULL); 1901 } 1902 return (value); 1903 } 1904 1905 1906 /* 1907 * Command line options below 1908 */ 1909 boolean_t failback_enabled = _B_TRUE; /* failback enabled/disabled */ 1910 boolean_t track_all_phyints = _B_FALSE; /* track all IP interfaces */ 1911 static boolean_t adopt = _B_FALSE; 1912 static boolean_t foreground = _B_FALSE; 1913 1914 int 1915 main(int argc, char *argv[]) 1916 { 1917 int i; 1918 int c; 1919 struct phyint *pi; 1920 struct phyint_instance *pii; 1921 char *value; 1922 1923 argv0 = argv; /* Saved for re-exec on SIGHUP */ 1924 srandom(gethostid()); /* Initialize the random number generator */ 1925 1926 /* 1927 * NOTE: The messages output by in.mpathd are not suitable for 1928 * translation, so we do not call textdomain(). 1929 */ 1930 (void) setlocale(LC_ALL, ""); 1931 1932 /* 1933 * Get the user specified value of 'failure detection time' 1934 * from /etc/default/mpathd 1935 */ 1936 value = getdefault("FAILURE_DETECTION_TIME"); 1937 if (value != NULL) { 1938 user_failure_detection_time = 1939 (int)strtol((char *)value, NULL, 0); 1940 1941 if (user_failure_detection_time <= 0) { 1942 user_failure_detection_time = FAILURE_DETECTION_TIME; 1943 logerr("Invalid failure detection time %s, assuming " 1944 "default of %d ms\n", value, 1945 user_failure_detection_time); 1946 1947 } else if (user_failure_detection_time < 1948 MIN_FAILURE_DETECTION_TIME) { 1949 user_failure_detection_time = 1950 MIN_FAILURE_DETECTION_TIME; 1951 logerr("Too small failure detection time of %s, " 1952 "assuming minimum of %d ms\n", value, 1953 user_failure_detection_time); 1954 } 1955 free(value); 1956 } else { 1957 /* User has not specified the parameter, Use default value */ 1958 user_failure_detection_time = FAILURE_DETECTION_TIME; 1959 } 1960 1961 /* 1962 * This gives the frequency at which probes will be sent. 1963 * When fdt ms elapses, we should be able to determine 1964 * whether 5 consecutive probes have failed or not. 1965 * 1 probe will be sent in every user_probe_interval ms, 1966 * randomly anytime in the (0.5 - 1.0) 2nd half of every 1967 * user_probe_interval. Thus when we send out probe 'n' we 1968 * can be sure that probe 'n - 2' is lost, if we have not 1969 * got the ack. (since the probe interval is > crtt). But 1970 * probe 'n - 1' may be a valid unacked probe, since the 1971 * time between 2 successive probes could be as small as 1972 * 0.5 * user_probe_interval. Hence the NUM_PROBE_FAILS + 2 1973 */ 1974 user_probe_interval = user_failure_detection_time / 1975 (NUM_PROBE_FAILS + 2); 1976 1977 /* 1978 * Get the user specified value of failback_enabled from 1979 * /etc/default/mpathd 1980 */ 1981 value = getdefault("FAILBACK"); 1982 if (value != NULL) { 1983 if (strcasecmp(value, "yes") == 0) 1984 failback_enabled = _B_TRUE; 1985 else if (strcasecmp(value, "no") == 0) 1986 failback_enabled = _B_FALSE; 1987 else 1988 logerr("Invalid value for FAILBACK %s\n", value); 1989 free(value); 1990 } else { 1991 failback_enabled = _B_TRUE; 1992 } 1993 1994 /* 1995 * Get the user specified value of track_all_phyints from 1996 * /etc/default/mpathd. The sense is reversed in 1997 * TRACK_INTERFACES_ONLY_WITH_GROUPS. 1998 */ 1999 value = getdefault("TRACK_INTERFACES_ONLY_WITH_GROUPS"); 2000 if (value != NULL) { 2001 if (strcasecmp(value, "yes") == 0) 2002 track_all_phyints = _B_FALSE; 2003 else if (strcasecmp(value, "no") == 0) 2004 track_all_phyints = _B_TRUE; 2005 else 2006 logerr("Invalid value for " 2007 "TRACK_INTERFACES_ONLY_WITH_GROUPS %s\n", value); 2008 free(value); 2009 } else { 2010 track_all_phyints = _B_FALSE; 2011 } 2012 2013 while ((c = getopt(argc, argv, "adD:ml")) != EOF) { 2014 switch (c) { 2015 case 'a': 2016 adopt = _B_TRUE; 2017 break; 2018 case 'm': 2019 force_mcast = _B_TRUE; 2020 break; 2021 case 'd': 2022 debug = D_ALL; 2023 foreground = _B_TRUE; 2024 break; 2025 case 'D': 2026 i = (int)strtol(optarg, NULL, 0); 2027 if (i == 0) { 2028 (void) fprintf(stderr, "Bad debug flags: %s\n", 2029 optarg); 2030 exit(1); 2031 } 2032 debug |= i; 2033 foreground = _B_TRUE; 2034 break; 2035 case 'l': 2036 /* 2037 * Turn off link state notification handling. 2038 * Undocumented command line flag, for debugging 2039 * purposes. 2040 */ 2041 handle_link_notifications = _B_FALSE; 2042 break; 2043 default: 2044 usage(argv[0]); 2045 exit(1); 2046 } 2047 } 2048 2049 /* 2050 * The sockets for the loopback command interface should be listening 2051 * before we fork and exit in daemonize(). This way, whoever started us 2052 * can use the loopback interface as soon as they get a zero exit 2053 * status. 2054 */ 2055 lsock_v4 = setup_listener(AF_INET); 2056 lsock_v6 = setup_listener(AF_INET6); 2057 2058 if (lsock_v4 < 0 && lsock_v6 < 0) { 2059 logerr("main: setup_listener failed for both IPv4 and IPv6\n"); 2060 exit(1); 2061 } 2062 2063 if (!foreground) { 2064 if (!daemonize()) { 2065 logerr("cannot daemonize\n"); 2066 exit(EXIT_FAILURE); 2067 } 2068 initlog(); 2069 } 2070 2071 /* 2072 * Initializations: 2073 * 1. Create ifsock* sockets. These are used for performing SIOC* 2074 * ioctls. We have 2 sockets 1 each for IPv4 and IPv6. 2075 * 2. Initialize a pipe for handling/recording signal events. 2076 * 3. Create the routing sockets, used for listening 2077 * to routing / interface changes. 2078 * 4. phyint_init() - Initialize physical interface state 2079 * (in mpd_tables.c). Must be done before creating interfaces, 2080 * which timer_init() does indirectly. 2081 * 5. Query kernel for route entry sizes (v4 and v6). 2082 * 6. timer_init() - Initialize timer related stuff 2083 * 7. initifs() - Initialize our database of all known interfaces 2084 * 8. init_router_targets() - Initialize our database of all known 2085 * router targets. 2086 */ 2087 ifsock_v4 = socket(AF_INET, SOCK_DGRAM, 0); 2088 if (ifsock_v4 < 0) { 2089 logperror("main: IPv4 socket open"); 2090 exit(1); 2091 } 2092 2093 ifsock_v6 = socket(AF_INET6, SOCK_DGRAM, 0); 2094 if (ifsock_v6 < 0) { 2095 logperror("main: IPv6 socket open"); 2096 exit(1); 2097 } 2098 2099 setup_eventpipe(); 2100 2101 rtsock_v4 = setup_rtsock(AF_INET); 2102 rtsock_v6 = setup_rtsock(AF_INET6); 2103 2104 if (phyint_init() == -1) { 2105 logerr("cannot initialize physical interface structures"); 2106 exit(1); 2107 } 2108 2109 if (mibwalk(mib_get_constants) == -1) 2110 exit(1); 2111 2112 timer_init(); 2113 2114 initifs(); 2115 2116 /* 2117 * If we're operating in "adopt" mode and no interfaces need to be 2118 * tracked, shut down (ifconfig(1M) will restart us on demand if 2119 * interfaces are subsequently put into multipathing groups). 2120 */ 2121 if (adopt && phyint_instances == NULL) 2122 exit(0); 2123 2124 /* 2125 * Main body. Keep listening for activity on any of the sockets 2126 * that we are monitoring and take appropriate action as necessary. 2127 * signals are also handled synchronously. 2128 */ 2129 for (;;) { 2130 if (poll(pollfds, pollfd_num, -1) < 0) { 2131 if (errno == EINTR) 2132 continue; 2133 logperror("main: poll"); 2134 exit(1); 2135 } 2136 for (i = 0; i < pollfd_num; i++) { 2137 if ((pollfds[i].fd == -1) || 2138 !(pollfds[i].revents & POLLIN)) 2139 continue; 2140 if (pollfds[i].fd == eventpipe_read) { 2141 in_signal(eventpipe_read); 2142 break; 2143 } 2144 if (pollfds[i].fd == rtsock_v4 || 2145 pollfds[i].fd == rtsock_v6) { 2146 process_rtsock(rtsock_v4, rtsock_v6); 2147 break; 2148 } 2149 2150 for (pii = phyint_instances; pii != NULL; 2151 pii = pii->pii_next) { 2152 if (pollfds[i].fd == pii->pii_probe_sock) { 2153 if (pii->pii_af == AF_INET) 2154 in_data(pii); 2155 else 2156 in6_data(pii); 2157 break; 2158 } 2159 } 2160 2161 for (pi = phyints; pi != NULL; pi = pi->pi_next) { 2162 if (pi->pi_notes != 0 && 2163 pollfds[i].fd == dlpi_fd(pi->pi_dh)) { 2164 (void) dlpi_recv(pi->pi_dh, NULL, NULL, 2165 NULL, NULL, 0, NULL); 2166 break; 2167 } 2168 } 2169 2170 if (pollfds[i].fd == lsock_v4) 2171 loopback_cmd(lsock_v4, AF_INET); 2172 else if (pollfds[i].fd == lsock_v6) 2173 loopback_cmd(lsock_v6, AF_INET6); 2174 } 2175 } 2176 /* NOTREACHED */ 2177 return (EXIT_SUCCESS); 2178 } 2179 2180 static int 2181 setup_listener(int af) 2182 { 2183 int sock; 2184 int on; 2185 int len; 2186 int ret; 2187 struct sockaddr_storage laddr; 2188 struct sockaddr_in *sin; 2189 struct sockaddr_in6 *sin6; 2190 struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; 2191 2192 assert(af == AF_INET || af == AF_INET6); 2193 2194 sock = socket(af, SOCK_STREAM, 0); 2195 if (sock < 0) { 2196 logperror("setup_listener: socket"); 2197 exit(1); 2198 } 2199 2200 on = 1; 2201 if (setsockopt(sock, SOL_SOCKET, SO_REUSEADDR, (char *)&on, 2202 sizeof (on)) < 0) { 2203 logperror("setup_listener: setsockopt (SO_REUSEADDR)"); 2204 exit(1); 2205 } 2206 2207 bzero(&laddr, sizeof (laddr)); 2208 laddr.ss_family = af; 2209 2210 if (af == AF_INET) { 2211 sin = (struct sockaddr_in *)&laddr; 2212 sin->sin_port = htons(MPATHD_PORT); 2213 sin->sin_addr.s_addr = htonl(INADDR_LOOPBACK); 2214 len = sizeof (struct sockaddr_in); 2215 } else { 2216 sin6 = (struct sockaddr_in6 *)&laddr; 2217 sin6->sin6_port = htons(MPATHD_PORT); 2218 sin6->sin6_addr = loopback_addr; 2219 len = sizeof (struct sockaddr_in6); 2220 } 2221 2222 ret = bind(sock, (struct sockaddr *)&laddr, len); 2223 if (ret < 0) { 2224 if (errno == EADDRINUSE) { 2225 /* 2226 * Another instance of mpathd may be already active. 2227 */ 2228 logerr("main: is another instance of in.mpathd " 2229 "already active?\n"); 2230 exit(1); 2231 } else { 2232 (void) close(sock); 2233 return (-1); 2234 } 2235 } 2236 if (listen(sock, 30) < 0) { 2237 logperror("main: listen"); 2238 exit(1); 2239 } 2240 if (poll_add(sock) == -1) { 2241 (void) close(sock); 2242 exit(1); 2243 } 2244 2245 return (sock); 2246 } 2247 2248 /* 2249 * Table of commands and their expected size; used by loopback_cmd(). 2250 */ 2251 static struct { 2252 const char *name; 2253 unsigned int size; 2254 } commands[] = { 2255 { "MI_PING", sizeof (uint32_t) }, 2256 { "MI_OFFLINE", sizeof (mi_offline_t) }, 2257 { "MI_UNDO_OFFLINE", sizeof (mi_undo_offline_t) }, 2258 { "MI_QUERY", sizeof (mi_query_t) } 2259 }; 2260 2261 /* 2262 * Commands received over the loopback interface come here (via libipmp). 2263 */ 2264 static void 2265 loopback_cmd(int sock, int family) 2266 { 2267 int newfd; 2268 ssize_t len; 2269 boolean_t is_priv = _B_FALSE; 2270 struct sockaddr_storage peer; 2271 struct sockaddr_in *peer_sin; 2272 struct sockaddr_in6 *peer_sin6; 2273 socklen_t peerlen; 2274 union mi_commands mpi; 2275 char abuf[INET6_ADDRSTRLEN]; 2276 uint_t cmd; 2277 int retval; 2278 2279 peerlen = sizeof (peer); 2280 newfd = accept(sock, (struct sockaddr *)&peer, &peerlen); 2281 if (newfd < 0) { 2282 logperror("loopback_cmd: accept"); 2283 return; 2284 } 2285 2286 switch (family) { 2287 case AF_INET: 2288 /* 2289 * Validate the address and port to make sure that 2290 * non privileged processes don't connect and start 2291 * talking to us. 2292 */ 2293 if (peerlen != sizeof (struct sockaddr_in)) { 2294 logerr("loopback_cmd: AF_INET peerlen %d\n", peerlen); 2295 (void) close(newfd); 2296 return; 2297 } 2298 peer_sin = (struct sockaddr_in *)&peer; 2299 is_priv = ntohs(peer_sin->sin_port) < IPPORT_RESERVED; 2300 (void) inet_ntop(AF_INET, &peer_sin->sin_addr.s_addr, 2301 abuf, sizeof (abuf)); 2302 2303 if (ntohl(peer_sin->sin_addr.s_addr) != INADDR_LOOPBACK) { 2304 logerr("Attempt to connect from addr %s port %d\n", 2305 abuf, ntohs(peer_sin->sin_port)); 2306 (void) close(newfd); 2307 return; 2308 } 2309 break; 2310 2311 case AF_INET6: 2312 if (peerlen != sizeof (struct sockaddr_in6)) { 2313 logerr("loopback_cmd: AF_INET6 peerlen %d\n", peerlen); 2314 (void) close(newfd); 2315 return; 2316 } 2317 /* 2318 * Validate the address and port to make sure that 2319 * non privileged processes don't connect and start 2320 * talking to us. 2321 */ 2322 peer_sin6 = (struct sockaddr_in6 *)&peer; 2323 is_priv = ntohs(peer_sin6->sin6_port) < IPPORT_RESERVED; 2324 (void) inet_ntop(AF_INET6, &peer_sin6->sin6_addr, abuf, 2325 sizeof (abuf)); 2326 if (!IN6_IS_ADDR_LOOPBACK(&peer_sin6->sin6_addr)) { 2327 logerr("Attempt to connect from addr %s port %d\n", 2328 abuf, ntohs(peer_sin6->sin6_port)); 2329 (void) close(newfd); 2330 return; 2331 } 2332 2333 default: 2334 logdebug("loopback_cmd: family %d\n", family); 2335 (void) close(newfd); 2336 return; 2337 } 2338 2339 /* 2340 * The sizeof the 'mpi' buffer corresponds to the maximum size of 2341 * all supported commands 2342 */ 2343 len = read(newfd, &mpi, sizeof (mpi)); 2344 2345 /* 2346 * In theory, we can receive any sized message for a stream socket, 2347 * but we don't expect that to happen for a small message over a 2348 * loopback connection. 2349 */ 2350 if (len < sizeof (uint32_t)) { 2351 logerr("loopback_cmd: bad command format or read returns " 2352 "partial data %d\n", len); 2353 (void) close(newfd); 2354 return; 2355 } 2356 2357 cmd = mpi.mi_command; 2358 if (cmd >= MI_NCMD) { 2359 logerr("loopback_cmd: unknown command id `%d'\n", cmd); 2360 (void) close(newfd); 2361 return; 2362 } 2363 2364 /* 2365 * Only MI_PING and MI_QUERY can come from unprivileged sources. 2366 */ 2367 if (!is_priv && (cmd != MI_QUERY && cmd != MI_PING)) { 2368 logerr("Unprivileged request from %s for privileged " 2369 "command %s\n", abuf, commands[cmd].name); 2370 (void) close(newfd); 2371 return; 2372 } 2373 2374 if (len < commands[cmd].size) { 2375 logerr("loopback_cmd: short %s command (expected %d, got %d)\n", 2376 commands[cmd].name, commands[cmd].size, len); 2377 (void) close(newfd); 2378 return; 2379 } 2380 2381 retval = process_cmd(newfd, &mpi); 2382 if (retval != IPMP_SUCCESS) { 2383 logerr("failed processing %s: %s\n", commands[cmd].name, 2384 ipmp_errmsg(retval)); 2385 } 2386 (void) close(newfd); 2387 } 2388 2389 /* 2390 * Process the commands received via libipmp. 2391 */ 2392 static unsigned int 2393 process_cmd(int newfd, union mi_commands *mpi) 2394 { 2395 struct phyint *pi; 2396 struct mi_offline *mio; 2397 struct mi_undo_offline *miu; 2398 unsigned int retval; 2399 2400 switch (mpi->mi_command) { 2401 case MI_PING: 2402 return (send_result(newfd, IPMP_SUCCESS, 0)); 2403 2404 case MI_OFFLINE: 2405 mio = &mpi->mi_ocmd; 2406 2407 pi = phyint_lookup(mio->mio_ifname); 2408 if (pi == NULL) 2409 return (send_result(newfd, IPMP_EUNKIF, 0)); 2410 2411 retval = phyint_offline(pi, mio->mio_min_redundancy); 2412 if (retval == IPMP_FAILURE) 2413 return (send_result(newfd, IPMP_FAILURE, errno)); 2414 2415 return (send_result(newfd, retval, 0)); 2416 2417 case MI_UNDO_OFFLINE: 2418 miu = &mpi->mi_ucmd; 2419 2420 pi = phyint_lookup(miu->miu_ifname); 2421 if (pi == NULL) 2422 return (send_result(newfd, IPMP_EUNKIF, 0)); 2423 2424 retval = phyint_undo_offline(pi); 2425 if (retval == IPMP_FAILURE) 2426 return (send_result(newfd, IPMP_FAILURE, errno)); 2427 2428 return (send_result(newfd, retval, 0)); 2429 2430 case MI_QUERY: 2431 return (process_query(newfd, &mpi->mi_qcmd)); 2432 2433 default: 2434 break; 2435 } 2436 2437 return (send_result(newfd, IPMP_EPROTO, 0)); 2438 } 2439 2440 /* 2441 * Process the query request pointed to by `miq' and send a reply on file 2442 * descriptor `fd'. Returns an IPMP error code. 2443 */ 2444 static unsigned int 2445 process_query(int fd, mi_query_t *miq) 2446 { 2447 ipmp_addrinfo_t *adinfop; 2448 ipmp_addrinfolist_t *adlp; 2449 ipmp_groupinfo_t *grinfop; 2450 ipmp_groupinfolist_t *grlp; 2451 ipmp_grouplist_t *grlistp; 2452 ipmp_ifinfo_t *ifinfop; 2453 ipmp_ifinfolist_t *iflp; 2454 ipmp_snap_t *snap; 2455 unsigned int retval; 2456 2457 switch (miq->miq_inforeq) { 2458 case IPMP_ADDRINFO: 2459 retval = getgraddrinfo(miq->miq_grname, &miq->miq_addr, 2460 &adinfop); 2461 if (retval != IPMP_SUCCESS) 2462 return (send_result(fd, retval, errno)); 2463 2464 retval = send_result(fd, IPMP_SUCCESS, 0); 2465 if (retval == IPMP_SUCCESS) 2466 retval = send_addrinfo(fd, adinfop); 2467 2468 ipmp_freeaddrinfo(adinfop); 2469 return (retval); 2470 2471 case IPMP_GROUPLIST: 2472 retval = getgrouplist(&grlistp); 2473 if (retval != IPMP_SUCCESS) 2474 return (send_result(fd, retval, errno)); 2475 2476 retval = send_result(fd, IPMP_SUCCESS, 0); 2477 if (retval == IPMP_SUCCESS) 2478 retval = send_grouplist(fd, grlistp); 2479 2480 ipmp_freegrouplist(grlistp); 2481 return (retval); 2482 2483 case IPMP_GROUPINFO: 2484 miq->miq_grname[LIFGRNAMSIZ - 1] = '\0'; 2485 retval = getgroupinfo(miq->miq_grname, &grinfop); 2486 if (retval != IPMP_SUCCESS) 2487 return (send_result(fd, retval, errno)); 2488 2489 retval = send_result(fd, IPMP_SUCCESS, 0); 2490 if (retval == IPMP_SUCCESS) 2491 retval = send_groupinfo(fd, grinfop); 2492 2493 ipmp_freegroupinfo(grinfop); 2494 return (retval); 2495 2496 case IPMP_IFINFO: 2497 miq->miq_ifname[LIFNAMSIZ - 1] = '\0'; 2498 retval = getifinfo(miq->miq_ifname, &ifinfop); 2499 if (retval != IPMP_SUCCESS) 2500 return (send_result(fd, retval, errno)); 2501 2502 retval = send_result(fd, IPMP_SUCCESS, 0); 2503 if (retval == IPMP_SUCCESS) 2504 retval = send_ifinfo(fd, ifinfop); 2505 2506 ipmp_freeifinfo(ifinfop); 2507 return (retval); 2508 2509 case IPMP_SNAP: 2510 /* 2511 * Before taking the snapshot, sync with the kernel. 2512 */ 2513 initifs(); 2514 2515 retval = getsnap(&snap); 2516 if (retval != IPMP_SUCCESS) 2517 return (send_result(fd, retval, errno)); 2518 2519 retval = send_result(fd, IPMP_SUCCESS, 0); 2520 if (retval != IPMP_SUCCESS) 2521 goto out; 2522 2523 retval = ipmp_writetlv(fd, IPMP_SNAP, sizeof (*snap), snap); 2524 if (retval != IPMP_SUCCESS) 2525 goto out; 2526 2527 retval = send_grouplist(fd, snap->sn_grlistp); 2528 if (retval != IPMP_SUCCESS) 2529 goto out; 2530 2531 iflp = snap->sn_ifinfolistp; 2532 for (; iflp != NULL; iflp = iflp->ifl_next) { 2533 retval = send_ifinfo(fd, iflp->ifl_ifinfop); 2534 if (retval != IPMP_SUCCESS) 2535 goto out; 2536 } 2537 2538 grlp = snap->sn_grinfolistp; 2539 for (; grlp != NULL; grlp = grlp->grl_next) { 2540 retval = send_groupinfo(fd, grlp->grl_grinfop); 2541 if (retval != IPMP_SUCCESS) 2542 goto out; 2543 } 2544 2545 adlp = snap->sn_adinfolistp; 2546 for (; adlp != NULL; adlp = adlp->adl_next) { 2547 retval = send_addrinfo(fd, adlp->adl_adinfop); 2548 if (retval != IPMP_SUCCESS) 2549 goto out; 2550 } 2551 out: 2552 ipmp_snap_free(snap); 2553 return (retval); 2554 2555 default: 2556 break; 2557 2558 } 2559 return (send_result(fd, IPMP_EPROTO, 0)); 2560 } 2561 2562 /* 2563 * Send the group information pointed to by `grinfop' on file descriptor `fd'. 2564 * Returns an IPMP error code. 2565 */ 2566 static unsigned int 2567 send_groupinfo(int fd, ipmp_groupinfo_t *grinfop) 2568 { 2569 ipmp_iflist_t *iflistp = grinfop->gr_iflistp; 2570 ipmp_addrlist_t *adlistp = grinfop->gr_adlistp; 2571 unsigned int retval; 2572 2573 retval = ipmp_writetlv(fd, IPMP_GROUPINFO, sizeof (*grinfop), grinfop); 2574 if (retval != IPMP_SUCCESS) 2575 return (retval); 2576 2577 retval = ipmp_writetlv(fd, IPMP_IFLIST, 2578 IPMP_IFLIST_SIZE(iflistp->il_nif), iflistp); 2579 if (retval != IPMP_SUCCESS) 2580 return (retval); 2581 2582 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2583 IPMP_ADDRLIST_SIZE(adlistp->al_naddr), adlistp)); 2584 } 2585 2586 /* 2587 * Send the interface information pointed to by `ifinfop' on file descriptor 2588 * `fd'. Returns an IPMP error code. 2589 */ 2590 static unsigned int 2591 send_ifinfo(int fd, ipmp_ifinfo_t *ifinfop) 2592 { 2593 ipmp_addrlist_t *adlist4p = ifinfop->if_targinfo4.it_targlistp; 2594 ipmp_addrlist_t *adlist6p = ifinfop->if_targinfo6.it_targlistp; 2595 unsigned int retval; 2596 2597 retval = ipmp_writetlv(fd, IPMP_IFINFO, sizeof (*ifinfop), ifinfop); 2598 if (retval != IPMP_SUCCESS) 2599 return (retval); 2600 2601 retval = ipmp_writetlv(fd, IPMP_ADDRLIST, 2602 IPMP_ADDRLIST_SIZE(adlist4p->al_naddr), adlist4p); 2603 if (retval != IPMP_SUCCESS) 2604 return (retval); 2605 2606 return (ipmp_writetlv(fd, IPMP_ADDRLIST, 2607 IPMP_ADDRLIST_SIZE(adlist6p->al_naddr), adlist6p)); 2608 } 2609 2610 /* 2611 * Send the address information pointed to by `adinfop' on file descriptor 2612 * `fd'. Returns an IPMP error code. 2613 */ 2614 static unsigned int 2615 send_addrinfo(int fd, ipmp_addrinfo_t *adinfop) 2616 { 2617 return (ipmp_writetlv(fd, IPMP_ADDRINFO, sizeof (*adinfop), adinfop)); 2618 } 2619 2620 /* 2621 * Send the group list pointed to by `grlistp' on file descriptor `fd'. 2622 * Returns an IPMP error code. 2623 */ 2624 static unsigned int 2625 send_grouplist(int fd, ipmp_grouplist_t *grlistp) 2626 { 2627 return (ipmp_writetlv(fd, IPMP_GROUPLIST, 2628 IPMP_GROUPLIST_SIZE(grlistp->gl_ngroup), grlistp)); 2629 } 2630 2631 /* 2632 * Initialize an mi_result_t structure using `error' and `syserror' and 2633 * send it on file descriptor `fd'. Returns an IPMP error code. 2634 */ 2635 static unsigned int 2636 send_result(int fd, unsigned int error, int syserror) 2637 { 2638 mi_result_t me; 2639 2640 me.me_mpathd_error = error; 2641 if (error == IPMP_FAILURE) 2642 me.me_sys_error = syserror; 2643 else 2644 me.me_sys_error = 0; 2645 2646 return (ipmp_write(fd, &me, sizeof (me))); 2647 } 2648 2649 /* 2650 * Daemonize the process. 2651 */ 2652 static boolean_t 2653 daemonize(void) 2654 { 2655 switch (fork()) { 2656 case -1: 2657 return (_B_FALSE); 2658 2659 case 0: 2660 /* 2661 * Lose our controlling terminal, and become both a session 2662 * leader and a process group leader. 2663 */ 2664 if (setsid() == -1) 2665 return (_B_FALSE); 2666 2667 /* 2668 * Under POSIX, a session leader can accidentally (through 2669 * open(2)) acquire a controlling terminal if it does not 2670 * have one. Just to be safe, fork() again so we are not a 2671 * session leader. 2672 */ 2673 switch (fork()) { 2674 case -1: 2675 return (_B_FALSE); 2676 2677 case 0: 2678 (void) chdir("/"); 2679 (void) umask(022); 2680 (void) fdwalk(closefunc, NULL); 2681 break; 2682 2683 default: 2684 _exit(EXIT_SUCCESS); 2685 } 2686 break; 2687 2688 default: 2689 _exit(EXIT_SUCCESS); 2690 } 2691 2692 return (_B_TRUE); 2693 } 2694 2695 /* 2696 * The parent has created some fds before forking on purpose, keep them open. 2697 */ 2698 static int 2699 closefunc(void *not_used, int fd) 2700 /* ARGSUSED */ 2701 { 2702 if (fd != lsock_v4 && fd != lsock_v6) 2703 (void) close(fd); 2704 return (0); 2705 } 2706 2707 /* LOGGER */ 2708 2709 #include <syslog.h> 2710 2711 /* 2712 * Logging routines. All routines log to syslog, unless the daemon is 2713 * running in the foreground, in which case the logging goes to stderr. 2714 * 2715 * The following routines are available: 2716 * 2717 * logdebug(): A printf-like function for outputting debug messages 2718 * (messages at LOG_DEBUG) that are only of use to developers. 2719 * 2720 * logtrace(): A printf-like function for outputting tracing messages 2721 * (messages at LOG_INFO) from the daemon. This is typically used 2722 * to log the receipt of interesting network-related conditions. 2723 * 2724 * logerr(): A printf-like function for outputting error messages 2725 * (messages at LOG_ERR) from the daemon. 2726 * 2727 * logperror*(): A set of functions used to output error messages 2728 * (messages at LOG_ERR); these automatically append strerror(errno) 2729 * and a newline to the message passed to them. 2730 * 2731 * NOTE: since the logging functions write to syslog, the messages passed 2732 * to them are not eligible for localization. Thus, gettext() must 2733 * *not* be used. 2734 */ 2735 2736 static int logging = 0; 2737 2738 static void 2739 initlog(void) 2740 { 2741 logging++; 2742 openlog("in.mpathd", LOG_PID, LOG_DAEMON); 2743 } 2744 2745 /* PRINTFLIKE2 */ 2746 void 2747 logmsg(int pri, const char *fmt, ...) 2748 { 2749 va_list ap; 2750 2751 va_start(ap, fmt); 2752 2753 if (logging) 2754 vsyslog(pri, fmt, ap); 2755 else 2756 (void) vfprintf(stderr, fmt, ap); 2757 va_end(ap); 2758 } 2759 2760 /* PRINTFLIKE1 */ 2761 void 2762 logperror(const char *str) 2763 { 2764 if (logging) 2765 syslog(LOG_ERR, "%s: %m\n", str); 2766 else 2767 (void) fprintf(stderr, "%s: %s\n", str, strerror(errno)); 2768 } 2769 2770 void 2771 logperror_pii(struct phyint_instance *pii, const char *str) 2772 { 2773 if (logging) { 2774 syslog(LOG_ERR, "%s (%s %s): %m\n", 2775 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name); 2776 } else { 2777 (void) fprintf(stderr, "%s (%s %s): %s\n", 2778 str, AF_STR(pii->pii_af), pii->pii_phyint->pi_name, 2779 strerror(errno)); 2780 } 2781 } 2782 2783 void 2784 logperror_li(struct logint *li, const char *str) 2785 { 2786 struct phyint_instance *pii = li->li_phyint_inst; 2787 2788 if (logging) { 2789 syslog(LOG_ERR, "%s (%s %s): %m\n", 2790 str, AF_STR(pii->pii_af), li->li_name); 2791 } else { 2792 (void) fprintf(stderr, "%s (%s %s): %s\n", 2793 str, AF_STR(pii->pii_af), li->li_name, 2794 strerror(errno)); 2795 } 2796 } 2797 2798 void 2799 close_probe_socket(struct phyint_instance *pii, boolean_t polled) 2800 { 2801 if (polled) 2802 (void) poll_remove(pii->pii_probe_sock); 2803 (void) close(pii->pii_probe_sock); 2804 pii->pii_probe_sock = -1; 2805 pii->pii_basetime_inited = 0; 2806 } 2807 2808 boolean_t 2809 addrlist_add(addrlist_t **addrsp, const char *name, uint64_t flags, 2810 struct sockaddr_storage *ssp) 2811 { 2812 addrlist_t *addrp; 2813 2814 if ((addrp = malloc(sizeof (addrlist_t))) == NULL) 2815 return (_B_FALSE); 2816 2817 (void) strlcpy(addrp->al_name, name, LIFNAMSIZ); 2818 addrp->al_flags = flags; 2819 addrp->al_addr = *ssp; 2820 addrp->al_next = *addrsp; 2821 *addrsp = addrp; 2822 return (_B_TRUE); 2823 } 2824 2825 void 2826 addrlist_free(addrlist_t **addrsp) 2827 { 2828 addrlist_t *addrp, *next_addrp; 2829 2830 for (addrp = *addrsp; addrp != NULL; addrp = next_addrp) { 2831 next_addrp = addrp->al_next; 2832 free(addrp); 2833 } 2834 *addrsp = NULL; 2835 } 2836 2837 /* 2838 * Send down a T_OPTMGMT_REQ to ip asking for all data in the various 2839 * tables defined by mib2.h. Pass the table information returned to the 2840 * supplied function. 2841 */ 2842 static int 2843 mibwalk(void (*proc)(mib_item_t *)) 2844 { 2845 mib_item_t *head_item = NULL; 2846 mib_item_t *last_item = NULL; 2847 mib_item_t *tmp; 2848 struct strbuf ctlbuf, databuf; 2849 int flags; 2850 int rval; 2851 uintptr_t buf[512 / sizeof (uintptr_t)]; 2852 struct T_optmgmt_req *tor = (struct T_optmgmt_req *)buf; 2853 struct T_optmgmt_ack *toa = (struct T_optmgmt_ack *)buf; 2854 struct T_error_ack *tea = (struct T_error_ack *)buf; 2855 struct opthdr *req, *optp; 2856 int status = -1; 2857 2858 if (mibfd == -1) { 2859 if ((mibfd = open("/dev/ip", O_RDWR)) < 0) { 2860 logperror("mibwalk(): ip open"); 2861 return (status); 2862 } 2863 } 2864 2865 tor->PRIM_type = T_SVR4_OPTMGMT_REQ; 2866 tor->OPT_offset = sizeof (struct T_optmgmt_req); 2867 tor->OPT_length = sizeof (struct opthdr); 2868 tor->MGMT_flags = T_CURRENT; 2869 2870 /* 2871 * Note: we use the special level value below so that IP will return 2872 * us information concerning IRE_MARK_TESTHIDDEN routes. 2873 */ 2874 req = (struct opthdr *)&tor[1]; 2875 req->level = EXPER_IP_AND_TESTHIDDEN; 2876 req->name = 0; 2877 req->len = 0; 2878 2879 ctlbuf.buf = (char *)&buf; 2880 ctlbuf.len = tor->OPT_length + tor->OPT_offset; 2881 2882 if (putmsg(mibfd, &ctlbuf, NULL, 0) == -1) { 2883 logperror("mibwalk(): putmsg(ctl)"); 2884 return (status); 2885 } 2886 2887 /* 2888 * The response consists of multiple T_OPTMGMT_ACK msgs, 1 msg for 2889 * each table defined in mib2.h. Each T_OPTMGMT_ACK msg contains 2890 * a control and data part. The control part contains a struct 2891 * T_optmgmt_ack followed by a struct opthdr. The 'opthdr' identifies 2892 * the level, name and length of the data in the data part. The 2893 * data part contains the actual table data. The last message 2894 * is an end-of-data (EOD), consisting of a T_OPTMGMT_ACK and a 2895 * single option with zero optlen. 2896 */ 2897 for (;;) { 2898 errno = flags = 0; 2899 ctlbuf.maxlen = sizeof (buf); 2900 rval = getmsg(mibfd, &ctlbuf, NULL, &flags); 2901 if (rval & MORECTL || rval < 0) { 2902 if (errno == EINTR) 2903 continue; 2904 logerr("mibwalk(): getmsg(ctl) ret: %d err: %d\n", 2905 rval, errno); 2906 goto error; 2907 } 2908 if (ctlbuf.len < sizeof (t_scalar_t)) { 2909 logerr("mibwalk(): ctlbuf.len %d\n", ctlbuf.len); 2910 goto error; 2911 } 2912 2913 switch (toa->PRIM_type) { 2914 case T_ERROR_ACK: 2915 if (ctlbuf.len < sizeof (struct T_error_ack)) { 2916 logerr("mibwalk(): T_ERROR_ACK ctlbuf " 2917 "too short: %d\n", ctlbuf.len); 2918 goto error; 2919 } 2920 logerr("mibwalk(): T_ERROR_ACK: TLI_err = 0x%lx: %s\n" 2921 " UNIX_err = 0x%lx\n", tea->TLI_error, 2922 t_strerror(tea->TLI_error), tea->UNIX_error); 2923 goto error; 2924 2925 case T_OPTMGMT_ACK: 2926 optp = (struct opthdr *)&toa[1]; 2927 if (ctlbuf.len < (sizeof (struct T_optmgmt_ack) + 2928 sizeof (struct opthdr))) { 2929 logerr("mibwalk(): T_OPTMGMT_ACK ctlbuf too " 2930 "short: %d\n", ctlbuf.len); 2931 goto error; 2932 } 2933 if (toa->MGMT_flags != T_SUCCESS) { 2934 logerr("mibwalk(): MGMT_flags != T_SUCCESS: " 2935 "0x%lx\n", toa->MGMT_flags); 2936 goto error; 2937 } 2938 break; 2939 2940 default: 2941 goto error; 2942 } 2943 /* The following assert also implies MGMT_flags == T_SUCCESS */ 2944 assert(toa->PRIM_type == T_OPTMGMT_ACK); 2945 2946 /* 2947 * We have reached the end of this T_OPTMGMT_ACK 2948 * message. If this is the last message i.e EOD, 2949 * break, else process the next T_OPTMGMT_ACK msg. 2950 */ 2951 if (rval == 0) { 2952 if (optp->len == 0 && optp->name == 0 && 2953 optp->level == 0) { 2954 /* This is the EOD message. */ 2955 break; 2956 } 2957 /* Not EOD but no data to retrieve */ 2958 continue; 2959 } 2960 2961 /* 2962 * We should only be here if MOREDATA was set. 2963 * Allocate an empty mib_item_t and link into the list 2964 * of MIB items. 2965 */ 2966 if ((tmp = malloc(sizeof (*tmp))) == NULL) { 2967 logperror("mibwalk(): malloc() failed."); 2968 goto error; 2969 } 2970 if (last_item != NULL) 2971 last_item->mi_next = tmp; 2972 else 2973 head_item = tmp; 2974 last_item = tmp; 2975 last_item->mi_next = NULL; 2976 last_item->mi_opthdr = *optp; 2977 last_item->mi_valp = malloc(optp->len); 2978 if (last_item->mi_valp == NULL) { 2979 logperror("mibwalk(): malloc() failed."); 2980 goto error; 2981 } 2982 2983 databuf.maxlen = last_item->mi_opthdr.len; 2984 databuf.buf = (char *)last_item->mi_valp; 2985 databuf.len = 0; 2986 2987 /* Retrieve the actual MIB data */ 2988 for (;;) { 2989 flags = 0; 2990 if ((rval = getmsg(mibfd, NULL, &databuf, 2991 &flags)) != 0) { 2992 if (rval < 0 && errno == EINTR) 2993 continue; 2994 /* 2995 * We shouldn't get MOREDATA here so treat that 2996 * as an error. 2997 */ 2998 logperror("mibwalk(): getmsg(data)"); 2999 goto error; 3000 } 3001 break; 3002 } 3003 } 3004 status = 0; 3005 /* Pass the accumulated MIB data to the supplied function pointer */ 3006 (*proc)(head_item); 3007 error: 3008 while (head_item != NULL) { 3009 tmp = head_item; 3010 head_item = tmp->mi_next; 3011 free(tmp->mi_valp); 3012 free(tmp); 3013 } 3014 return (status); 3015 } 3016 3017 /* 3018 * Parse the supplied mib2 information to get the size of routing table 3019 * entries. This is needed when running in a branded zone where the 3020 * Solaris application environment and the Solaris kernel may not be the 3021 * the same release version. 3022 */ 3023 static void 3024 mib_get_constants(mib_item_t *item) 3025 { 3026 mib2_ip_t *ipv4; 3027 mib2_ipv6IfStatsEntry_t *ipv6; 3028 3029 for (; item != NULL; item = item->mi_next) { 3030 if (item->mi_opthdr.name != 0) 3031 continue; 3032 if (item->mi_opthdr.level == MIB2_IP) { 3033 ipv4 = (mib2_ip_t *)item->mi_valp; 3034 ipRouteEntrySize = ipv4->ipRouteEntrySize; 3035 } else if (item->mi_opthdr.level == MIB2_IP6) { 3036 ipv6 = (mib2_ipv6IfStatsEntry_t *)item->mi_valp; 3037 ipv6RouteEntrySize = ipv6->ipv6RouteEntrySize; 3038 } 3039 } 3040 } 3041